MapReduce Functionality, Part 5: Deduplication (Distinct) and Counting (Count)


The MapReduce Functionality series:

MapReduce Functionality, Part 1: Converting Data Between HBase and HDFS

MapReduce Functionality, Part 2: Sorting

MapReduce Functionality, Part 3: Top N

MapReduce Functionality, Part 4: A Small Exercise (Reading Data from HBase, Aggregating It, and Writing the Top 3 to HDFS in Descending Order)

MapReduce Functionality, Part 5: Deduplication (Distinct) and Counting (Count)

MapReduce Functionality, Part 6: Maximum (Max), Sum, and Average (Avg)

MapReduce Functionality, Part 7: A Small Exercise (Chaining Multiple Jobs to Compute an Average)

MapReduce Functionality, Part 8: Partitioning (Partition)

MapReduce Functionality, Part 9: PV and UV

MapReduce Functionality, Part 10: Inverted Index

MapReduce Functionality, Part 11: Join


1. Deduplication (Distinct)

This is analogous to SELECT DISTINCT(x) FROM table in a database, and the deduplication job is even simpler than WordCount: the mapper emits each line as the key (with an empty NullWritable value), the shuffle groups identical keys together, and the reducer writes each key exactly once.

[hadoop@h71 q1]$ vi hello.txt
hello world
hello hadoop
hello hive
hello hadoop
hello world
hello world
[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/dedup_in
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/dedup_in

Java code:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Dedup {

    // Emit each input line as the key; the value carries no information.
    public static class RemoveDupMapper extends Mapper<Object, Text, Text, NullWritable> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    // Identical lines arrive grouped under one key, so each key is written exactly once.
    public static class RemoveDupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Sets the job jar explicitly; the job also runs without this line
        // because setJarByClass() below locates the jar automatically.
        conf.set("mapred.jar", "Dedup.jar");
        String[] ioArgs = new String[]{"dedup_in", "dedup_out"};
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: Data Deduplication <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "Data Deduplication");
        job.setJarByClass(Dedup.class);
        // Set the Mapper, Combiner, and Reducer classes
        job.setMapperClass(RemoveDupMapper.class);
        job.setCombinerClass(RemoveDupReducer.class);
        job.setReducerClass(RemoveDupReducer.class);
        // Set the output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Set the input and output directories
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
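
A side note on the driver: the new Job(conf, ...) constructor used above is deprecated on Hadoop 2.x (the RowCount examples later in this article already avoid it). Assuming a 2.x cluster, the line that creates the job can simply be replaced with the factory method:

Job job = Job.getInstance(conf, "Data Deduplication");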

Run the code on Linux:

[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac Dedup.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar Dedup*class
[hadoop@h71 q1]$ hadoop jar xx.jar Dedup

Check the result:

[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/dedup_out/part-r-00000
hello hadoop
hello hive
hello world


2. Using Counters

[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/mapinput
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/mapinput

Java code:

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Using counters in MapReduce.
 */
public class WordCountApp {
    private static final String INPUT_PATH = "hdfs://h71:9000/user/hadoop/mapinput";
    private static final String OUTPUT_PATH = "hdfs://h71:9000/user/hadoop/mapoutput";

    public static void main(String[] args) throws IOException, URISyntaxException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapred.jar", "wcapp.jar");
        // Delete the output directory if it already exists
        final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
        fileSystem.delete(new Path(OUTPUT_PATH), true);

        final Job job = new Job(conf, WordCountApp.class.getSimpleName());
        job.setJarByClass(WordCountApp.class);
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        job.waitForCompletion(true);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            final String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            final Counter counter = context.getCounter("Sensitive", "hello");
            if (value.toString().contains("hello")) {
                counter.increment(1L);   // increment the counter whenever a line contains "hello"
            }
            while (tokenizer.hasMoreTokens()) {
                String target = tokenizer.nextToken();
                if (target.equals("hello")) {   // only emit counts for the word "hello"
                    context.write(new Text(target), new LongWritable(1));
                }
            }
        }
    }

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> value,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long times = 0L;
            for (LongWritable v : value) {
                times += v.get();
            }
            context.write(key, new LongWritable(times));
        }
    }
}
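
One subtlety in the mapper above: the counter is incremented once per line that contains "hello", not once per occurrence. The two happen to coincide on this input, since every line holds exactly one "hello". If a per-occurrence count is wanted, the increment can be moved into the token loop; a sketch of just the changed loop:

while (tokenizer.hasMoreTokens()) {
    String target = tokenizer.nextToken();
    if (target.equals("hello")) {
        counter.increment(1L);                                // once per occurrence
        context.write(new Text(target), new LongWritable(1));
    }
}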

[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac WordCountApp.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar WordCountApp*class
[hadoop@h71 q1]$ hadoop jar xx.jar WordCountApp

In the console output you will see:
        Sensitive
                hello=6

[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/mapoutput/part-r-00000
hello   6
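
Besides reading the counter off the console, the driver can fetch it programmatically once the job has finished, because main calls job.waitForCompletion(true). A minimal sketch to append at the end of main; the group name "Sensitive" and counter name "hello" must match the strings passed to context.getCounter in the mapper:

long hits = job.getCounters().findCounter("Sensitive", "hello").getValue();
System.out.println("Sensitive:hello = " + hits);   // prints 6 for this input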


3. Row Count (Count)

[hadoop@h71 q1]$ vi ceshi.txt
2
8
8
3
2
3
5
3
0
2
7
[hadoop@h71 q1]$ hadoop fs -put ceshi.txt /input


This differs slightly from WordCount; the effect is like SELECT COUNT(*) FROM table. The code is also extremely simple: take WordCount and modify it slightly, so that every mapper emits the same key and all the 1s get summed under it.

Java code:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class RowCount {

    // Every input row maps to the same pair: ("count", 1)
    public static class RowCountMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private final static Text countKey = new Text("count");

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(countKey, one);
        }
    }

    // Sum all the 1s (and any partial sums from the combiner) under the single key
    public static class RowCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: RowCount <in> [<in>...] <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "row count");
        job.setJarByClass(RowCount.class);
        job.setMapperClass(RowCountMapper.class);
        job.setCombinerClass(RowCountReducer.class);
        job.setReducerClass(RowCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac RowCount.java 
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar RowCount*class
[hadoop@h71 q1]$ hadoop jar xx.jar RowCount /input/ceshi.txt /output


[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
count   11
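
Worth noting: for a plain row count, a custom job is not strictly necessary, because the framework already counts input records. Assuming Hadoop 2.x, the driver could read the built-in counter after job.waitForCompletion(true) via the TaskCounter enum; a minimal sketch:

import org.apache.hadoop.mapreduce.TaskCounter;

long rows = job.getCounters().findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
System.out.println("rows = " + rows);   // 11 for ceshi.txt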


Note: if you want the output to be just a number, without the "count" key, the program can be improved as follows:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class RowCount2 {

    // Count rows locally in each map task; emit the subtotal once in cleanup()
    public static class RowCount2Mapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
        public long count = 0;

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            count += 1;
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new LongWritable(count), NullWritable.get());
        }
    }

    // Sum the per-map subtotals (which arrive as keys); emit the total once in cleanup()
    public static class RowCount2Reducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
        public long count = 0;

        public void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Two map tasks can emit the same subtotal, in which case that key
            // arrives once with several values, so add it once per value.
            for (NullWritable ignored : values) {
                count += key.get();
            }
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new LongWritable(count), NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: RowCount2 <in> [<in>...] <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "RowCount2");
        job.setJarByClass(RowCount2.class);
        job.setMapperClass(RowCount2Mapper.class);
        job.setCombinerClass(RowCount2Reducer.class);
        job.setReducerClass(RowCount2Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
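
Compile and run it the same way as RowCount; remember that the output directory must not already exist, so delete the previous /output first. Assuming the same environment as the earlier runs:

[hadoop@h71 q1]$ hadoop fs -rm -r /output
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac RowCount2.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar RowCount2*class
[hadoop@h71 q1]$ hadoop jar xx.jar RowCount2 /input/ceshi.txt /output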

[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
11


Now the output is just the single number 11.
Note: context.write(...) must be called only in the cleanup method here. Both the Mapper and Reducer classes provide cleanup; it is invoked once per task, after all the map (or reduce) calls have finished. Try moving context.write(...) into the map and reduce methods: the result will be many rows of output instead of the single expected number.
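
Why cleanup fires exactly once per task is visible in the framework source. The sketch below is lightly simplified from org.apache.hadoop.mapreduce.Mapper in Hadoop 2.x: the task calls setup once, map once per input record, and cleanup once at the end, so a write placed in cleanup appears exactly once per task. Reducer.run has the same shape, which is why the reducer's cleanup also fires exactly once.

// Simplified from org.apache.hadoop.mapreduce.Mapper (Hadoop 2.x)
public void run(Context context) throws IOException, InterruptedException {
    setup(context);                          // once, before any records
    try {
        while (context.nextKeyValue()) {
            // once per input record
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);                    // once, after the last record
    }
}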