WordCount to HDFS

import java.io.IOException;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
 
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
 
public class WordCount {
    // The count of 1 emitted for every word
    private static final IntWritable ONE = new IntWritable(1);
    /**
     * @author 汤高
     * In Mapper<LongWritable, Text, Text, IntWritable>, LongWritable and IntWritable are the
     * Hadoop writable types for long and int values.
     *
     * LongWritable, Text are the input types: for word count the input key is the byte offset
     * of the line within the file and the value is the text of that line.
     * Text, IntWritable are the output types: the word and its count.
     * Note: the map input types (LongWritable key, Text value) differ from the map output types,
     * so the output types must be set explicitly on the job later so that they match.
     */
    // Map phase
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each call to map receives one line as its value; split it on whitespace
            String[] vs = value.toString().split("\\s+");
            for (String v : vs) {
                // Emit (word, 1)
                context.write(new Text(v), ONE);
            }
 
        }
    }
    // Reduce phase
    /**
     * @author 汤高
     * Text, IntWritable are the input types, taken from the map phase: the map output becomes
     * the reduce input.
     * Text, IntWritable are also the output types.
     */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable v : values) {
                count += v.get(); // add up the occurrences of this word
            }
 
            context.write(key, new IntWritable(count));
        }
 
    }
 
    public static void main(String[] args) {
 
        Configuration conf = new Configuration();
        try {
 
 
            // Create a Job and give it a name
            Job job = Job.getInstance(conf, "wordcount1");
            // Set the jar by class so Hadoop can distribute and run this program
            job.setJarByClass(WordCount.class);
            // Set the mapper class
            job.setMapperClass(WordCountMapper.class);
            // Set the map output types explicitly, because they differ from the map input types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // Set the reducer class
            job.setReducerClass(WordCountReducer.class);
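            // Optional, not part of the original listing: because summing counts is both
            // associative and commutative, this reducer could likely also serve as a combiner
            // to cut down shuffle traffic between map and reduce:
            // job.setCombinerClass(WordCountReducer.class);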
            // Set the input and output directories on HDFS; the timestamp suffix keeps the
            // output path unique, since the output directory must not already exist
            FileInputFormat.addInputPath(job, new Path("hdfs://192.168.52.140:9000/in_2/"));
            FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.52.140:9000/myhbase" + System.currentTimeMillis()));
            // Submit the job and wait for it to finish
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
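The listing imports GenericOptionsParser but never uses it, and the input and output paths are hard-coded to one cluster address. Below is a minimal sketch of an alternative driver that takes the paths from the command line instead; the class name ArgsWordCount and the argument handling are assumptions for illustration, not part of the original post, and it reuses the mapper and reducer defined above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class ArgsWordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // GenericOptionsParser consumes the standard Hadoop options (-D, -fs, -jt, ...) and
        // returns the remaining application arguments: here, the input and output paths.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: ArgsWordCount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "wordcount1");
        job.setJarByClass(ArgsWordCount.class);
        job.setMapperClass(WordCount.WordCountMapper.class);
        job.setReducerClass(WordCount.WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Also declare the final (reducer) output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Assuming the classes are packaged into a jar (for example a hypothetical wordcount.jar), such a driver would typically be submitted with the hadoop jar command, passing the HDFS input and output paths as the two arguments.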
 