package org.lukey.hadoop.muloutput;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Small MapReduce job that exercises {@link MultipleOutputs}.
 *
 * <p>Every input line is written, together with the count {@code 1}, to a
 * named output derived from the input file it came from
 * ({@code <parent directory name>/<file name>}). The mapper writes only through
 * {@link MultipleOutputs} and never through the task context, so the configured
 * reducer receives no records from it.
 *
 * <p>Usage: {@code TestMultipleOutput <in> <out>}, where every direct child of
 * {@code <in>} is registered as an input path.
 */
public class TestMultipleOutput {

    /** Not read anywhere in this class; kept so existing references still compile. */
    static String baseOutputPath = "/user/hadoop/test_out";

    /**
     * Mapper that routes each input line to a named output chosen from the
     * location of the file being read.
     */
    static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        /**
         * Per-task writer. This is deliberately an instance field: a static
         * field would be shared by every mapper instance living in the same
         * JVM, letting one task replace or close another task's writer.
         */
        private MultipleOutputs<Text, IntWritable> mos;

        /** Named-output path for this task: {@code <parent directory name>/<file name>}. */
        private String outputName;

        /**
         * Creates the writer and derives the output name from the input split.
         * The split does not change during a map task, so this is done once
         * here instead of once per record.
         *
         * @throws ClassCastException if the input split is not a {@link FileSplit}
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, IntWritable>(context);

            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            Path inputFile = fileSplit.getPath();
            String fileName = inputFile.getName();
            String dirName = inputFile.getParent().getName();
            outputName = dirName + "/" + fileName;
        }

        /**
         * Writes {@code (line, 1)} to the named output of the current input file.
         * Nothing is emitted through {@code context}.
         *
         * @param key     position key supplied by the input format (unused)
         * @param value   the input line, used as the output key
         * @param context task context (unused here; the writer was bound to it in setup)
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            mos.write(value, ONE, outputName);
        }

        /** Closes the writer so the named outputs are flushed. */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            if (mos != null) {
                mos.close();
            }
        }
    }

    /**
     * Reducer that sums the integer values of each key.
     *
     * <p>NOTE(review): the mapper in this file emits nothing through the task
     * context, so this reducer has no input unless the mapper is changed.
     */
    static class WordsOfClassCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reused output value holding the sum for the current key. */
        private final IntWritable result = new IntWritable();

        /**
         * Emits {@code (key, sum of values)}.
         *
         * @param key     the grouping key
         * @param values  the counts to add up
         * @param context task context the total is written to
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>}
     * @throws Exception if the input listing or the job submission fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage <in> <out>");
            System.exit(-1);
        }

        Job job = Job.getInstance(conf, "file count");
        job.setJarByClass(TestMultipleOutput.class);
        job.setMapperClass(WordsOfClassCountMapper.class);
        job.setReducerClass(WordsOfClassCountReducer.class);

        // Resolve the filesystem from the path itself so an input location on a
        // non-default filesystem is listed correctly.
        Path inputRoot = new Path(otherArgs[0]);
        FileSystem fileSystem = inputRoot.getFileSystem(conf);

        // Register every direct child of the input root, file or directory alike.
        for (FileStatus child : fileSystem.listStatus(inputRoot)) {
            FileInputFormat.addInputPath(job, child.getPath());
        }

        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
// Time: 2024-10-12 16:33:34