设计思路:
使用mapreduce的默认排序,按照key值进行排序的,如果key为封装int的IntWritable类型,那么MapReduce按照数字大小对key排序,如果key为封装为String的Text类型,那么MapReduce按照字典顺序对字符串排序。
首先map阶段将输入的数字作为key, 并记录相同key出现的次数,在reduce阶段将输入的key作为输出的value,如果相同值存在多个,循环便利输出。
源数据:file1
2 32 654 32 15 756 65223
file2
5956 22 650 92
file3
26 54 6 2 15
源代码
package com.duking.hadoop; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.Mapper.Context; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; public class DataSort { public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> { private static IntWritable data = new IntWritable(); // 实现map函数 public void map(Object key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); // 将输入的每一行数据转换为String类型 data.set(Integer.parseInt(line)); // 将String 转换为Integer context.write(data, new IntWritable(1)); // 将 date->key // 统计key出现的次数自增为value } } // reduce将输入中的key复制到输出数据的key上,并直接输出 这是数据区重的思想 public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> { private static IntWritable linenum = new IntWritable(1); private IntWritable result = new IntWritable(); // 实现reduce函数 public void reduce(IntWritable key, Iterable<IntWritable> values, //Iterable转为List Context context) throws IOException, InterruptedException { for (IntWritable val : values) { context.write(linenum, key); linenum = new IntWritable(linenum.get() + 1); } } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); conf.set("mapred.job.tracker", "192.168.60.129:9000"); // 指定带运行参数的目录为输入输出目录 String[] otherArgs = new GenericOptionsParser(conf, args) .getRemainingArgs(); /* * 指定工程下的input2为文件输入目录 output2为文件输出目录 String[] ioArgs = new String[] { * "input2", "output2" }; * * String[] otherArgs = new GenericOptionsParser(conf, ioArgs) * .getRemainingArgs(); */ if (otherArgs.length != 2) { // 判断路径参数是否为2个 System.err.println("Usage: Data Deduplication <in> <out>"); System.exit(2); } // set maprduce job name Job job = new Job(conf, "Data sort"); job.setJarByClass(DataSort.class); // 设置Map、Combine和Reduce处理类 job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); // 设置输出类型 job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); // 设置输入和输出目录 FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
结果
1 2 2 2 3 6 4 15 5 15 6 22 7 26 8 32 9 32 10 54 11 92 12 650 13 654 14 756 15 5956 16 65223
时间: 2024-10-09 22:14:58