package cn.sjq.mr.sort;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.Random;
import java.util.TreeSet;
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.Test;
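/**
 * MapReduce job that ranks the top N users by the total amount paid for a single product (TopN).
 *
 * Input data:
 * order.data1 ... order.data10 -- ten order files, each holding 5000-10000 purchase records
 * in the following format:
 * orderid userid payment productid
 * c13a009e-a950-42f6-8eab-8e28d1406fe0,U10102000139,1008, 21010185
 * c5d2a564-619c-4e1a-a350-7ce981bbc22d,U10102000191,1357, 21010117
 * 1d762edd-0044-4773-a413-ab0440081b1e,U10102000150,2173, 21010124
 * e82c2848-6d6e-4fdf-8d7d-83059376846b,U10102000162,2310, 21010197
 * ......
 *
 * Final output (TopN):
 * Top 10 users ranked by the largest total amount paid for a single product
 * UserID ProductID TotalPayment
 * U10102000178 21010139 38084
 * U10102000171 21010130 37329
 * U10102000113 21010191 34700
 * U10102000102 21010124 34523
 * U10102000167 21010118 33939
 * U10102000184 21010156 33870
 * U10102000129 21010137 32839
 * U10102000115 21010124 32793
 * U10102000145 21010199 32630
 * U10102000123 21010149 32328
 *
 * Implementation (the logic is similar to that of the best-selling-products statistics program):
 *
 * Mapper side:
 * (1) Processes each line of its input split in map() and emits the result towards the Combiner.
 * (2) Output format: <k2: userid+productid> <v2: payment>
 *     <k2> Text    <v2> LongWritable
 *     U10102000139&21010185 <1008>
 *     U10102000150&21010185 <1357>
 *     U10102000139&21010185 <2310>
 *     ... ...
 *
 * Combiner side:
 * (1) A Combiner is a special kind of Reducer. Using one must not change the program's logic,
 *     and its input/output types must match the Mapper output types consumed by the Reducer.
 * (2) The Combiner is used here to
 *     1) sum the payments of each user for each product, and
 *     2) do that summation on the map side for every (user, product) pair, which greatly reduces
 *        the amount of data sent over the network and makes the Reducer more efficient.
 * (3) Output format after processing:
 *     <k2`> <v2`>
 *     U10102000139&21010185 <20202>
 *     U10102000150&21010176 <11422>
 *     U10102000139&21010154 <10132>
 *     ... ...
 *     Note: these are partial sums per (user, product) pair; the ranking itself is built in the Reducer.
 *
 * Reducer side:
 * (1) Merges the partial sums coming from the Combiners into global totals and keeps the
 *     global TopN ranking.
 * (2) Because the final result must end up in a single file, the number of reduce tasks must be 1.
 * (3) Final output after the Reducer:
 *     <k3> <v4>
 *     U10102000139&21010185 <39872>
 *     U10102000150&21010176 <21422>
 *     U10102000139&21010154 <10132>
 *     ... ...
 *
 * @author songjq
 */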
public class UserPaymentTopN {
/**
 * Test-data generator: writes ten order files, {@code order.data1} through {@code order.data10},
 * into the {@code userTopN} input directory used by the job.
 *
 * Each file holds between 5000 and 9999 lines of the form
 * {@code orderid,userid,payment,productid}, for example
 * {@code 53d419fa-0df4-4b6d-8214-dac158bf33e7,U10102000186,2008,21010153}:
 * <ul>
 * <li>orderid - a random UUID, so ids stay unique across all generated files</li>
 * <li>userid - {@code U10102000} followed by a number in 100..199 (at most 100 users)</li>
 * <li>payment - an integer in 1000..2999</li>
 * <li>productid - {@code 21010} followed by a number in 100..199 (at most 100 products)</li>
 * </ul>
 * That yields roughly 50,000-100,000 lines in total, enough to exercise the TopN job.
 *
 * @author songjq
 */
public static class OrderData {
    /** Directory that receives the generated files. */
    private static final String OUTPUT_DIR = "D:\\test\\tmp\\userTopN";
    /** Number of order files to generate. */
    private static final int FILE_COUNT = 10;

    /**
     * Generates the order files.
     *
     * @param args ignored
     * @throws Exception if a file cannot be created or written
     */
    public static void main(String[] args) throws Exception {
        // Create the target directory up front so a first run does not fail with
        // FileNotFoundException; if creation fails, opening the first file reports the error.
        new File(OUTPUT_DIR).mkdirs();
        // One generator for the whole run instead of a new Random per field.
        Random random = new Random();
        for (int i = 0; i < FILE_COUNT; i++) {
            String path = OUTPUT_DIR + "\\order.data" + (i + 1);
            int lines = 5000 + random.nextInt(5000);
            // try-with-resources flushes and closes the stream even when a write fails;
            // buffering avoids one system call per generated line.
            try (OutputStream out = new BufferedOutputStream(new FileOutputStream(path))) {
                for (int count = 0; count < lines; count++) {
                    // Order id: a UUID keeps ids unique across all generated files.
                    UUID uuid = UUID.randomUUID();
                    // Amount paid for the product.
                    int payment = 1000 + random.nextInt(2000);
                    // User number in 100..199.
                    int userid = 100 + random.nextInt(100);
                    // Product number in 100..199.
                    int productId = 100 + random.nextInt(100);
                    String orderdata = uuid + ",U10102000" + userid + "," + payment + ",21010" + productId + "\n";
                    out.write(orderdata.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
    }
}
/**
 * Mapper: turns every order line into a (user and product, payment) pair.
 *
 * Input value: one text line {@code orderid,userid,payment,productid}.
 * Output: {@code <k2: userid&productid>} as Text, {@code <v2: payment>} as LongWritable, e.g.
 * <pre>
 * U10102000139&amp;21010185  1008
 * U10102000150&amp;21010185  1357
 * U10102000139&amp;21010185  2310
 * </pre>
 * Fields are trimmed before use, so lines written with a space after the comma produce the
 * same keys as lines without it. Lines that do not have exactly four fields, have an empty
 * user or product id, or carry a non-numeric payment are skipped and counted in the
 * {@code UserPaymentTopN / MALFORMED_RECORDS} job counter instead of failing the task.
 *
 * @author songjq
 */
static class UserPaymentTopNMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private static final String COUNTER_GROUP = "UserPaymentTopN";
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";
    /** Number of comma-separated fields in a well-formed order line. */
    private static final int FIELD_COUNT = 4;

    // Reused across map() calls to avoid allocating two Writables per record.
    private final Text tkey = new Text();
    private final LongWritable tvalue = new LongWritable();

    /**
     * Parses one order line and emits {@code userid&productid -> payment}.
     *
     * @param k1 byte offset of the line in the input (unused)
     * @param v1 the order line
     * @param context task context used to emit the pair and to count skipped lines
     */
    @Override
    protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
        String[] order = v1.toString().split(",");
        if (order.length != FIELD_COUNT) {
            skipMalformed(context);
            return;
        }
        String userid = order[1].trim();
        String productId = order[3].trim();
        if (userid.isEmpty() || productId.isEmpty()) {
            skipMalformed(context);
            return;
        }
        long payment;
        try {
            payment = Long.parseLong(order[2].trim());
        } catch (NumberFormatException ignored) {
            // A bad amount in one line must not kill the whole task; the skip is recorded in the counter.
            skipMalformed(context);
            return;
        }
        tkey.set(userid + "&" + productId);
        tvalue.set(payment);
        context.write(tkey, tvalue);
    }

    /** Records one skipped input line in the job counters. */
    private void skipMalformed(Context context) {
        context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
    }
}
/**
 * Combiner: map-side pre-aggregation of payments.
 *
 * For every {@code userid&productid} key it adds up the payments it receives and emits a
 * single partial sum under the same key, e.g.
 * <pre>
 * U10102000139&amp;21010185  20202
 * U10102000150&amp;21010176  11422
 * U10102000139&amp;21010154  10132
 * </pre>
 * Input and output types are both (Text, LongWritable), i.e. the Mapper's output types, so
 * the Reducer can consume combined and uncombined records alike. No ranking happens here;
 * the values are partial sums only.
 *
 * @author songjq
 */
static class UserPaymentTopNCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    /**
     * Emits the sum of all payments received for one key.
     *
     * @param k3_ the {@code userid&productid} key
     * @param v3_ payments recorded for that key
     * @param ctx task context used to emit the partial sum
     */
    @Override
    protected void reduce(Text k3_, Iterable<LongWritable> v3_, Context ctx)
            throws IOException, InterruptedException {
        ctx.write(k3_, new LongWritable(sumOf(v3_)));
    }

    /** Adds up every payment in the group. */
    private static long sumOf(Iterable<LongWritable> payments) {
        long total = 0L;
        for (LongWritable payment : payments) {
            total = total + payment.get();
        }
        return total;
    }
}
/**
 * Reducer: builds the global TopN ranking of (user, product) pairs by total payment.
 *
 * Each reduce() call sums the values received for one {@code userid&productid} key and
 * offers the total to a sorted set that is trimmed to the N largest entries. The ranking is
 * written once, in cleanup(), preceded by two header rows, e.g.
 * <pre>
 * U10102000139  21010185  39872
 * U10102000150  21010176  21422
 * U10102000139  21010154  10132
 * </pre>
 * The ranking lives in this instance's memory, so the result is only global when the job
 * runs with a single reduce task.
 *
 * N is read from the {@code Global_N} configuration property and defaults to 10 when the
 * property is not set.
 *
 * @author songjq
 */
static class UserPaymentTopNReducer extends Reducer<Text, LongWritable, Text, Text> {
    private static final String TOP_N_KEY = "Global_N";
    private static final int DEFAULT_TOP_N = 10;

    /** One aggregated result: the {@code userid&productid} key and its total payment. */
    private static final class RankedPayment {
        final String key;
        final long total;

        RankedPayment(String key, long total) {
            this.key = key;
            this.total = total;
        }
    }

    /**
     * Largest total first. Ties are broken by key: a TreeSet treats elements that compare
     * as 0 as duplicates, so ordering by total alone would silently drop every pair whose
     * total equals one already in the set.
     */
    private static final Comparator<RankedPayment> BY_TOTAL_DESC_THEN_KEY = new Comparator<RankedPayment>() {
        @Override
        public int compare(RankedPayment o1, RankedPayment o2) {
            int byTotal = Long.compare(o2.total, o1.total);
            return byTotal != 0 ? byTotal : o1.key.compareTo(o2.key);
        }
    };

    /** Current best entries, never more than {@link #topN} after a reduce() call. */
    private TreeSet<RankedPayment> ranking;
    /** Number of entries to keep. */
    private int topN;

    /** Reads N from the configuration and creates the empty ranking. */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        topN = context.getConfiguration().getInt(TOP_N_KEY, DEFAULT_TOP_N);
        ranking = new TreeSet<RankedPayment>(BY_TOTAL_DESC_THEN_KEY);
    }

    /**
     * Sums the values of one key and offers the total to the ranking.
     *
     * @param k3 the {@code userid&productid} key
     * @param v3 payments or partial sums recorded for that key
     * @param ctx task context (nothing is written here; output happens in cleanup)
     */
    @Override
    protected void reduce(Text k3, Iterable<LongWritable> v3, Context ctx)
            throws IOException, InterruptedException {
        long paymentTotal = 0;
        for (LongWritable val : v3) {
            paymentTotal += val.get();
        }
        // Copy the key: the Text instance handed to reduce() must not be retained.
        ranking.add(new RankedPayment(k3.toString(), paymentTotal));
        // Keep only the N largest totals by evicting the smallest one.
        if (ranking.size() > topN) {
            ranking.pollLast();
        }
    }

    /** Runs after the last reduce() call; writes the header rows and the ranking. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        context.write(new Text("单个商品支付金额最大的前" + topN + "个用户排行"), new Text());
        context.write(new Text("用户ID\t\t\t商品ID\t\t"), new Text("支付总额"));
        for (RankedPayment entry : ranking) {
            // Limit 2 keeps an empty product id as an empty column instead of dropping it;
            // a key without any '&' is written unchanged rather than failing the task.
            String[] parts = entry.key.split("&", 2);
            String label = parts.length == 2 ? parts[0] + "\t" + parts[1] : parts[0];
            context.write(new Text(label), new Text(String.valueOf(entry.total)));
        }
    }
}
/**
 * Configures and runs the TopN job: mapper, combiner and reducer from this class, text
 * input from the {@code userTopN} directory, a single reduce task, and N = 10 passed
 * through the {@code Global_N} property. An existing output directory is deleted first.
 *
 * @throws IllegalStateException if the job finishes unsuccessfully
 * @throws Exception if the job cannot be configured or submitted
 */
@Test
public void UserPaymentTopNJob() throws Exception {
    Configuration conf = new Configuration();
    conf.set("Global_N", "10");
    Job job = Job.getInstance(conf);
    job.setJarByClass(UserPaymentTopN.class);

    // Mapper
    job.setMapperClass(UserPaymentTopNMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    // Combiner
    job.setCombinerClass(UserPaymentTopNCombiner.class);

    // Reducer
    job.setReducerClass(UserPaymentTopNReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Must be 1: the reducer keeps the ranking in memory, so only a single task sees all keys.
    job.setNumReduceTasks(1);

    // Input
    FileInputFormat.setInputPaths(job, "D:\\test\\tmp\\userTopN");
    job.setInputFormatClass(TextInputFormat.class);

    // Output: remove a previous result first, since the output directory must not already exist.
    Path outpath = new Path("D:\\test\\tmp\\UserPaymentTopNout");
    outpath.getFileSystem(conf).delete(outpath, true);
    FileOutputFormat.setOutputPath(job, outpath);

    // waitForCompletion reports failure through its return value; without this check the
    // test would pass even when the job failed.
    if (!job.waitForCompletion(true)) {
        throw new IllegalStateException("UserPaymentTopN job did not complete successfully");
    }
}
}
// Original article: http://blog.51cto.com/2951890/2156408