Hadoop MapReduce编程 API入门系列之邮箱统计分析（十九）

　　不多说，直接上代码。

　　假如这里有一份邮箱数据文件，我们期望统计邮箱出现次数并按照邮箱的类别，将这些邮箱分别输出到不同文件路径下。

代码

package zhouls.bigdata.myMapReduce.Email;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

//假如这里有一份邮箱数据文件，我们期望统计邮箱出现次数并按照邮箱的类别，将这些邮箱分别输出到不同文件路径下。
/*[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]

/out/163-r-00000
/out/126-r-00000
/out/21cn-r-00000
/out/gmail-r-00000
/out/qq-r-00000
/out/sina-r-00000
/out/sohu-r-00000
/out/yahoo-r-00000
/out/part-r-00000
*/

public class Email extends Configured implements Tool
{

public static class MailMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
context.write(value, one);//将value和one写入到context里。
}
}

public static class MailReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable result = new IntWritable();
private MultipleOutputs<Text, IntWritable> multipleOutputs;

@Override
protected void setup(Context context) throws IOException ,InterruptedException
{
multipleOutputs = new MultipleOutputs<Text, IntWritable>(context);
}
protected void reduce(Text Key, Iterable<IntWritable> Values,Context context) throws IOException, InterruptedException
{//[email protected]
int begin = Key.toString().indexOf("@");//indexOf() 方法可返回某个指定的字符串值在字符串中首次出现的位置。即begin是9
int end = Key.toString().indexOf(".");//indexOf() 方法可返回某个指定的字符串值在字符串中首次出现的位置。即end是12
if(begin>=end)
{
return;
}
//获取邮箱类别，比如 qq
String name = Key.toString().substring(begin+1, end);//substring()是去除指定字符串的方法，及substring(10，12)
int sum = 0;
for (IntWritable value : Values)
{
sum += value.get();
}
result.set(sum);
multipleOutputs.write(Key, result, name);//将Key和result和name一起写入multipleOutputs
}
@Override
protected void cleanup(Context context) throws IOException ,InterruptedException
{
multipleOutputs.close();
}
}

public int run(String[] arg0) throws Exception {

Configuration conf = new Configuration();// 读取配置文件
Path mypath = new Path(arg0[1]);//下标为1，即是输出路径
FileSystem hdfs = mypath.getFileSystem(conf);//FileSystem对象hdfs
if (hdfs.isDirectory(mypath))
{
hdfs.delete(mypath, true);
}
Job job = Job.getInstance();// 新建一个任务
job.setJarByClass(Email.class);// 主类

job.setMapperClass(MailMapper.class);// Mapper
job.setReducerClass(MailReducer.class);// Reducer

job.setOutputKeyClass(Text.class);// key输出类型
job.setOutputValueClass(IntWritable.class);// value输出类型

FileInputFormat.addInputPath(job, new Path(arg0[0]));// 文件输入路径
FileOutputFormat.setOutputPath(job, new Path(arg0[1]));// 文件输出路径
job.waitForCompletion(true);

return 0;
}

public static void main(String[] args) throws Exception
{
//集群路径
// String[] args0 = { "hdfs://HadoopMaster:9000/email/email.txt",
// "hdfs://HadoopMaster:9000/out/email"};

//本地路径
String[] args0 = { "./data/email/email.txt",
"out/email/"};

int ec = ToolRunner.run( new Configuration(), new Email(), args0);
System. exit(ec);
}
}

时间： 2024-12-20 11:11:35

Hadoop MapReduce编程 API入门系列之邮箱统计分析（十九）

Hadoop MapReduce编程 API入门系列之邮箱统计分析（十九）的相关文章

Hadoop MapReduce编程 API入门系列之压缩和计数器（三十）

Hadoop MapReduce编程 API入门系列之挖掘气象数据版本2（九）

Hadoop MapReduce编程 API入门系列之处理Excel通话记录（二十）

Hadoop MapReduce编程 API入门系列之FOF（Fund of Fund）（二十三）

Hadoop MapReduce编程 API入门系列之网页流量版本1（二十二）

Hadoop MapReduce编程 API入门系列之统计学生成绩版本2（十八）

Hadoop MapReduce编程 API入门系列之统计学生成绩版本1（十七）

Hadoop MapReduce编程 API入门系列之倒排索引（二十四）

Hadoop MapReduce编程 API入门系列之Crime数据分析（二十五）（未完）