Naive Bayes, MapReduce Version

1. Count word occurrences

1/ Compute the prior probability of each class

* Input format: class + document id + document words (tokenized, e.g. A, b, c)

* Output format: class + number of documents + total number of words in those documents

2/ Compute the conditional probability of each word

* Input format: class + document id + document words (tokenized, e.g. A, b, c)

* Output format: class + word + total count of that word

3/ Assume a binary classification problem - compute the probability values (written out as a formula after this outline)

* 1: class + number of documents + total number of words in those documents (output of step 1)

* 2: class + word + total count of that word (output of step 2)

* 3: per word: log(word count / total words in the class); per class: log(document count / sum(document counts))

* Input format: class + word + total count of that word

* Output format: "word", "class + log() probability value" for both classes, plus the class prior probabilities

4/ Assume a binary classification problem - test on new documents

* 1: class + number of documents + total number of words in those documents (output of step 1)

* 2: class + word + total count of that word (output of step 2)

* 3: per word: log(word count / total words in the class); per class: log(document count / sum(document counts))

* Input format: new document id + document words (tokenized, e.g. A, b, c)

* Output format: new document id + predicted class

This version only sketches the basic idea of Naive Bayes on MapReduce -- concrete optimizations and fixes are left for later.
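
For reference, the quantity these four jobs end up comparing is the standard multinomial Naive Bayes score in log space. This is just the outline above restated as a formula (it is not an extra output of any job); the predicted class is the one with the larger score:

\[
\operatorname{score}(c, d) \;=\; \log P(c) \;+\; \sum_{t_k \in d} \log P(t_k \mid c),
\qquad
P(c) = \frac{N_c}{\sum_{c'} N_{c'}},
\qquad
P(t_k \mid c) = \frac{\operatorname{count}(t_k, c)}{\sum_{t} \operatorname{count}(t, c)}
\]

Here \(N_c\) is the number of documents of class \(c\) (from step 1) and \(\operatorname{count}(t_k, c)\) is the count of word \(t_k\) in class \(c\) (from step 2).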

Python implementation

http://blog.csdn.net/q383700092/article/details/51773364

R version (calling a library function)

http://blog.csdn.net/q383700092/article/details/51774069

Simplified MapReduce implementation

http://blog.csdn.net/q383700092/article/details/51778765

Spark version

to be added later

Bayes1

package com.ml.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Job 1: compute the per-class document counts and word totals (for the class priors).
 * The results are collected into dict1.txt.
 * Input format:  class + document id + document words (tokenized)
 * Output format: class + number of documents + total number of words in those documents
 */
public class Bayes1 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();// class label
		private Text myval = new Text();// "1" (one document) + number of words in the document

		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] array = value.toString().split(",");
			String[] doc=array[2].split("-");
			mykey.set(array[0]);
			myval.set("1"+","+doc.length);
			context.write(mykey, myval);
		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// total number of documents in this class
			int sum = 0;
			// total number of words appearing in documents of this class
			int wordsum = 0;
			// iterate over the values and accumulate
			for (Text value : values) {
				String[] array = value.toString().split(",");
				sum += Integer.parseInt(array[0]);
				wordsum += Integer.parseInt(array[1]);
			}
			val.set(sum + "," + wordsum);
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key/value separator in the text output
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
		// job.setNumReduceTasks(1);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
		if (args.length != 2) {
			System.err.printf("Usage: %s [generic options] <input> <output>\n",
					tool.getClass().getSimpleName());
			ToolRunner.printGenericCommandUsage(System.err);
			return null;
		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/bayesTrain.txt",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/" };
		// run mapreduce
		int status = ToolRunner.run(new Bayes1(), args);
		// 5 exit
		System.exit(status);
	}
}
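
The exact on-disk layout of bayesTrain.txt is only implied by the split calls in MyMap. As a sanity check, here is a minimal plain-Java sketch (no Hadoop required) that applies the same parsing to one sample record; the record content is hypothetical, only the delimiters (',' between fields, '-' between words) are taken from the code above:

package com.ml.mapreduce;

/** Offline sketch of Bayes1's map logic applied to one assumed training record. */
public class Bayes1MapSketch {
	public static void main(String[] args) {
		String line = "1,doc001,A-b-c";// hypothetical record: class, document id, words joined by '-'
		String[] array = line.split(",");// [class, document id, word string]
		String[] doc = array[2].split("-");// the individual words of the document
		// Bayes1's mapper would emit: key = class, value = "1,<number of words>"
		System.out.println(array[0] + "," + "1," + doc.length);// prints: 1,1,3
	}
}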

Bayes2

package com.ml.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Job 2: compute the per-class word counts (for the word conditional probabilities).
 * The results are collected into dict2.txt.
 * Input format:  class + document id + document words (tokenized)
 * Output format: class + word + total count of that word in the class
 */
public class Bayes2 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();// class + word
		private Text myval = new Text();// occurrence count

		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] array = value.toString().split(",");
			String[] doc=array[2].split("-");
			for (String str : doc) {
				mykey.set(array[0]+ ","+ str);
				myval.set("1");
				context.write(mykey, myval);
			}
		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// total number of occurrences of this (class, word) pair
			int sum = 0;
			// iterate over the values and accumulate the "1" emitted for every occurrence
			for (Text value : values) {
				sum += Integer.parseInt(value.toString());
			}
			val.set(sum + "");
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key/value separator in the text output
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
		// job.setNumReduceTasks(1);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
		if (args.length != 2) {
			System.err.printf("Usage: %s [generic options] <input> <output>\n",
					tool.getClass().getSimpleName());
			ToolRunner.printGenericCommandUsage(System.err);
			return null;
		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/bayesTrain.txt",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/pword/" };
		// run mapreduce
		int status = ToolRunner.run(new Bayes2(), args);
		// 5 exit
		System.exit(status);
	}
}
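
One caveat before the next job: Bayes2 emits raw counts, and the next job divides them directly by the class word total, so a word that never occurs in a class simply has no entry for that class (the test job then falls back to a log probability of 0.0 for it). A common refinement, not implemented in this version, is add-one (Laplace) smoothing over the vocabulary V:

\[
P(t_k \mid c) \;=\; \frac{\operatorname{count}(t_k, c) + 1}{\sum_{t \in V} \operatorname{count}(t, c) + |V|}
\]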

Bayes3

package com.ml.mapreduce;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Job 3: assume a binary classification problem - compute the probability values.
 * Uses:
 *  1) class + number of documents + total number of words (output of job 1)
 *  2) class + word + total count of that word (output of job 2)
 *  3) per word: log(word count / total words in the class); per class: log(document count / sum(document counts))
 *
 * Input format:  class + word + total count of that word
 * Output format: "word" followed by (class, log probability) pairs for both classes
 */
public class Bayes3 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();// word
		private Text myval = new Text();// class + log probability

		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// NOTE: the class totals from job 1 are re-read from the DistributedCache on every
			// map() call; loading them once in setup() would avoid the repeated file reads.
			BufferedReader br = null;
			// files attached to this job via the DistributedCache
			Path[] distributePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
			String lines = null;
			String[] class1 = {"0", "0"};// document count and word total for class 1
			String[] class0 = {"0", "0"};// document count and word total for class 0
			for (Path p : distributePaths) {
				if (p.getParent().toString().endsWith("bayes")) {
					// read the cached file (output of job 1) into memory
					br = new BufferedReader(new FileReader(p.toString()));
					while (null != (lines = br.readLine())) {
						String[] pall = lines.split(",");
						if (pall[0].equals("1")) {
							class1[0] = pall[1];
							class1[1] = pall[2];
						} else {
							class0[0] = pall[1];
							class0[1] = pall[2];
						}
					}
					br.close();
				}
			}
			String[] array = value.toString().split(",");
			Double plog=0.0;
			if (array[0].equals("1")) {
				mykey.set(array[1]);// word
				plog=Math.log(Double.parseDouble(array[2])/Double.parseDouble(class1[1]));
				myval.set(array[0]+","+plog);// class + log probability
				context.write(mykey, myval);
			}else {
				mykey.set(array[1]);// word
				plog=Math.log(Double.parseDouble(array[2])/Double.parseDouble(class0[1]));
				myval.set(array[0]+","+plog);// class + log probability
				context.write(mykey, myval);
			}

		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {

			// start with a placeholder column so that downstream parsing in Bayes4 finds the
			// (class, log probability) pairs at index 2 and beyond
			String vals = "tab";
			for (Text value : values) {
				// append "class,logProbability" for every class in which this word occurs
				vals = vals + "," + value.toString();
			}
			val.set(vals);
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key/value separator in the text output
		DistributedCache.addCacheFile(new Path(args[2]).toUri(), conf);// attach the job 1 output to this job as a cache file
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
//		 job.setNumReduceTasks(0);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
//		if (args.length != 2) {
//			System.err.printf("Usage:%s [genneric options]<input><output>\n",
//					tool.getClass().getSimpleName());
//			ToolRunner.printGenericCommandUsage(System.err);
//			return null;
//		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/bayes/pword/part-r-00000",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/pall/",
				"hdfs://192.168.192.129:9000/ml/bayes/part-r-00000"};
		// run mapreduce
		int status = ToolRunner.run(new Bayes3(), args);
		// 5 exit
		System.exit(status);
	}
}
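
As noted in the mapper comment, Bayes3 re-reads the job 1 totals from the DistributedCache for every input record. A minimal sketch of the usual fix follows, assuming the same cache file and line layout as above: load the totals once per task in setup() and keep only the division in map(). The class name is made up for illustration and the code is an untested outline, not a drop-in replacement:

package com.ml.mapreduce;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/** Sketch: Bayes3's mapper with the DistributedCache read moved into setup(). */
public class Bayes3CachedMapper extends Mapper<LongWritable, Text, Text, Text> {
	private final Text mykey = new Text();
	private final Text myval = new Text();
	private double class1WordTotal = 1.0;// word total of class 1 (third field of the job 1 output)
	private double class0WordTotal = 1.0;// word total of class 0

	@Override
	protected void setup(Context context) throws IOException, InterruptedException {
		// load the class totals once per map task instead of once per input record
		Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
		for (Path p : cached) {
			if (p.getParent().toString().endsWith("bayes")) {
				BufferedReader br = new BufferedReader(new FileReader(p.toString()));
				String line;
				while ((line = br.readLine()) != null) {
					String[] parts = line.split(",");// class, document count, word total
					if (parts[0].equals("1")) {
						class1WordTotal = Double.parseDouble(parts[2]);
					} else {
						class0WordTotal = Double.parseDouble(parts[2]);
					}
				}
				br.close();
			}
		}
	}

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String[] array = value.toString().split(",");// class, word, word count
		double total = array[0].equals("1") ? class1WordTotal : class0WordTotal;
		double plog = Math.log(Double.parseDouble(array[2]) / total);
		mykey.set(array[1]);// word
		myval.set(array[0] + "," + plog);// class + log probability
		context.write(mykey, myval);
	}
}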

Bayes4

package com.ml.mapreduce;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Job 4: assume a binary classification problem - classify new documents (testing).
 * Uses:
 *  1) class + number of documents + total number of words (output of job 1)
 *  2) class + word + total count of that word (output of job 2)
 *  3) per word: log(word count / total words in the class); per class: log(document count / sum(document counts))
 *
 * Input format:  new document id + document words (tokenized)
 * Output format: new document id + predicted class
 */
public class Bayes4 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();// document id
		private Text myval = new Text();// (class, log probability) pairs for one word of the document
		Map<String, String> zidianString = new HashMap<String, String>();// key = word, value = probability string; assumes the dictionary fits in memory, otherwise read it in chunks
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// NOTE: as in Bayes3, the word dictionary is re-read from the DistributedCache on
			// every map() call; loading it once in setup() would avoid the repeated file reads.
			BufferedReader br = null;
			// files attached to this job via the DistributedCache
			Path[] distributePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
			String lines = null;
			for (Path p : distributePaths) {
				if (p.getParent().toString().endsWith("pall")) {
					// read the cached file (output of job 3) into memory
					br = new BufferedReader(new FileReader(p.toString()));
					while (null != (lines = br.readLine())) {
						// line layout: word, "tab", class, logP [, class, logP]
						String[] pall = lines.split(",");
						if (pall.length > 4) {
							// the word occurs in both classes; store as "1,logP1,0,logP0"
							if (pall[2].equals("1")) {
								zidianString.put(pall[0], pall[2] + "," + pall[3] + "," + pall[4] + "," + pall[5]);
							} else {
								zidianString.put(pall[0], pall[4] + "," + pall[5] + "," + pall[2] + "," + pall[3]);
							}
						} else {
							// the word occurs in only one class; fill the missing class with log probability 0.0
							if (pall[2].equals("1")) {
								zidianString.put(pall[0], pall[2] + "," + pall[3] + "," + "0" + "," + "0.0");
							} else {
								zidianString.put(pall[0], "1" + "," + "0.0" + "," + pall[2] + "," + pall[3]);
							}
						}
					}
					br.close();
				}
			}
			String[] array = value.toString().split(",");
			String[] doc=array[1].split("-");
			for (String str : doc) {
				if (zidianString.containsKey(str)) {
					String[] kk=zidianString.get(str).toString().split(",");// (class, log probability) pairs for this word
					mykey.set(array[0]);// document id
					myval.set(kk[0]+","+kk[1]+","+kk[2]+","+kk[3]);// class 1 + its log probability, class 0 + its log probability
					context.write(mykey, myval);
				}
			}
		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// accumulate the per-class scores for this document
			Double sum = 0.5;// prior for class 1 -- should really be computed in advance and read in; to be revised later (see the sketch after this class)
			Double sum2 = 0.5;// prior for class 0
			// iterate over the (class, log probability) values emitted for each word of the document
			for (Text value : values) {
				String[] array = value.toString().split(",");
				sum += Double.parseDouble(array[1]);// log-likelihood contribution for class 1
				sum2 += Double.parseDouble(array[3]);// log-likelihood contribution for class 0
			}
			if (sum > sum2) {
				val.set("class 1");
			} else {
				val.set("class 0");
			}
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key/value separator in the text output
		DistributedCache.addCacheFile(new Path(args[2]).toUri(), conf);// attach the job 3 output (pall) to this job as a cache file
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
//		 job.setNumReduceTasks(0);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
//		if (args.length != 2) {
//			System.err.printf("Usage:%s [genneric options]<input><output>\n",
//					tool.getClass().getSimpleName());
//			ToolRunner.printGenericCommandUsage(System.err);
//			return null;
//		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/test.txt",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/result/",
				"hdfs://192.168.192.129:9000/ml/bayes/pall/part-r-00000"};
		// run mapreduce
		int status = ToolRunner.run(new Bayes4(), args);
		// 5 exit
		System.exit(status);
	}
}
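
The reducer above hard-codes both priors at 0.5, and the comment marks this as something to compute in advance. A hedged sketch of one way to close that gap is shown below, assuming the job 1 output (class, document count, word total) is additionally attached to the Bayes4 job as a cache file; the class name and that extra cache file are assumptions, not part of the original pipeline:

package com.ml.mapreduce;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/** Sketch: Bayes4's reducer with log priors read from the job 1 output instead of hard-coded 0.5. */
public class Bayes4PriorReducer extends Reducer<Text, Text, Text, Text> {
	private final Text val = new Text();
	private double logPrior1 = Math.log(0.5);// fallback if the cache file is missing
	private double logPrior0 = Math.log(0.5);

	@Override
	protected void setup(Context context) throws IOException, InterruptedException {
		double docs1 = 0.0, docs0 = 0.0;
		Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
		if (cached == null) {
			return;
		}
		for (Path p : cached) {
			if (p.getParent().toString().endsWith("bayes")) {// the job 1 output directory
				BufferedReader br = new BufferedReader(new FileReader(p.toString()));
				String line;
				while ((line = br.readLine()) != null) {
					String[] parts = line.split(",");// class, document count, word total
					if (parts[0].equals("1")) {
						docs1 = Double.parseDouble(parts[1]);
					} else {
						docs0 = Double.parseDouble(parts[1]);
					}
				}
				br.close();
			}
		}
		if (docs1 > 0 && docs0 > 0) {
			// log priors, so the comparison is consistent with the per-word log-likelihoods
			logPrior1 = Math.log(docs1 / (docs1 + docs0));
			logPrior0 = Math.log(docs0 / (docs1 + docs0));
		}
	}

	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		double sum = logPrior1;// score for class 1
		double sum2 = logPrior0;// score for class 0
		for (Text value : values) {
			String[] array = value.toString().split(",");
			sum += Double.parseDouble(array[1]);// per-word log-likelihood for class 1
			sum2 += Double.parseDouble(array[3]);// per-word log-likelihood for class 0
		}
		val.set(sum > sum2 ? "class 1" : "class 0");
		context.write(key, val);
	}
}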