使用hadoop统计多个文本中每个单词数目

程序源码

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {
    public static class WordCountMap extends
            Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer token = new StringTokenizer(line);
            while (token.hasMoreTokens()) {
                word.set(token.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class WordCountReduce extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName("wordcount");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}

1 编译源码

javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./word_count_class/ WordCount.java
将源码编译成class文件，并输出到当前目录下的word_count_class目录中。注意：编译前需要先创建该目录（mkdir word_count_class）

2 将编译好的class文件打成jar包

进入存放class文件的word_count_class目录（生成的wordcount.jar也位于该目录下）

jar -cvf wordcount.jar *

3 上传输入文件

先在hadoop中为本次任务创建一个输入文件存放目录

hadoop fs -mkdir input_wordcount

将input目录下的所有文本文件上传到hadoop中的input_wordcount目录下

hadoop fs -put input/* input_wordcount/

4 上传jar并执行

hadoop jar word_count_class/wordcount.jar WordCount input_wordcount output_wordcount

5 查看计算结果

程序输出目录

hadoop fs -ls output_wordcount

程序输出内容

hadoop fs -cat output_wordcount/part-r-00000

时间： 2024-12-11 12:16:43

使用hadoop统计多个文本中每个单词数目的相关文章

如何使用linux命令统计文本中某个单词的出现频率

动态分配内存输出文本中的单词的四种做法

题目:有一段文本,将文本中的所有单词,存放到一个字符指针数组中(要求每个单词内存恰好). 第一种做法 char c[] = " asd afil kjgl rip kjgdr gds sdg gs "; char b[10] = {0}; char *a[10] = {NULL}; int i = 0, j = 0,k = 0; //i使字符不断后移,j用来标识指针a,k用来标识中间字符数组b; while (1) {//把字符串的'\0' 作为i

统计txt文档中的单词个数

public class Bean { private char name; private double pinlv; public Bean(char name,double pl) { this.name=name; this.pinlv=pl; } public char getName() { return name; } public void setName(char name) { this.name = name; } public double getPinlv() { re

《深入理解C#》代码片段-用Dictionary<TKey,TValue>统计文本中的单词

1 public class Words 2 { 3 public static Dictionary<string, int> CountWords(string text) 4 { 5 Dictionary<string, int> frequencies;//创建从单词到频率的新映射 6 frequencies = new Dictionary<string, int>(); 7 string[] words = Regex.Split(text, @"

统计字符串中的单词数目

统计字符串中单词的数目,更复杂的话从一个文本中读出字符串并生成单词数目统计结果. 第一个问题:这个问题的解决方案是,字符串之所以可以成为单词就是因为有空格符的出现,那么对于字符串中单词的数目来说,只需要统计其中空格符出现的次数就可以了~~~ 第二个问题,从文本中读出字符串并统计每一个单词的统计结果,那么就需要借助于字典map了,每一个单词使用了一个位置 ,如果是已经出现的单词,那么就给相应的单词数量加一,如果没有出现在字符串中,那么就添加该单词. 对于一串字符串来说,如果需要对于

C++统计一段文字中各单词出现的频率

#include <iostream> using namespace std; /* run this program using the console pauser or add your own getch, system("pause") or input loop */ class SqString{private: char * base; int length;public: SqString() { } SqString(char * s) { lengt

一个简单的程序，统计文本文档中的单词和汉字数，逆序排列（出现频率高的排在最前面）。python实现。

仅简单统计英文. from collections import Counter f = open('1') c = Counter() for line in f: g = (x for x in line.split()) c.update(Counter(g)) f.close() print sorted(dict(c).items(), key = lambda x : x[1], reverse = True) 运行结果. [('cd', 5), ('xy', 2), ('ab',

Count words and letters-计算用户输入一行文本中的单词数和每个字母出现次数

//Count words and letters #include<iostream> #include<cstring> #include<cstdlib> #include<cctype> using namespace std; int main() { int words_count = 1; int char_count[26] = {0}; char ch; cout<<"Inpu

统计文本中重复的内容

1.统计一个文本中重复的内容 package count; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Iterator; import java.util.Map;