Importing ntf_data into HBase with CombineFileInputFormat

The input consists of many small text files, so the job packs them into larger splits with a CombineFileInputFormat subclass instead of giving each file its own map task. Three classes follow: CombineSmallfileInputFormat (the input format), CombineSmallfileRecordReader (which reads one small file of a CombineFileSplit at a time through a LineRecordReader), and BulkImportData (the MapReduce driver, whose TableReducer writes the parsed rows into the ntf_data table).

package com.mr.test;

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, BytesWritable> {

	@Override
	public RecordReader<LongWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {

		CombineFileSplit combineFileSplit = (CombineFileSplit) split;
		CombineFileRecordReader<LongWritable, BytesWritable> recordReader = new CombineFileRecordReader<LongWritable, BytesWritable>(combineFileSplit, context, CombineSmallfileRecordReader.class);
		try {
			recordReader.initialize(combineFileSplit, context);
		} catch (InterruptedException e) {
			throw new RuntimeException("Failed to initialize CombineSmallfileRecordReader.", e);
		}
		return recordReader;
	}

}
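One practical tweak, not shown in the original listing, is to cap how much data a single combined split may hold; otherwise one map task can end up with every small file under the input path. A minimal sketch under that assumption (the 128 MB figure is illustrative and should be tuned to the cluster's HDFS block size):

package com.mr.test;

// Hypothetical variant, not part of the original post: same record reader as
// CombineSmallfileInputFormat, just with an upper bound on each combined split.
public class CappedCombineSmallfileInputFormat extends CombineSmallfileInputFormat {

	public CappedCombineSmallfileInputFormat() {
		// setMaxSplitSize() is inherited from CombineFileInputFormat; 128 MB is an assumed value.
		setMaxSplitSize(128L * 1024 * 1024);
	}
}

The driver could equally set the same limit through FileInputFormat.setMaxInputSplitSize(job, ...). The next listing is the record reader that the input format hands each small-file block to.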
package com.mr.test;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class CombineSmallfileRecordReader extends RecordReader<LongWritable, BytesWritable> {

	private CombineFileSplit combineFileSplit;
	private LineRecordReader lineRecordReader = new LineRecordReader();
	private Path[] paths;
	private int totalLength;
	private int currentIndex;
	private float currentProgress = 0;
	private LongWritable currentKey;
	private BytesWritable currentValue = new BytesWritable();

	public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
		super();
		this.combineFileSplit = combineFileSplit;
		this.currentIndex = index; // index of the small-file block within the CombineFileSplit that this reader handles
	}

	@Override
	public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
		this.combineFileSplit = (CombineFileSplit) split;
		// LineRecordReader reads a single file, so wrap the current small-file block of the CombineFileSplit in a FileSplit first.
		FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
		lineRecordReader.initialize(fileSplit, context);

		this.paths = combineFileSplit.getPaths();
		totalLength = paths.length;
		// Expose the current file's name so the mapper can use it when building row keys.
		context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
	}

	@Override
	public LongWritable getCurrentKey() throws IOException, InterruptedException {
		currentKey = lineRecordReader.getCurrentKey();
		return currentKey;
	}

	@Override
	public BytesWritable getCurrentValue() throws IOException, InterruptedException {
		// Copy the current line into a fresh BytesWritable; reusing one instance would leave
		// stale bytes from a longer previous line in the backing buffer (see main() below).
		byte[] content = lineRecordReader.getCurrentValue().toString().getBytes();
		currentValue = new BytesWritable();
		currentValue.set(content, 0, content.length);
		return currentValue;
	}
	// Standalone demo of the BytesWritable reuse pitfall: getBytes() returns the whole
	// backing buffer, so after a shorter value replaces a longer one, stale bytes remain
	// visible unless the capacity is reset first.
	public static void main(String[] args) {
		BytesWritable cv = new BytesWritable();
		String str1 = "1234567";
		String str2 = "123450";
		cv.set(str1.getBytes(), 0, str1.getBytes().length);
		System.out.println(new String(cv.getBytes())); // "1234567" (plus zero padding)

		cv.setCapacity(0); // drop the old backing buffer

		cv.set(str2.getBytes(), 0, str2.getBytes().length);
		System.out.println(new String(cv.getBytes())); // "123450"; without setCapacity(0) the stale '7' would still show up
	}
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		if (currentIndex >= 0 && currentIndex < totalLength) {
			return lineRecordReader.nextKeyValue();
		} else {
			return false;
		}
	}

	@Override
	public float getProgress() throws IOException {
		if (currentIndex >= 0 && currentIndex < totalLength) {
			currentProgress = (float) currentIndex / totalLength;
		}
		return currentProgress;
	}

	@Override
	public void close() throws IOException {
		lineRecordReader.close();
	}
}
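The main() demo above is the reason getCurrentValue() builds a fresh BytesWritable for every record. When only the logical contents are needed, an alternative is to restrict the copy to getLength() instead of consuming the whole backing buffer; a small illustrative snippet, not part of the original post:

package com.mr.test;

import org.apache.hadoop.io.BytesWritable;

// Illustrative snippet: read back only the logical bytes of a BytesWritable.
public class BytesWritableLengthDemo {

	public static void main(String[] args) {
		BytesWritable bw = new BytesWritable();
		byte[] first = "1234567".getBytes();
		byte[] second = "123450".getBytes();

		bw.set(first, 0, first.length);
		bw.set(second, 0, second.length); // no setCapacity(0) needed here

		// Limiting the copy to getLength() prints "123450" with no stale '7' and no padding.
		System.out.println(new String(bw.getBytes(), 0, bw.getLength()));
	}
}

The driver class below wires the input format and record reader into a job that writes into HBase.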
package com.mr.test;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class BulkImportData {

	public static class TokenizerMapper extends
			Mapper<Object, BytesWritable, Text, Text> {
		public Text _key = new Text();
		public Text _value = new Text();
		public void map(Object key, BytesWritable value, Context context)
				throws IOException, InterruptedException {
			// Copy only the logical length of the value; getBytes() alone would also copy buffer padding.
			_value.set(value.getBytes(), 0, value.getLength());
			String tmp = _value.toString().trim();
			tmp = tmp.replace("\\x00", "");
			_value.set(tmp);
			String filename = context.getConfiguration().get("map.input.file.name");
			String[] splits = _value.toString().split(",");
			if (splits.length == 3) {
				// Row key: <first field>_<file name without the mv_ prefix and .txt suffix>
				filename = filename.replace("mv_", "");
				filename = filename.replace(".txt", "");
				_key.set(splits[0] + "_" + filename);
				context.write(_key, _value);
			}
		}
	}
	public static class IntSumReducer extends
			TableReducer<Text, Text, ImmutableBytesWritable> {
		public void reduce(Text key, Iterable<Text> values,
				Context context) throws IOException, InterruptedException {
			// Text.getBytes() can include padding past getLength(), so derive the row key from the string value.
			byte[] rowKey = Bytes.toBytes(key.toString());
			for (Text t : values) {
				String[] strs = t.toString().split(",");
				if (strs.length != 3) continue;
				Put put = new Put(rowKey);
				put.add(Bytes.toBytes("content"), Bytes.toBytes("score"), Bytes.toBytes(strs[1].trim()));
				put.add(Bytes.toBytes("content"), Bytes.toBytes("date"), Bytes.toBytes(strs[2].trim()));
				context.write(new ImmutableBytesWritable(rowKey), put);
			}
		}
	}

	public static void main(String[] args) throws Exception {
		String tablename = "ntf_data";
		Configuration conf = HBaseConfiguration.create();
		HBaseAdmin admin = new HBaseAdmin(conf);
		if (admin.tableExists(tablename)) {
			admin.disableTable(tablename);
			admin.deleteTable(tablename);
		}
		HTableDescriptor htd = new HTableDescriptor(tablename);
		HColumnDescriptor hcd = new HColumnDescriptor("content");
		htd.addFamily(hcd);
		admin.createTable(htd);
		String[] otherArgs = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		if (otherArgs.length != 1) {
			System.err.println("Usage: BulkImportData <input path>");
			System.exit(2);
		}
		Job job = new Job(conf, "BulkImportData");
		job.setMapperClass(TokenizerMapper.class);
		job.setJarByClass(BulkImportData.class);
		job.setInputFormatClass(CombineSmallfileInputFormat.class);
		job.setNumReduceTasks(5);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		TableMapReduceUtil.initTableReducerJob(tablename, IntSumReducer.class,
				job);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
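Once the three classes are packaged into a jar, the job is launched like any other MapReduce program, for example hadoop jar bulkimport.jar com.mr.test.BulkImportData /path/to/small/files (jar name and input path here are placeholders). A quick sanity check after the job finishes is to scan a few rows of ntf_data; the sketch below uses the same generation of the HBase client API as the driver and is not part of the original post:

package com.mr.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

// Sanity-check sketch: print the first few rows written by BulkImportData.
public class ScanNtfData {

	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		HTable table = new HTable(conf, "ntf_data");
		Scan scan = new Scan();
		scan.addFamily(Bytes.toBytes("content"));
		ResultScanner scanner = table.getScanner(scan);
		int shown = 0;
		for (Result r : scanner) {
			System.out.println(Bytes.toString(r.getRow())
					+ " score=" + Bytes.toString(r.getValue(Bytes.toBytes("content"), Bytes.toBytes("score")))
					+ " date=" + Bytes.toString(r.getValue(Bytes.toBytes("content"), Bytes.toBytes("date"))));
			if (++shown >= 10) {
				break;
			}
		}
		scanner.close();
		table.close();
	}
}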
