Project background:
Our company needs to process a dataset with these characteristics: each record is a fixed-width line 1,924 characters long containing 88 fields, and the width of each field (in order) is known. Creating a Hive table directly over the raw data does work, but it requires writing a very complicated regular expression, and the efficiency is questionable. The idea here is to customize an InputFormat that appends Hive's default field delimiter "\001" after every field. That makes the CREATE TABLE statement much simpler, and it may also improve query performance on the table (not yet benchmarked).
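To make the idea concrete, here is a minimal sketch of the transformation using made-up widths {4, 2, 6} on a toy 12-character line (the real layout has 88 fields; the full class appears below):

// Minimal illustration, with hypothetical widths {4, 2, 6}: the line
// "2024AB123456" becomes "2024\001AB\001123456\001", which Hive can
// then split on its default delimiter '\001'.
String line = "2024AB123456";
int[] widths = { 4, 2, 6 };
StringBuilder out = new StringBuilder();
int begin = 0;
for (int w : widths) {
    out.append(line.substring(begin, begin + w)).append('\001');
    begin += w;
}
System.out.println(out); // prints 2024^AAB^A123456^A ("^A" is how \001 usually renders)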
Project environment:
hadoop-2.3.0-cdh5.0.0
hive-0.12.0-cdh5.0.0
Eclipse
Steps:
1: Define the InputFormat class; the code is as follows:
package org.dongfangguoxin.test;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class MyDemoInputFormat extends TextInputFormat {

    // Widths of the 88 fixed-length fields, in record order.
    static final int[] FIELD_WIDTHS = new int[] {
            4, 8, 8, 15, 50, 6, 4, 10, 15, 6, 4, 2, 12, 2, 24, 14, 14, 10,
            200, 500, 20, 32, 15, 15, 15, 15, 16, 16, 6, 17, 2, 2, 10, 10,
            10, 14, 16, 16, 2, 12, 2, 15, 3, 15, 10, 30, 5, 10, 3, 8, 10,
            64, 5, 26, 2, 12, 2, 2, 4, 2, 30, 5, 4, 2, 30, 24, 60, 48, 16,
            15, 40, 10, 16, 16, 20, 20, 16, 10, 4, 6, 10, 10, 10, 24, 24,
            24, 24, 24 };

    // Cuts a fixed-width line into its fields and appends Hive's default
    // field delimiter '\001' after each one.
    public static String toUseString(int[] widths, String line) {
        StringBuilder sb = new StringBuilder(line.length() + widths.length);
        int begin = 0;
        for (int width : widths) {
            int end = begin + width;
            sb.append(line.substring(begin, end)).append('\001');
            begin = end;
        }
        return sb.toString();
    }

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(
            InputSplit genericSplit, JobConf job, Reporter reporter)
            throws IOException {
        reporter.setStatus(genericSplit.toString());
        return new MyDemoRecordReader(
                new LineRecordReader(job, (FileSplit) genericSplit));
    }

    public static class MyDemoRecordReader implements
            RecordReader<LongWritable, Text> {

        private final LineRecordReader reader;
        private final Text text;

        public MyDemoRecordReader(LineRecordReader reader) {
            this.reader = reader;
            this.text = reader.createValue();
        }

        @Override
        public void close() throws IOException {
            reader.close();
        }

        @Override
        public LongWritable createKey() {
            return reader.createKey();
        }

        @Override
        public Text createValue() {
            return new Text();
        }

        @Override
        public long getPos() throws IOException {
            return reader.getPos();
        }

        @Override
        public float getProgress() throws IOException {
            return reader.getProgress();
        }

        @Override
        public boolean next(LongWritable key, Text value) throws IOException {
            if (reader.next(key, text)) {
                // Re-emit the raw fixed-width line with '\001' after every field.
                value.set(toUseString(FIELD_WIDTHS, text.toString()));
                return true;
            }
            return false;
        }
    }
}
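Before packaging, it may be worth a quick local sanity check of toUseString. Below is a minimal sketch (this check class and its dummy all-'x' record are assumptions for illustration, not part of the original project; it assumes FIELD_WIDTHS is left package-visible as above):

package org.dongfangguoxin.test;

// Hypothetical local check: feeds a dummy record of exactly the right total
// width through toUseString and counts the inserted delimiters.
public class ToUseStringCheck {
    public static void main(String[] args) {
        int[] widths = MyDemoInputFormat.FIELD_WIDTHS;
        int total = 0;
        for (int w : widths) {
            total += w;
        }
        // Build a dummy record exactly as wide as the sum of the field widths.
        StringBuilder line = new StringBuilder(total);
        for (int i = 0; i < total; i++) {
            line.append('x');
        }
        String out = MyDemoInputFormat.toUseString(widths, line.toString());
        // Expect exactly one '\001' per field.
        int delimiters = 0;
        for (int i = 0; i < out.length(); i++) {
            if (out.charAt(i) == '\001') {
                delimiters++;
            }
        }
        System.out.println("fields: " + widths.length + ", delimiters: " + delimiters);
    }
}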
Once the class is ready, build it into a jar and place it under $HIVE_HOME/lib.
For comparison, the original CREATE TABLE statement (using RegexSerDe) was:
create table tab_lixiyuan1(
OBJECT_ID string,
DISTINCT_ID string,
SORTING_ID string,
FILE_ID string,
OUT_FILE_ID string,
TICKET_ID string,
USING_TYPE string,
FILE_DIS_ID string,
AB_FILE_ID string,
AB_TICKET_ID string,
AREA_ID string,
SORTING_MOTION string,
ABNORMAL_TYPE string,
TICKET_TYPE string,
FEE_NUMBER string,
BEGIN_TIME string,
END_TIME string,
BILL_TIMES string,
BILL_LIST string,
BYTES_LIST string,
BILL_BYTES string,
SERV_CODE string,
ASCEND_BYTES1 string,
DOWN_BYTES1 string,
ASCEND_BYTES2 string,
DOWN_BYTES2 string,
IMSI string,
IMEI string,
LAC string,
CELL_ID string,
DEALER string,
ROAM_DEALER string,
CALL_AREA_ID string,
VISIT_AREA_ID string,
USER_PROV string,
USER_ROAMPROV string,
SGSN_SERV_IP string,
GGSN_PGW_IP string,
PDP_FLAG string,
PDP_FEE_CLASS string,
PDP_TYPE string,
PDP_CAMEL string,
IMS_PDP_ID string,
SERV_FEE string,
INFO_FEE string,
FEE_CLASS string,
RECORD_ID string,
MB_ABILITY string,
RECORD_ROUTE string,
RECORD_LAC string,
RECORD_CELL_ID string,
APN_ID string,
APN_TYPE string,
APN_DEALER string,
SGSNING_ID string,
SGSN_PLMN string,
ROAM_TYPE string,
USER_TYPE string,
SERV_TYPE string,
SHUTDOWN_REASON string,
ADD_BYTE string,
USER_BILL_FLAG string,
RAT_FLAG string,
BILL_FLAG_MODE string,
GSN_ID string,
CAMEL_INFO string,
USER_ACCOUNT string,
AP_INFO string,
INCONTROL_IP string,
NAS_IP string,
AP_SSID string,
BILL_DES string,
PDNCC_ID string,
USER_CSG_INFO string,
IPV4_ADD string,
IMSSIGNALING string,
PGWADD_USERD string,
PDNTYPE_IP string,
SERV_PRI string,
WIRELESS_PRI string,
UPBANDWIDTH string,
DOWNBANDWIDTH string,
BANDWIDTH string,
RESERVED1 string,
RESERVED2 string,
RESERVED3 string,
RESERVED4 string,
RESERVED5 string
) partitioned by (month_id string, prov_id string, day_id string, week_id string, hour_id string)
row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
with serdeproperties (
'input.regex' = '(....)(........)(........)(...............)(..................................................)(......)(....)(..........)(...............)(......)(....)(..)(............)(..)(........................)(..............)(..............)(..........)(........................................................................................................................................................................................................)(....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................)(....................)(................................)(...............)(...............)(...............)(...............)(................)(................)(......)(.................)(..)(..)(..........)(..........)(..........)(..............)(................)(................)(..)(............)(..)(...............)(...)(...............)(..........)(..............................)(.....)(..........)(...)(........)(..........)(................................................................)(.....)(..........................)(..)(............)(..)(..)(....)(..)(..............................)(.....)(....)(..)(..............................)(........................)(............................................................)(................................................)(................)(...............)(........................................)(..........)(................)(................)(....................)(....................)(................)(..........)(....)(......)(..........)(..........)(..........)(........................)(........................)(........................)(........................)(........................)',
'output.format.string' = '%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21$s %22$s %23$s %24$s %25$s %26$s %27$s %28$s %29$s %30$s %31$s %32$s %33$s %34$s %35$s %36$s %37$s %38$s %39$s %40$s %41$s %42$s %43$s %44$s %45$s %46$s %47$s %48$s %49$s %50$s %51$s %52$s %53$s %54$s %55$s %56$s %57$s %58$s %59$s %60$s %61$s %62$s %63$s %64$s %65$s %66$s %67$s %68$s %69$s %70$s %71$s %72$s %73$s %74$s %75$s %76$s %77$s %78$s %79$s %80$s %81$s %82$s %83$s %84$s %85$s %86$s %87$s %88$s'
)
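Each capture group in that regex is just a run of `.` characters as long as the corresponding field, which is why it is so painful to write and maintain by hand. A sketch of how such a pattern could be generated from the same width array (a hypothetical helper for illustration; the original regex was written out manually):

package org.dongfangguoxin.test;

// Hypothetical helper, not part of the original project: generates the
// RegexSerDe pattern ("(....)(........)...") from the field-width array.
public class RegexBuilder {
    public static String buildRegex(int[] widths) {
        StringBuilder regex = new StringBuilder();
        for (int w : widths) {
            regex.append('(');
            for (int i = 0; i < w; i++) {
                regex.append('.');
            }
            regex.append(')');
        }
        return regex.toString();
    }

    public static void main(String[] args) {
        System.out.println(buildRegex(MyDemoInputFormat.FIELD_WIDTHS));
    }
}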
With the custom InputFormat, the CREATE TABLE statement becomes:
CREATE TABLE tab_lixiyuan
(
a string ,b string ,c string ,d string,dds string,e string,f string ,g string,
aa string ,ba string ,ca string ,da string,daa string,ea string,fa string ,ga string,
ab string ,bb string ,cb string ,db string,dba string,eb string,fb string ,gb string,
ac string ,bc string ,cc string ,dc string,dca string,ec string,fc string ,gc string,
ad string ,bd string ,cd string ,dd string,dda string,ed string,fd string ,gd string,
ha string ,ia string ,ja string ,ka string ,la string ,ma string ,na string ,haa string ,
hb string ,ib string ,jb string ,kb string ,lb string ,mb string ,nb string ,hba string ,
hc string ,ic string ,jc string ,kc string ,lc string ,mc string ,nc string ,hca string ,
hd string ,id string ,jd string ,kd string ,ld string ,md string ,nd string ,hda string ,
he string ,ie string ,je string ,ke string ,le string ,me string ,ne string ,hea string ,
hf string ,iff string ,jf string ,kf string ,lf string ,mf string ,nf string ,hfa string
)
STORED AS INPUTFORMAT 'org.dongfangguoxin.test.MyDemoInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
All done!