案例:
1. 把原始log数据加载到表beifenglog中;
2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;
3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;
1. 把原始log数据加载到表beifenglog中;
create table beifenglog( remote_addr string, remote_user string, time_local string, request string, status string, body_bytes_sent string, request_body string, http_referer string, http_user_agent string, http_x_forwarded_for string, host string) row format serde ‘org.apache.hadoop.hive.contrib.serde2.RegexSerDe‘ with serdeproperties( "input.regex" = "(\\\"[\\d\\.]+\\\") (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\".*?\\\") (\\\"\\d+\\\") (\\\"\\d+\\\") ([^ ]+) (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\"[^ ]+\\\") (\\\"[^ ]+\\\")" ) stored as textfile;
加载原表数据
load data local inpath ‘/opt/test/beifenglog.data‘ overwrite into table beifenglog;
2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;
create table beifenglog_hour_visit( remote_addr string, time_local string, request string, http_referer string) row format delimited fields terminated by ‘\t‘ stored as orc tblproperties ("orc.compression"="snappy"); insert overwrite table beifenglog_hour_visit select remote_addr,time_local,request,http_referer from beifenglog ;
报错1:
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.ClassNotFoundException: Class org.apache.hadoop.hive.contrib.serde2.RegexSerDe not found
at org.apache.hadoop.hive.ql.exec.MapOperator.getConvertedOI(MapOperator.java:334)
at org.apache.hadoop.hive.ql.exec.MapOperator.setChildren(MapOperator.java:352)
at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.configure(ExecMapper.java:126)
解决: hive-site.xml
<property> <name>hive.aux.jars.path</name> <value>file:///opt/modules/cdh/hive-0.13.1-cdh5.3.6/lib/hive-contrib-0.13.1-cdh5.3.6.jar</value> <description>Added by tiger.zeng on 20120202.These JAR file are available to all users for all jobs</description> </property>
报错2:
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.IllegalArgumentException: No enum constant org.apache.hadoop.hive.ql.io.orc.CompressionKind.snappy
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.createBucketFiles(FileSinkOperator.java:469)
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.processOp(FileSinkOperator.java:550)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.SelectOperator.processOp(SelectOperator.java:84)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.TableScanOperator.processOp(TableScanOperator.java:92)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:549)
... 9 more
解决:重新创建子表
("orc.compress"="snappy") --> ("orc.compression"="snappy")
3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;
add jar /opt/test/hive.jar; create temporary function removequote as ‘org.gh.hadoop.hive.test.RemoveQuote‘; create temporary function formatdate as ‘org.gh.hadoop.hive.test.FormatDate‘; insert overwrite table beifenglog_hour_visit select removequote(remote_addr),formatdate(removequote(time_local)),removequote(request),removequote(http_referer) from beifenglog ;
FormatData.java内容如下:
import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; import org.apache.hadoop.hive.ql.exec.UDF; public class FormatDate extends UDF{ private SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); private SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); public String evaluate(String dateStr){ if(dateStr == null){ return null; } Date date = null; String ret = null; try{ date = inputFormat.parse(dateStr); ret = outputFormat.format(date); }catch(Exception e){ e.printStackTrace(); return null; } return ret; } }
RemoveQuote.java内容如下:
package org.gh.hadoop.hive.test; import org.apache.hadoop.hive.ql.exec.UDF; public class RemoveQuote extends UDF{ public String evaluate(String str){ if(str == null){ return null; } return str.replaceAll("\"", ""); } }