ETL清洗数据
导Jar包
<dependencies> <dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> <version>RELEASE</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-client</artifactId> <version>2.7.2</version> </dependency> </dependencies>
ETLUtil.java
/**
 * Cleans a single raw, tab-separated video record so that it can be loaded
 * into a table whose columns are separated by tabs and whose collection
 * items are separated by {@code &}.
 */
public class ETLUtil {

    /** Number of leading columns that stay tab-separated; any further fields are joined with '&'. */
    private static final int FIXED_FIELD_COUNT = 9;

    /**
     * Normalises one raw record.
     *
     * <ul>
     *   <li>The line is split on tabs; lines with fewer than nine fields are rejected.</li>
     *   <li>All spaces are removed from the fourth field
     *       (e.g. {@code "People & Blogs"} becomes {@code "People&Blogs"}).</li>
     *   <li>The first nine fields are joined with tabs; every field after the ninth
     *       is joined with {@code &}.</li>
     * </ul>
     *
     * @param original the raw input line; may be {@code null}
     * @return the cleaned line, or {@code null} when the input is {@code null}
     *         or has fewer than nine tab-separated fields
     */
    public static String etl(String original) {
        if (original == null) {
            return null;
        }

        String[] fields = original.split("\t");
        if (fields.length < FIXED_FIELD_COUNT) {
            // Record does not have all mandatory columns: discard it.
            return null;
        }

        // Strip the spaces around '&' inside the fourth field.
        fields[3] = fields[3].replace(" ", "");

        StringBuilder cleaned = new StringBuilder();
        int lastIndex = fields.length - 1;
        // Iterate over ALL fields. The previous bound (length - 1) silently dropped
        // the last field and left a dangling separator at the end of the line.
        for (int i = 0; i <= lastIndex; i++) {
            cleaned.append(fields[i]);
            if (i < lastIndex) {
                // A tab follows each of the first nine fields, '&' follows the rest.
                cleaned.append(i < FIXED_FIELD_COUNT ? "\t" : "&");
            }
        }
        return cleaned.toString();
    }
}
ETLMapper.java
/**
 * Map-only task that runs every input line through {@link ETLUtil#etl(String)}
 * and writes the cleaned line as the output key with a {@link NullWritable} value.
 * Each record increments either the "True" or the "False" counter of the "ETL" group.
 */
public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** Output key object, reused for every record. */
    Text k = new Text();

    /**
     * Cleans one input line and emits it when the cleaned result is not empty.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   the raw input line
     * @param context task context used for output and counters
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String cleanedLine = ETLUtil.etl(value.toString());

        // Rejected record: count it and stop.
        if (!StringUtils.isNotEmpty(cleanedLine)) {
            context.getCounter("ETL", "False").increment(1);
            return;
        }

        k.set(cleanedLine);
        context.write(k, NullWritable.get());
        context.getCounter("ETL", "True").increment(1);
    }
}
ETLDriver.java
/**
 * Entry point that configures and submits the map-only ETL job.
 */
public class ETLDriver {

    /**
     * Builds the job, runs it and exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Jar and mapper wiring; zero reduce tasks makes this a map-only job.
        job.setJarByClass(ETLDriver.class);
        job.setMapperClass(ETLMapper.class);
        job.setNumReduceTasks(0);

        // Key/value types of the map output and of the final output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and output locations come from the command line.
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean succeeded = job.waitForCompletion(true);
        int exitCode = succeeded ? 0 : 1;
        System.exit(exitCode);
    }
}
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir -p /guli/user [kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/video [kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/etl [kris@hadoop102 datas]$ hadoop fs -moveFromLocal user.txt /guli/user [kris@hadoop102 datas]$ hadoop fs -moveFromLocal *.txt /guli/video [kris@hadoop102 hadoop-2.7.2]$ hadoop jar ETLVideo.jar com.atguigu.etl.ETLDriver /guli/video /guli/video_etl ETL False=5792 True=743569
创建表: create external table gulivideo_ori( videoId string, uploader string, age int, category array<string>, length int, views int, rate float, ratings int, comments int, relatedId array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as textfile location '/guli/video_etl'; create external table gulivideo_user_ori( uploader string, videos int, friends int) row format delimited fields terminated by "\t" stored as textfile location '/guli/user'; create table gulivideo_orc( videoId string, uploader string, age int, category array<string>, length int, views int, rate float, ratings int, comments int, relatedId array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as orc; create table gulivideo_user_orc( uploader string, videos int, friends int) row format delimited fields terminated by "\t" stored as orc; 0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_orc select * from gulivideo_ori; 0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_user_orc select * from gulivideo_user_ori;
1.--统计视频观看数Top10 select videoid, uploader, views from gulivideo_orc order by views desc limit 10; +--------------+------------------+-----------+--+ | videoid | uploader | views | +--------------+------------------+-----------+--+ | dMH0bHeiRNg | judsonlaipply | 42513417 | | 0XxI-hvPRRA | smosh | 20282464 | | 1dmVU08zVpA | NBC | 16087899 | | RB-wUgnyGv0 | ChrisInScotland | 15712924 | | QjA5faZF1A8 | guitar90 | 15256922 | | -_CSo1gOd48 | tasha | 13199833 | | 49IDp76kjPw | TexMachina | 11970018 | | tYnn51C3X_w | CowSayingMoo | 11823701 | | pv5zWaTEVkI | OkGo | 11672017 | | D2kJZOfq7zk | mrWoot | 11184051 | +--------------+------------------+-----------+--+ 10 rows selected (22.612 seconds) 使用group by的两个要素: (1) 出现在select后面的字段 要么是是聚合函数中的,要么就是group by 中的. (2) 要筛选结果 可以先使用where 再用group by 或者先用group by 再用having --2.统计视频类别热度Top10 (类别的videoid--视频的唯一id越多就代表热度高, 类别排序的多少排序;不能分组分组是在组内排序) ①统计视频类别: select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories ②按类别的热度排名 select t1.videoid, t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1 group by t1.categories order by num desc limit 10; --->拼一块:t1.videoid不能出现在select后边, select t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1 group by t1.categories order by num desc limit 10; +----------------+---------+--+ | t1.categories | num | +----------------+---------+--+ | Music | 179049 | | Entertainment | 127674 | | Comedy | 87818 | | Animation | 73293 | | Film | 73293 | | Sports | 67329 | | Gadgets | 59817 | | Games | 59817 | | Blogs | 48890 | | People | 48890 | +----------------+---------+--+ 10 rows selected (70.01 seconds)
3.--统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数 //所有类别中包含Top20视频的个数 //Expression not in GROUP BY key ‘videoid‘ not in GROUP BY key ‘views‘,后边有views,select后必须加views ############ ①观看数最高的20个视频: select videoid, category, views from gulivideo_orc order by views desc limit 20 ②把类别category炸开--所属类别 select videoid, categories, views from t1 lateral view explode(category) tbl categories --->前两句合起: select t1.videoid, categories, t1.views from (select videoid, category, views from gulivideo_orc order by views desc limit 20 ) t1 lateral view explode(category) tbl as categories; +--------------+----------------+-----------+--+ | t1.videoid | categories | t1.views | +--------------+----------------+-----------+--+ | dMH0bHeiRNg | Comedy | 42513417 | | 0XxI-hvPRRA | Comedy | 20282464 | | 1dmVU08zVpA | Entertainment | 16087899 | | RB-wUgnyGv0 | Entertainment | 15712924 | | QjA5faZF1A8 | Music | 15256922 | | -_CSo1gOd48 | People | 13199833 | | -_CSo1gOd48 | Blogs | 13199833 | | 49IDp76kjPw | Comedy | 11970018 | | tYnn51C3X_w | Music | 11823701 | | pv5zWaTEVkI | Music | 11672017 | | D2kJZOfq7zk | People | 11184051 | | D2kJZOfq7zk | Blogs | 11184051 | | vr3x_RRJdd4 | Entertainment | 10786529 | | lsO6D1rwrKc | Entertainment | 10334975 | | 5P6UU6m3cqk | Comedy | 10107491 | | 8bbTtPL1jRs | Music | 9579911 | | _BuRwH59oAo | Comedy | 9566609 | | aRNzWyD7C9o | UNA | 8825788 | | UMf40daefsI | Music | 7533070 | | ixsZy2425eY | Entertainment | 7456875 | | MNxwAU_xAMk | Comedy | 7066676 | | RUCZJVJ_M8o | Entertainment | 6952767 | +--------------+----------------+-----------+--+ ③类别中包含top20的视频的个数:在上条基础上加上按类别分组,计数组内videoid计数 ---> select categories, count(videoid) from (select videoid, category, views from gulivideo_orc order by views desc limit 20 ) t1 lateral view explode(category) tbl as categories group by categories +----------------+------+--+ | categories | _c1 | +----------------+------+--+ | Blogs | 2 | | Comedy | 6 | | Entertainment | 6 | | Music | 5 | | People | 2 | | UNA | 1 | 
+----------------+------ -- over里边不能使用limit, 怎么获取分区排序前几个呢?需要使用一个子查询;分区是数据存储上的分子文件,查询时还是在一张表 select t1.videoid, t1.views, t1.ran, t1.categories from( select videoid, views, categories, rank() over(partition by categories order by views desc) ran from gulivideo_orc lateral view explode(category) tbl as categories) t1 where t1.ran <= 5; +--------------+-----------+---------+----------------+--+ | t1.videoid | t1.views | t1.ran | t1.categories | +--------------+-----------+---------+----------------+--+ | 2GWPOPSXGYI | 3660009 | 1 | Animals | | xmsV9R8FsDA | 3164582 | 2 | Animals | | 12PsUW-8ge4 | 3133523 | 3 | Animals | | OeNggIGSKH8 | 2457750 | 4 | Animals | | WofFb_eOxxA | 2075728 | 5 | Animals | | sdUUx5FdySs | 5840839 | 1 | Animation | | 6B26asyGKDo | 5147533 | 2 | Animation | | H20dhY01Xjk | 3772116 | 3 | Animation | | 55YYaJIrmzo | 3356163 | 4 | Animation | | JzqumbhfxRo | 3230774 | 5 | Animation | | RjrEQaG5jPM | 2803140 | 1 | Autos ......
4.--统计视频观看数Top50所关联视频的所属类别排序Top50---relatedid---种类---; 炸开之后直接join,因它是张虚拟表,hive是不支持的 select videoid, views, relatedid from gulivideo_orc order by views desc limit 50 炸开单独写一个sql: t1 select distinct(tbl.relatedids) rid from t1 lateral view explode(relatedid) tbl as relatedids 自己join自己下: t2 select g.videoid, g.category from t2 left join gulivideo_orc g on t2.rid=g.videoid 把category炸开并排序: select categories, count(videoid) hot from t3 lateral view explode(category) tbl2 as categories group by categories order by hot desc; select categories, count(videoid) hot from(select g.videoid, g.category from(select distinct(tbl.relatedids) rid from(select videoid, views, relatedid from gulivideo_orc order by views desc limit 50) t1 lateral view explode(relatedid) tbl as relatedids) t2 join gulivideo_orc g on t2.rid=g.videoid) t3 lateral view explode(category) tbl2 as categories group by categories order by hot desc; +----------------+------+--+ | categories | hot | +----------------+------+--+ | Comedy | 217 | | Entertainment | 207 | | Music | 186 | | Blogs | 49 | | People | 49 | | Film | 46 | | Animation | 46 | | News | 21 | | Politics | 21 | | Games | 19 | | Gadgets | 19 | | Sports | 17 | | Places | 12 | | UNA | 12 | | Travel | 12 | | Howto | 12 | | DIY | 12 | | Animals | 11 | | Pets | 11 | | Autos | 3 | | Vehicles | 3 | +----------------+------+--+ 21 rows selected (115.239 seconds)
5.--统计每个类别中的视频热度Top10,以Music为例 创建类别表: create table gulivideo_category( videoid string, uploader string, age int, categoryid string, length int, views int, rate float, ratings int, comments int, relatedid array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as orc; 插入数据: insert into table gulivideo_category select videoid, uploader, age, categoryid, length, views, rate, ratings, comments, relatedid from gulivideo_orc lateral view explode(category) category as categoryid; --->把一张表全查出来: select categoryid, videoid, paiming from (select categoryid, videoid, rank() over(partition by categoryid order by views desc) paiming from gulivideo_category) t1 where t1.paiming <= 10; select categoryid, videoid, views from gulivideo_category where categoryid="Music" order by views desc limit 10; 6.--统计每个类别中视频流量Top10,以Music为例 select videoid, ratings from gulivideo_category where categoryid="Music" order by ratings desc limit 10; 7.--统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频 ①上传视频最多的用户Top10: select videos,uploader from gulivideo_user_orc order by videos desc limit 10; ②找出这10个人上传的视频 select g.videoid, rank() over(partition by g.uploader order by g.views desc) hot from t1 join gulivideo_orc g on t1.uploader = g.uploader ③找出前20 select t2.uploader, t2.videoid from t2 where t2.hot <= 20; select t2.uploader, t2.videoid from( select g.uploader, g.videoid, g.views, rank() over(partition by g.uploader order by g.views desc) hot from (select uploader,videos from gulivideo_user_orc order by videos desc limit 10) t1 left join gulivideo_orc g on t1.uploader=g.uploader) t2 where t2.hot <= 20; +----------------+--------------+--+ | t2.uploader | t2.videoid | +----------------+--------------+--+ | NULL | NULL | | NULL | NULL | | NULL | NULL | | NULL | NULL | | Ruchaneewan | xbYyjUdhtJw | | Ruchaneewan | 4dkKeIUkN7E | | Ruchaneewan | qCfuQA6N4K0 | | Ruchaneewan | TmYbGQaRcNM | | Ruchaneewan | dOlfPsFSjw0 | | expertvillage | -IxHBW0YpZw | | expertvillage | 
BU-fT5XI_8I | | expertvillage | ADOcaBYbMl0 | ... 8.--统计每个类别视频观看数Top10 select t.categoryid, t.videoid, t.ranking from( select categoryid, videoid, rank() over(partition by categoryid order by views desc) ranking from gulivideo_category) t where t.ranking <= 10; +----------------+--------------+------------+--+ | t.categoryid | t.videoid | t.ranking | +----------------+--------------+------------+--+ | Animals | 2GWPOPSXGYI | 1 | | Animals | xmsV9R8FsDA | 2 | | Animals | 12PsUW-8ge4 | 3 | | Animals | OeNggIGSKH8 | 4 | | Animals | WofFb_eOxxA | 5 | | Animals | AgEmZ39EtFk | 6 | | Animals | a-gW3RbJd8U | 7 | | Animals | 8CL2hetqpfg | 8 | | Animals | QmroaYVD_so | 9 | | Animals | Sg9x5mUjbH8 | 10 | | Animation | sdUUx5FdySs | 1 | | Animation | 6B26asyGKDo | 2 | | Animation | H20dhY01Xjk | 3 | | Animation | 55YYaJIrmzo | 4 | | Animation | JzqumbhfxRo | 5 | | Animation | eAhfZUZiwSE | 6 | | Animation | h7svw0m-wO0 | 7 | | Animation | tAq3hWBlalU | 8 | | Animation | AJzU3NjDikY | 9 | | Animation | ElrldD02if0 | 10 | | Autos | RjrEQaG5jPM | 1 | ...... 210 rows selected (24.379 seconds)
原文地址:https://www.cnblogs.com/shengyang17/p/10404223.html
时间: 2024-11-13 08:22:09