Hive | ETL Cleaning & Query Exercises

ETL: Cleaning the Data

Add the Maven dependencies (pom.xml):

<dependencies>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.2</version>
    </dependency>
</dependencies>

ETLUtil.java

public class ETLUtil {
    /**
     * Cleans one raw log line: drops malformed records (fewer than 9 fields),
     * strips spaces from the category field, and re-joins the fields so the
     * first 9 stay tab-separated and the related-video IDs are joined by "&".
     */
    public static String etl(String original) {
        StringBuilder stringBuilder = new StringBuilder();
        String[] fields = original.split("\t");
        if (fields.length < 9) {
            return null; // malformed record, filter it out
        }
        // Normalize the category field: remove spaces inside it
        fields[3] = fields[3].replace(" ", "");
        for (int i = 0; i < fields.length; i++) {
            if (i == fields.length - 1) {
                // last field: no trailing separator
                stringBuilder.append(fields[i]);
            } else if (i < 9) {
                // first 9 fields remain tab-separated
                stringBuilder.append(fields[i]).append("\t");
            } else {
                // related-video IDs are concatenated with "&"
                stringBuilder.append(fields[i]).append("&");
            }
        }
        return stringBuilder.toString();
    }
}

ETLMapper.java

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String original = value.toString();
        String etlString = ETLUtil.etl(original);
        if (StringUtils.isNotEmpty(etlString)) {
            k.set(etlString);
            context.write(k, NullWritable.get());
            context.getCounter("ETL", "True").increment(1);  // records kept
        } else {
            context.getCounter("ETL", "False").increment(1); // records filtered out
        }
    }
}

ETLDriver.java

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ETLDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(ETLDriver.class);
        job.setMapperClass(ETLMapper.class);
        job.setNumReduceTasks(0); // map-only job: no shuffle, no reduce

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir -p /guli/user
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/video
[kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/etl
[kris@hadoop102 datas]$ hadoop fs -moveFromLocal user.txt /guli/user
[kris@hadoop102 datas]$ hadoop fs -moveFromLocal *.txt /guli/video
[kris@hadoop102 hadoop-2.7.2]$ hadoop jar ETLVideo.jar com.atguigu.etl.ETLDriver /guli/video /guli/video_etl
 ETL
                False=5792
                True=743569
Create the tables:
create external table gulivideo_ori(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile
location '/guli/video_etl';

create external table gulivideo_user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited
fields terminated by "\t"
stored as textfile
location '/guli/user';

create table gulivideo_orc(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;

create table gulivideo_user_orc(
    uploader string,
    videos int,
    friends int)
row format delimited
fields terminated by "\t"
stored as orc;

0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_orc select * from gulivideo_ori;
0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_user_orc select * from gulivideo_user_ori;
                
1.-- Top 10 videos by view count
select videoid, uploader, views from gulivideo_orc
order by views desc limit 10;
+--------------+------------------+-----------+--+
|   videoid    |     uploader     |   views   |
+--------------+------------------+-----------+--+
| dMH0bHeiRNg | judsonlaipply | 42513417 |
| 0XxI-hvPRRA | smosh | 20282464 |
| 1dmVU08zVpA | NBC | 16087899 |
| RB-wUgnyGv0 | ChrisInScotland | 15712924 |
| QjA5faZF1A8 | guitar90 | 15256922 |
| -_CSo1gOd48 | tasha | 13199833 |
| 49IDp76kjPw | TexMachina | 11970018 |
| tYnn51C3X_w | CowSayingMoo | 11823701 |
| pv5zWaTEVkI | OkGo | 11672017 |
| D2kJZOfq7zk | mrWoot | 11184051 |
+--------------+------------------+-----------+--+
10 rows selected (22.612 seconds)
Two rules for using GROUP BY:
(1) Every column in the SELECT list must either be inside an aggregate function or appear in the GROUP BY clause.
(2) To filter, use WHERE to filter rows before grouping, or HAVING to filter groups after grouping.
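A minimal sketch of both rules on the user table (the friends filter and the threshold of 10 are arbitrary, just for illustration):

select uploader, sum(videos) total_videos
from gulivideo_user_orc
where friends > 0            -- rule (2): WHERE filters rows before grouping
group by uploader            -- rule (1): uploader is grouped, videos is aggregated
having sum(videos) > 10;     -- rule (2): HAVING filters groups after grouping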

--2. Top 10 video categories by heat (heat = how many videoids, i.e. unique videos, a category contains; the categories are then ordered by that count. Grouping alone is not enough — grouping produces the counts, but the groups themselves still have to be ordered globally, not within a group.)
① Explode each video's categories:
select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories
② Rank the categories by heat:
select t1.videoid, t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1
group by t1.categories order by num desc limit 10;
---> Put together — t1.videoid cannot appear in the SELECT list (it is neither grouped nor aggregated):
select t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1
group by t1.categories order by num desc limit 10;

+----------------+---------+--+
| t1.categories  |   num   |
+----------------+---------+--+
| Music | 179049 |
| Entertainment | 127674 |
| Comedy | 87818 |
| Animation | 73293 |
| Film | 73293 |
| Sports | 67329 |
| Gadgets | 59817 |
| Games | 59817 |
| Blogs | 48890 |
| People | 48890 |
+----------------+---------+--+
10 rows selected (70.01 seconds)
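For intuition about LATERAL VIEW EXPLODE, which the query above relies on: it emits one output row per array element, duplicating the other columns. A sketch (the row contents here are hypothetical):

-- a hypothetical row videoid='v001', category=array('Music','Comedy')
-- comes out as two rows: ('v001','Music') and ('v001','Comedy')
select videoid, categories
from gulivideo_orc
lateral view explode(category) tbl as categories
limit 5;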
3.-- For the 20 most-viewed videos: the categories they belong to, and how many Top-20 videos each category contains

// A naive attempt fails with: Expression not in GROUP BY key 'videoid'
// and then with: Expression not in GROUP BY key 'views' — ORDER BY references views, so views must also appear (grouped or aggregated) in the SELECT list.
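A sketch of the failure and the fix — a bare views in the SELECT list triggers the error, while grouping it or wrapping it in an aggregate resolves it:

-- fails: Expression not in GROUP BY key 'views'
-- select categories, views, count(videoid) from ... group by categories;
-- works: views is aggregated
select categories, max(views) top_views, count(videoid) num
from (select videoid, views, categories
      from gulivideo_orc lateral view explode(category) tbl as categories) t1
group by categories;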
① The 20 most-viewed videos:
select videoid, category, views from gulivideo_orc order by views desc limit 20
② Explode category to get the categories they belong to (t1 stands for the query above):
select videoid, categories, views from t1 lateral view explode(category) tbl as categories
---> The two combined:
select t1.videoid, categories, t1.views from (select videoid, category, views from gulivideo_orc order by views desc limit 20
) t1 lateral view explode(category) tbl as categories;
+--------------+----------------+-----------+--+
|  t1.videoid  |   categories   | t1.views  |
+--------------+----------------+-----------+--+
| dMH0bHeiRNg  | Comedy         | 42513417  |
| 0XxI-hvPRRA  | Comedy         | 20282464  |
| 1dmVU08zVpA  | Entertainment  | 16087899  |
| RB-wUgnyGv0  | Entertainment  | 15712924  |
| QjA5faZF1A8  | Music          | 15256922  |
| -_CSo1gOd48  | People         | 13199833  |
| -_CSo1gOd48  | Blogs          | 13199833  |
| 49IDp76kjPw  | Comedy         | 11970018  |
| tYnn51C3X_w  | Music          | 11823701  |
| pv5zWaTEVkI  | Music          | 11672017  |
| D2kJZOfq7zk  | People         | 11184051  |
| D2kJZOfq7zk  | Blogs          | 11184051  |
| vr3x_RRJdd4  | Entertainment  | 10786529  |
| lsO6D1rwrKc  | Entertainment  | 10334975  |
| 5P6UU6m3cqk  | Comedy         | 10107491  |
| 8bbTtPL1jRs  | Music          | 9579911   |
| _BuRwH59oAo  | Comedy         | 9566609   |
| aRNzWyD7C9o  | UNA            | 8825788   |
| UMf40daefsI  | Music          | 7533070   |
| ixsZy2425eY  | Entertainment  | 7456875   |
| MNxwAU_xAMk  | Comedy         | 7066676   |
| RUCZJVJ_M8o  | Entertainment  | 6952767   |
+--------------+----------------+-----------+--+
③ How many Top-20 videos each category contains: take the query above, group by category, and count videoid within each group:
--->
select categories, count(videoid) from (select videoid, category, views from gulivideo_orc order by views desc limit 20
) t1 lateral view explode(category) tbl as categories group by categories
+----------------+------+--+
|   categories   | _c1  |
+----------------+------+--+
| Blogs          | 2    |
| Comedy         | 6    |
| Entertainment  | 6    |
| Music          | 5    |
| People         | 2    |
| UNA            | 1    |
+----------------+------+--+

-- LIMIT cannot appear inside an OVER() window, so how do you take the first few rows of each partition? Wrap the ranked query in a subquery and filter on the rank. (A window PARTITION BY is not a table partition: table partitions split storage into sub-directories, but a query still sees one table.)
select t1.videoid, t1.views, t1.ran, t1.categories from(
select videoid, views, categories, rank() over(partition by categories order by views desc) ran
from gulivideo_orc lateral view explode(category) tbl as categories) t1
where t1.ran <= 5;
+--------------+-----------+---------+----------------+--+
|  t1.videoid  | t1.views  | t1.ran  | t1.categories  |
+--------------+-----------+---------+----------------+--+
| 2GWPOPSXGYI  | 3660009   | 1       | Animals        |
| xmsV9R8FsDA  | 3164582   | 2       | Animals        |
| 12PsUW-8ge4  | 3133523   | 3       | Animals        |
| OeNggIGSKH8  | 2457750   | 4       | Animals        |
| WofFb_eOxxA  | 2075728   | 5       | Animals        |
| sdUUx5FdySs  | 5840839   | 1       | Animation      |
| 6B26asyGKDo  | 5147533   | 2       | Animation      |
| H20dhY01Xjk  | 3772116   | 3       | Animation      |
| 55YYaJIrmzo  | 3356163   | 4       | Animation      |
| JzqumbhfxRo  | 3230774   | 5       | Animation      |
| RjrEQaG5jPM  | 2803140   | 1       | Autos          |
......
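A note on the choice of rank(): tied rows share a rank and the next rank is skipped, so `ran <= 5` can return more than 5 videos per category when views tie. If exactly N rows per category are wanted regardless of ties, row_number() is the usual swap (a variant sketch, not from the original post):

select t1.videoid, t1.views, t1.rn, t1.categories from(
select videoid, views, categories, row_number() over(partition by categories order by views desc) rn
from gulivideo_orc lateral view explode(category) tbl as categories) t1
where t1.rn <= 5;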
4.-- For the Top 50 most-viewed videos, rank the categories of their related videos (views Top 50 ---> relatedid ---> categories). You cannot explode and join in one step: the LATERAL VIEW result is a virtual table, and Hive will not join it directly, so build it up with subqueries:
① The Top 50 by views (call it t1):
select videoid, views, relatedid from gulivideo_orc order by views desc limit 50
② Explode the related IDs in a separate query (t2):
select distinct(tbl.relatedids) rid from t1 lateral view explode(relatedid) tbl as relatedids
③ Join the table back onto itself (t3):
select g.videoid, g.category from t2 join gulivideo_orc g on t2.rid=g.videoid
④ Explode category and rank:
select categories, count(videoid) hot from t3 lateral view explode(category) tbl2 as categories group by categories order by hot desc;

select categories, count(videoid) hot
from(select g.videoid, g.category
from(select distinct(tbl.relatedids) rid
from(select videoid, views, relatedid
from gulivideo_orc order by views desc limit 50) t1 lateral view explode(relatedid) tbl as relatedids) t2
join gulivideo_orc g on t2.rid=g.videoid) t3 lateral view explode(category) tbl2 as categories
group by categories order by hot desc;
+----------------+------+--+
|   categories   | hot  |
+----------------+------+--+
| Comedy         | 217  |
| Entertainment  | 207  |
| Music          | 186  |
| Blogs          | 49   |
| People         | 49   |
| Film           | 46   |
| Animation      | 46   |
| News           | 21   |
| Politics       | 21   |
| Games          | 19   |
| Gadgets        | 19   |
| Sports         | 17   |
| Places         | 12   |
| UNA            | 12   |
| Travel         | 12   |
| Howto          | 12   |
| DIY            | 12   |
| Animals        | 11   |
| Pets           | 11   |
| Autos          | 3    |
| Vehicles       | 3    |
+----------------+------+--+
21 rows selected (115.239 seconds)
5.-- Top 10 hottest videos within each category, using Music as the example
Create a per-category table (one row per video-category pair):
create table gulivideo_category(
videoid string, uploader string, age int, categoryid string, length int, views int, rate float,
ratings int, comments int, relatedid array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
Insert the data:
insert into table gulivideo_category
select videoid, uploader, age, categoryid, length, views, rate, ratings, comments, relatedid
from gulivideo_orc lateral view explode(category) category as categoryid;
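Materializing the exploded view this way avoids repeating LATERAL VIEW in every later query. A quick sanity check on the new table (a sketch; the counts depend on your dataset):

select categoryid, count(*) cnt
from gulivideo_category
group by categoryid
order by cnt desc
limit 5;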
---> To rank every category in one query (paiming is the rank column):
select categoryid, videoid, paiming from (select categoryid, videoid, rank() over(partition by categoryid order by views desc) paiming from gulivideo_category) t1 where t1.paiming <= 10;

select categoryid, videoid, views
from gulivideo_category
where categoryid="music"
order by views desc limit 10;

6.-- Top 10 videos by traffic within each category (using ratings as the proxy), Music as the example
select videoid, ratings
from gulivideo_category
where categoryid="Music"
order by ratings desc limit 10;

7.-- The 10 users who uploaded the most videos, and the Top 20 most-viewed videos among each user's uploads

① The 10 users with the most uploads:
select videos,uploader
from gulivideo_user_orc
order by videos desc limit 10;
② Rank those users' uploads by views (t1 stands for the query above):
select g.videoid, rank() over(partition by g.uploader order by g.views desc) hot from t1 join gulivideo_orc g on t1.uploader = g.uploader
③ Keep the top 20 per uploader (t2 stands for the query above):
select t2.uploader, t2.videoid from t2 where t2.hot <= 20;
Combined:
select t2.uploader, t2.videoid from(
select g.uploader, g.videoid, g.views, rank() over(partition by g.uploader order by g.views desc) hot from
(select uploader,videos
from gulivideo_user_orc
order by videos desc limit 10) t1
left join gulivideo_orc g on t1.uploader=g.uploader) t2
where t2.hot <= 20;
+----------------+--------------+--+
|  t2.uploader   |  t2.videoid  |
+----------------+--------------+--+
| NULL           | NULL         |
| NULL           | NULL         |
| NULL           | NULL         |
| NULL           | NULL         |
| Ruchaneewan    | xbYyjUdhtJw  |
| Ruchaneewan    | 4dkKeIUkN7E  |
| Ruchaneewan    | qCfuQA6N4K0  |
| Ruchaneewan    | TmYbGQaRcNM  |
| Ruchaneewan    | dOlfPsFSjw0  |
| expertvillage  | -IxHBW0YpZw  |
| expertvillage  | BU-fT5XI_8I  |
| expertvillage  | ADOcaBYbMl0  |
...
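The NULL rows come from the LEFT JOIN: Top-10 uploaders with no matching videos in gulivideo_orc are kept, with NULLs filled in from the right side. To drop them, an inner JOIN is the usual swap (a variant sketch):

select t2.uploader, t2.videoid from(
select g.uploader, g.videoid, g.views, rank() over(partition by g.uploader order by g.views desc) hot from
(select uploader, videos
from gulivideo_user_orc
order by videos desc limit 10) t1
join gulivideo_orc g on t1.uploader=g.uploader) t2
where t2.hot <= 20;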

8.-- Top 10 most-viewed videos within each category

select t.categoryid, t.videoid, t.ranking
from(
select categoryid, videoid, rank() over(partition by categoryid order by views desc) ranking
from gulivideo_category) t
where t.ranking <= 10;

+----------------+--------------+------------+--+
|  t.categoryid  |  t.videoid   | t.ranking  |
+----------------+--------------+------------+--+
| Animals        | 2GWPOPSXGYI  | 1          |
| Animals        | xmsV9R8FsDA  | 2          |
| Animals        | 12PsUW-8ge4  | 3          |
| Animals        | OeNggIGSKH8  | 4          |
| Animals        | WofFb_eOxxA  | 5          |
| Animals        | AgEmZ39EtFk  | 6          |
| Animals        | a-gW3RbJd8U  | 7          |
| Animals        | 8CL2hetqpfg  | 8          |
| Animals        | QmroaYVD_so  | 9          |
| Animals        | Sg9x5mUjbH8  | 10         |
| Animation      | sdUUx5FdySs  | 1          |
| Animation      | 6B26asyGKDo  | 2          |
| Animation      | H20dhY01Xjk  | 3          |
| Animation      | 55YYaJIrmzo  | 4          |
| Animation      | JzqumbhfxRo  | 5          |
| Animation      | eAhfZUZiwSE  | 6          |
| Animation      | h7svw0m-wO0  | 7          |
| Animation      | tAq3hWBlalU  | 8          |
| Animation      | AJzU3NjDikY  | 9          |
| Animation      | ElrldD02if0  | 10         |
| Autos          | RjrEQaG5jPM  | 1          |
......
210 rows selected (24.379 seconds)

Original post: https://www.cnblogs.com/shengyang17/p/10404223.html

