知识点:
1、Hive复合数据类型:array
collect_set
collect_list
array_contains
sort_array
2、lateral view
explode(array)
lateral view outer
需求:
click_log : cookie_id ad_id time
ad_list: ad_id ad_url catalog_list
统计:
cookie_catalog: cookie_id ad_catalog cat_weight
创建点击日志表:
CREATE TABLE click_log ( cookie_id STRING , ad_id STRING , ts STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
load data local inpath '/home/spark/software/data/click_log.txt' overwrite into table click_log;
select * from click_log; 11 ad_101 2014-05-01 06:01:12.334+01 22 ad_102 2014-05-01 07:28:12.342+01 33 ad_103 2014-05-01 07:50:12.33+01 11 ad_104 2014-05-01 09:27:12.33+01 22 ad_103 2014-05-01 09:03:12.324+01 33 ad_102 2014-05-02 19:10:12.343+01 11 ad_101 2014-05-02 09:07:12.344+01 35 ad_105 2014-05-03 11:07:12.339+01 22 ad_104 2014-05-03 12:59:12.743+01 77 ad_103 2014-05-03 18:04:12.355+01 99 ad_102 2014-05-04 00:36:39.713+01 33 ad_101 2014-05-04 19:10:12.343+01 11 ad_101 2014-05-05 09:07:12.344+01 35 ad_102 2014-05-05 11:07:12.339+01 22 ad_103 2014-05-05 12:59:12.743+01 77 ad_104 2014-05-05 18:04:12.355+01 99 ad_105 2014-05-05 20:36:39.713+01
collect_set功能:去除集合中重复的元素,结果是无序的。
select cookie_id, collect_set(ad_id) as orders from click_log group by cookie_id; 11 ["ad_101","ad_104"] 22 ["ad_104","ad_102","ad_103"] 33 ["ad_101","ad_102","ad_103"] 35 ["ad_105","ad_102"] 77 ["ad_104","ad_103"] 99 ["ad_105","ad_102"]
select cookie_id, collect_set(ad_id) as orders from click_log where ts > '2014-05-02' group by cookie_id; 11 ["ad_101"] 22 ["ad_104","ad_103"] 33 ["ad_101","ad_102"] 35 ["ad_105","ad_102"] 77 ["ad_104","ad_103"] 99 ["ad_105","ad_102"]
查询每个cookie_id访问过的ad_id的访问量
select cookie_id, ad_id, count(1) as amount from click_log group by cookie_id, ad_id; 11 ad_101 3 11 ad_104 1 22 ad_102 1 22 ad_103 2 22 ad_104 1 33 ad_101 1 33 ad_102 1 33 ad_103 1 35 ad_102 1 35 ad_105 1 77 ad_103 1 77 ad_104 1 99 ad_102 1 99 ad_105 1
hive0.13+才支持collect_list:不去除集合中的重复元素
select cookie_id, collect_list(ad_id) as orders from click_log group by cookie_id; 11 ["ad_101","ad_104","ad_101","ad_101"] 22 ["ad_102","ad_103","ad_104","ad_103"] 33 ["ad_103","ad_102","ad_101"] 35 ["ad_105","ad_102"] 77 ["ad_103","ad_104"] 99 ["ad_102","ad_105"]
创建广告类别表:
CREATE TABLE ad_list ( ad_id STRING , url STRING , catalogs array<STRING> ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY '|';
load data local inpath '/home/spark/software/data/ad_list.txt' overwrite into table ad_list;
select * from ad_list; ad_101 http://www.google.com ["catalog8","catalog1"] ad_102 http://www.sohu.com ["catalog6","catalog3"] ad_103 http://www.baidu.com ["catalog7"] ad_104 http://www.qq.com ["catalog5","catalog1","catalog4","catalog9"] ad_105 http://sina.com []
CREATE TABLE ad_list_string ( ad_id STRING , url STRING , catalogs STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
load data local inpath '/home/spark/software/data/ad_list.txt' overwrite into table ad_list_string;
select * from ad_list_string; ad_101 http://www.google.com catalog8|catalog1 ad_102 http://www.sohu.com catalog6|catalog3 ad_103 http://www.baidu.com catalog7 ad_104 http://www.qq.com catalog5|catalog1|catalog4|catalog9 ad_105 http://sina.com
查询每个cookie_id访问过的ad_id的访问量以及该广告所属的类别
select click.cookie_id, click.ad_id, click.amount, ad_list_string.catalogs as orders from ( select cookie_id, ad_id, count(1) as amount from click_log group by cookie_id, ad_id ) click join ad_list_string on (ad_list_string.ad_id = click.ad_id); 11 ad_101 3 catalog8|catalog1 11 ad_104 1 catalog5|catalog1|catalog4|catalog9 22 ad_102 1 catalog6|catalog3 22 ad_103 2 catalog7 22 ad_104 1 catalog5|catalog1|catalog4|catalog9 33 ad_101 1 catalog8|catalog1 33 ad_102 1 catalog6|catalog3 33 ad_103 1 catalog7 35 ad_102 1 catalog6|catalog3 35 ad_105 1 77 ad_103 1 catalog7 77 ad_104 1 catalog5|catalog1|catalog4|catalog9 99 ad_102 1 catalog6|catalog3 99 ad_105 1
LATERAL VIEW OUTER explode(catalogs)将数组打开变成横向的视图形式,只有hive支持,impala等其他的是不支持的
select ad_id, catalog from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog; ad_101 catalog8 ad_101 catalog1 ad_102 catalog6 ad_102 catalog3 ad_103 catalog7 ad_104 catalog5 ad_104 catalog1 ad_104 catalog4 ad_104 catalog9 ad_105 NULL
注意此处没有使用OUTER,那么就没有ad_105的值,带OUTER的保留空的数据行,类似于left outer join, 使用时一般都带上
select ad_id, catalog from ad_list LATERAL VIEW explode(catalogs) t AS catalog; ad_101 catalog8 ad_101 catalog1 ad_102 catalog6 ad_102 catalog3 ad_103 catalog7 ad_104 catalog5 ad_104 catalog1 ad_104 catalog4 ad_104 catalog9
此时数组中的结果是无序的
select ad_id, collect_set(catalog) from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog group by ad_id; ad_101 ["catalog8","catalog1"] ad_102 ["catalog3","catalog6"] ad_103 ["catalog7"] ad_104 ["catalog9","catalog1","catalog5","catalog4"] ad_105 []
按照catalogs内部的元素进行排序,sort_array在spark中暂不支持
select ad_id, sort_array(catalogs) from ad_list; ad_101 ["catalog1","catalog8"] ad_102 ["catalog3","catalog6"] ad_103 ["catalog7"] ad_104 ["catalog1","catalog4","catalog5","catalog9"] ad_105 []
判断数组中是否包含指定的数值,常用在where条件中
select ad_id, catalogs from ad_list where array_contains(catalogs, 'catalog1'); ad_101 ["catalog8","catalog1"] ad_104 ["catalog5","catalog1","catalog4","catalog9"]
统计每个cookie_id访问过哪些类别,此时统计输出结果是无序的
select click.cookie_id, ad.catalog from click_log click left outer join ( select ad_id, catalog from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog ) ad on (click.ad_id = ad.ad_id); 11 catalog8 11 catalog1 22 catalog6 22 catalog3 33 catalog7 11 catalog5 11 catalog1 11 catalog4 11 catalog9 22 catalog7 33 catalog6 33 catalog3 11 catalog8 11 catalog1 35 NULL 22 catalog5 22 catalog1 22 catalog4 22 catalog9 77 catalog7 99 catalog6 99 catalog3 33 catalog8 33 catalog1 11 catalog8 11 catalog1 35 catalog6 35 catalog3 22 catalog7 77 catalog5 77 catalog1 77 catalog4 77 catalog9 99 NULL
将统计结果写入cookie_cats表中,按照cookie_id升序、访问次数(weight)降序排列
create table cookie_cats as select click.cookie_id, ad.catalog, count(1) as weight from click_log click left outer join ( select ad_id, catalog from ad_list LATERAL VIEW OUTER explode(catalogs) t AS catalog ) ad on (click.ad_id = ad.ad_id) group by click.cookie_id, ad.catalog order by cookie_id, weight desc;
select * from cookie_cats; 11 catalog1 4 11 catalog8 3 11 catalog5 1 11 catalog9 1 11 catalog4 1 22 catalog7 2 22 catalog9 1 22 catalog1 1 22 catalog3 1 22 catalog4 1 22 catalog5 1 22 catalog6 1 33 catalog3 1 33 catalog8 1 33 catalog7 1 33 catalog6 1 33 catalog1 1 35 NULL 1 35 catalog3 1 35 catalog6 1 77 catalog1 1 77 catalog4 1 77 catalog5 1 77 catalog7 1 77 catalog9 1 99 NULL 1 99 catalog3 1 99 catalog6 1
输出集合的元素是无序的
select cookie_id, collect_set(catalog) from cookie_cats group by cookie_id; 11 ["catalog8","catalog9","catalog1","catalog5","catalog4"] 22 ["catalog9","catalog1","catalog3","catalog5","catalog4","catalog7","catalog6"] 33 ["catalog8","catalog1","catalog3","catalog7","catalog6"] 35 ["catalog3","catalog6"] 77 ["catalog9","catalog1","catalog5","catalog4","catalog7"] 99 ["catalog3","catalog6"]
对集合中的元素进行排序
select cookie_id, sort_array(collect_set(catalog)) from cookie_cats group by cookie_id; 11 ["catalog1","catalog4","catalog5","catalog8","catalog9"] 22 ["catalog1","catalog3","catalog4","catalog5","catalog6","catalog7","catalog9"] 33 ["catalog1","catalog3","catalog6","catalog7","catalog8"] 35 ["catalog3","catalog6"] 77 ["catalog1","catalog4","catalog5","catalog7","catalog9"] 99 ["catalog3","catalog6"]
时间: 2024-10-06 04:49:28