知识点:
1、Hive复合数据类型map与Lateral View的使用;
map、str_to_map、map_keys、map_values,map与lateral view
2、通过translate进行简单数据保护;
Hive转换函数进行数据保护,确保企业应用信息安全
3、Hive的窗口和分析函数入门;
row_number、rank、dense_rank
创建订单表:
-- Orders fact table: one row per order.
-- `items` is a MAP of item SKU -> quantity.
-- Data file layout implied by the delimiters below:
--   user_id <TAB> ts <TAB> order_id <TAB> sku:qty|sku:qty|...
-- String literals must use plain ASCII single quotes; typographic quotes
-- are not valid HiveQL string delimiters.
CREATE EXTERNAL TABLE f_orders (
    user_id  STRING,
    ts       STRING,
    order_id STRING,
    items    MAP<STRING, BIGINT>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY ':';
加载数据:
-- Load the local sample file into f_orders; OVERWRITE replaces any rows
-- already in the table. The path literal uses ASCII single quotes.
LOAD DATA LOCAL INPATH '/home/spark/software/data/f_orders.txt'
OVERWRITE INTO TABLE f_orders;
查询数据:
-- Show every order with its items map (all four columns of f_orders).
SELECT
    user_id,
    ts,
    order_id,
    items
FROM f_orders;
-- Sample output:
--   11   2014-05-01 06:01:12.334+01  10703007267488  {"item8":2,"item1":1}
--   22   2014-05-01 07:28:12.342+01  10101043505096  {"item6":3,"item3":2}
--   33   2014-05-01 07:50:12.33+01   10103043509747  {"item7":7}
--   11   2014-05-01 09:27:12.33+01   10103043501575  {"item5":5,"item1":1,"item4":1,"item9":1}
--   22   2014-05-01 09:03:12.324+01  10104043514061  {"item1":3}
--   33   2014-05-02 19:10:12.343+01  11003002067594  {"item4":2,"item1":1}
--   11   2014-05-02 09:07:12.344+01  10101043497459  {"item9":1}
--   35   2014-05-03 11:07:12.339+01  10203019269975  {"item5":1,"item1":1}
--   789  2014-05-03 12:59:12.743+01  10401003346256  {"item7":3,"item8":2,"item9":1}
--   77   2014-05-03 18:04:12.355+01  10203019262235  {"item5":2,"item1":1}
--   99   2014-05-04 00:36:39.713+01  10103044681799  {"item9":3,"item1":1}
--   33   2014-05-04 19:10:12.343+01  12345678901234  {"item5":1,"item1":1}
--   11   2014-05-05 09:07:12.344+01  12345678901235  {"item6":1,"item1":1}
--   35   2014-05-05 11:07:12.339+01  12345678901236  {"item5":2,"item1":1}
--   22   2014-05-05 12:59:12.743+01  12345678901237  {"item9":3,"item1":1}
--   77   2014-05-05 18:04:12.355+01  12345678901238  {"item8":3,"item1":1}
--   99   2014-05-05 20:36:39.713+01  12345678901239  {"item9":3,"item1":1}
从map中取值:map_keys, map_values
-- For user 35, list the keys (item SKUs) and values (quantities) of each
-- order's items map as two parallel arrays.
-- The user_id literal uses ASCII single quotes; typographic quotes do not parse.
SELECT
    map_keys(items),
    map_values(items)
FROM f_orders
WHERE user_id = '35';
-- Sample output:
--   ["item5","item1"]  [1,1]
--   ["item5","item1"]  [2,1]
查询订单条目中包含item8的订单
-- Orders whose items map has the key 'item8': map_keys() turns the map's keys
-- into an array, and array_contains() tests membership.
-- The literal uses ASCII single quotes; typographic quotes do not parse.
SELECT *
FROM f_orders
WHERE array_contains(map_keys(items), 'item8');
-- Sample output:
--   11   2014-05-01 06:01:12.334+01  10703007267488  {"item1":1,"item8":2}
--   789  2014-05-03 12:59:12.743+01  10401003346256  {"item7":3,"item8":2,"item9":1}
--   77   2014-05-05 18:04:12.355+01  12345678901238  {"item1":1,"item8":3}
将f_orders中items列打开成横向视图
-- Flatten the items map: one output row per (order, item) pair.
-- explode() on a MAP yields two columns, aliased here as item (key) and
-- amount (value).
SELECT
    user_id,
    order_id,
    item,
    amount
FROM f_orders
LATERAL VIEW explode(items) item_rows AS item, amount;
-- Sample output:
--   11   10703007267488  item8  2
--   11   10703007267488  item1  1
--   22   10101043505096  item6  3
--   22   10101043505096  item3  2
--   33   10103043509747  item7  7
--   11   10103043501575  item5  5
--   11   10103043501575  item1  1
--   11   10103043501575  item4  1
--   11   10103043501575  item9  1
--   22   10104043514061  item1  3
--   33   11003002067594  item4  2
--   33   11003002067594  item1  1
--   11   10101043497459  item9  1
--   35   10203019269975  item5  1
--   35   10203019269975  item1  1
--   789  10401003346256  item7  3
--   789  10401003346256  item8  2
--   789  10401003346256  item9  1
--   77   10203019262235  item5  2
--   77   10203019262235  item1  1
--   99   10103044681799  item9  3
--   99   10103044681799  item1  1
--   33   12345678901234  item5  1
--   33   12345678901234  item1  1
--   11   12345678901235  item6  1
--   11   12345678901235  item1  1
--   35   12345678901236  item5  2
--   35   12345678901236  item1  1
--   22   12345678901237  item9  3
--   22   12345678901237  item1  1
--   77   12345678901238  item8  3
--   77   12345678901238  item1  1
--   99   12345678901239  item9  3
--   99   12345678901239  item1  1
创建订单条目表:
-- Item dimension table: one row per SKU with its unit price and the list of
-- catalogs it belongs to.
-- Data file layout implied by the delimiters below:
--   item_sku <TAB> price <TAB> catalog|catalog|...
-- String literals must use plain ASCII single quotes; typographic quotes
-- are not valid HiveQL string delimiters.
CREATE EXTERNAL TABLE d_items (
    item_sku STRING,
    price    DOUBLE,
    catalogs ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY '|';
加载数据:
-- Load the local sample file into d_items; OVERWRITE replaces any rows
-- already in the table. The path literal uses ASCII single quotes.
LOAD DATA LOCAL INPATH '/home/spark/software/data/d_items.txt'
OVERWRITE INTO TABLE d_items;
查询数据:
-- Show every item with its price and catalog list (all three columns of d_items).
SELECT
    item_sku,
    price,
    catalogs
FROM d_items;
-- Sample output:
--   item1  100.2   ["catalogA","catalogD","catalogX"]
--   item2  200.3   ["catalogA"]
--   item3  300.4   ["catalogA","catalogX"]
--   item4  400.5   ["catalogB"]
--   item5  500.6   ["catalogB","catalogX"]
--   item6  600.7   ["catalogB"]
--   item7  700.8   ["catalogC"]
--   item8  800.9   ["catalogC","catalogD"]
--   item9  899.99  ["catalogC","catalogA"]
求每个人的每个订单的金额
-- Total amount of each order per user.
-- Step 1 (subquery ol): explode the items map into one row per order line.
-- Step 2: join each line to d_items for the unit price, then sum
--         price * quantity per (user_id, order_id), rounded to 2 decimals.
-- There is no ORDER BY, so the row order of the result is not guaranteed.
SELECT
    ol.user_id,
    ol.order_id,
    round(sum(di.price * ol.amount), 2) AS order_price
FROM (
    SELECT
        user_id,
        order_id,
        item,
        amount
    FROM f_orders
    LATERAL VIEW explode(items) item_rows AS item, amount
) ol
INNER JOIN d_items di
    ON (ol.item = di.item_sku)
GROUP BY
    ol.user_id,
    ol.order_id;
-- Sample output:
--   11   10101043497459  899.99
--   11   10103043501575  3903.69
--   11   10703007267488  1702.0
--   11   12345678901235  700.9
--   22   10101043505096  2402.9
--   22   10104043514061  300.6
--   22   12345678901237  2800.17
--   33   10103043509747  4905.6
--   33   11003002067594  901.2
--   33   12345678901234  600.8
--   35   10203019269975  600.8
--   35   12345678901236  1101.4
--   77   10203019262235  1101.4
--   77   12345678901238  2502.9
--   789  10401003346256  4604.19
--   99   10103044681799  2800.17
--   99   12345678901239  2800.17
求用户、订单条目与条目数量之间的对应关系
-- One row per (user, item, quantity): the items map exploded without the
-- order id.
SELECT
    user_id,
    item,
    amount
FROM f_orders
LATERAL VIEW explode(items) item_rows AS item, amount;
-- Sample output:
--   11   item8  2
--   11   item1  1
--   22   item6  3
--   22   item3  2
--   33   item7  7
--   11   item5  5
--   11   item1  1
--   11   item4  1
--   11   item9  1
--   22   item1  3
--   33   item4  2
--   33   item1  1
--   11   item9  1
--   35   item5  1
--   35   item1  1
--   789  item7  3
--   789  item8  2
--   789  item9  1
--   77   item5  2
--   77   item1  1
--   99   item9  3
--   99   item1  1
--   33   item5  1
--   33   item1  1
--   11   item6  1
--   11   item1  1
--   35   item5  2
--   35   item1  1
--   22   item9  3
--   22   item1  1
--   77   item8  3
--   77   item1  1
--   99   item9  3
--   99   item1  1
订单条目与类别(类别打散后)的关系
-- Flatten the catalogs array: one output row per (item, catalog) pair.
SELECT
    item_sku,
    catalog
FROM d_items
LATERAL VIEW explode(catalogs) catalog_rows AS catalog;
-- Sample output:
--   item1  catalogA
--   item1  catalogD
--   item1  catalogX
--   item2  catalogA
--   item3  catalogA
--   item3  catalogX
--   item4  catalogB
--   item5  catalogB
--   item5  catalogX
--   item6  catalogB
--   item7  catalogC
--   item8  catalogC
--   item8  catalogD
--   item9  catalogC
--   item9  catalogA
人和订单条目和订单条目数量以及与类别(类别打散后)的关系
-- Relate each purchased item (with its quantity) to every catalog that item
-- belongs to.
-- Subquery ol: order lines, from exploding f_orders.items.
-- Subquery ic: (item, catalog) pairs, from exploding d_items.catalogs.
-- An item listed in N catalogs therefore yields N rows per order line.
SELECT
    ol.user_id,
    ol.item,
    ol.amount,
    ic.catalog
FROM (
    SELECT
        user_id,
        item,
        amount
    FROM f_orders
    LATERAL VIEW explode(items) item_rows AS item, amount
) ol
INNER JOIN (
    SELECT
        item_sku,
        catalog
    FROM d_items
    LATERAL VIEW explode(catalogs) catalog_rows AS catalog
) ic
    ON (ol.item = ic.item_sku);
-- Sample output:
--   11   item8  2  catalogC
--   11   item8  2  catalogD
--   11   item1  1  catalogA
--   11   item1  1  catalogD
--   11   item1  1  catalogX
--   22   item6  3  catalogB
--   22   item3  2  catalogA
--   22   item3  2  catalogX
--   33   item7  7  catalogC
--   11   item5  5  catalogB
--   11   item5  5  catalogX
--   11   item1  1  catalogA
--   11   item1  1  catalogD
--   11   item1  1  catalogX
--   11   item4  1  catalogB
--   11   item9  1  catalogC
--   11   item9  1  catalogA
--   22   item1  3  catalogA
--   22   item1  3  catalogD
--   22   item1  3  catalogX
--   33   item4  2  catalogB
--   33   item1  1  catalogA
--   33   item1  1  catalogD
--   33   item1  1  catalogX
--   11   item9  1  catalogC
--   11   item9  1  catalogA
--   35   item5  1  catalogB
--   35   item5  1  catalogX
--   35   item1  1  catalogA
--   35   item1  1  catalogD
--   35   item1  1  catalogX
--   789  item7  3  catalogC
--   789  item8  2  catalogC
--   789  item8  2  catalogD
--   789  item9  1  catalogC
--   789  item9  1  catalogA
--   77   item5  2  catalogB
--   77   item5  2  catalogX
--   77   item1  1  catalogA
--   77   item1  1  catalogD
--   77   item1  1  catalogX
--   99   item9  3  catalogC
--   99   item9  3  catalogA
--   99   item1  1  catalogA
--   99   item1  1  catalogD
--   99   item1  1  catalogX
--   33   item5  1  catalogB
--   33   item5  1  catalogX
--   33   item1  1  catalogA
--   33   item1  1  catalogD
--   33   item1  1  catalogX
--   11   item6  1  catalogB
--   11   item1  1  catalogA
--   11   item1  1  catalogD
--   11   item1  1  catalogX
--   35   item5  2  catalogB
--   35   item5  2  catalogX
--   35   item1  1  catalogA
--   35   item1  1  catalogD
--   35   item1  1  catalogX
--   22   item9  3  catalogC
--   22   item9  3  catalogA
--   22   item1  1  catalogA
--   22   item1  1  catalogD
--   22   item1  1  catalogX
--   77   item8  3  catalogC
--   77   item8  3  catalogD
--   77   item1  1  catalogA
--   77   item1  1  catalogD
--   77   item1  1  catalogX
--   99   item9  3  catalogC
--   99   item9  3  catalogA
--   99   item1  1  catalogA
--   99   item1  1  catalogD
--   99   item1  1  catalogX
将结果写到usr_cat_weight表中
-- Materialise, per user and catalog, the total quantity purchased (weight).
-- Subquery ol: order lines, from exploding f_orders.items.
-- Subquery ic: (item, catalog) pairs, from exploding d_items.catalogs.
-- Rows are written ordered by user_id, then by weight descending.
CREATE TABLE usr_cat_weight AS
SELECT
    ol.user_id,
    ic.catalog,
    sum(ol.amount) AS weight
FROM (
    SELECT
        user_id,
        item,
        amount
    FROM f_orders
    LATERAL VIEW explode(items) item_rows AS item, amount
) ol
INNER JOIN (
    SELECT
        item_sku,
        catalog
    FROM d_items
    LATERAL VIEW explode(catalogs) catalog_rows AS catalog
) ic
    ON (ol.item = ic.item_sku)
GROUP BY
    ol.user_id,
    ic.catalog
ORDER BY
    user_id,
    weight DESC;
-- Inspect the per-user catalog weights (all three columns of usr_cat_weight).
SELECT
    user_id,
    catalog,
    weight
FROM usr_cat_weight;
-- Sample output:
--   11   catalogX  8
--   11   catalogB  7
--   11   catalogD  5
--   11   catalogA  5
--   11   catalogC  4
--   22   catalogA  9
--   22   catalogX  6
--   22   catalogD  4
--   22   catalogB  3
--   22   catalogC  3
--   33   catalogC  7
--   33   catalogX  3
--   33   catalogB  3
--   33   catalogA  2
--   33   catalogD  2
--   35   catalogX  5
--   35   catalogB  3
--   35   catalogA  2
--   35   catalogD  2
--   77   catalogD  5
--   77   catalogX  4
--   77   catalogC  3
--   77   catalogA  2
--   77   catalogB  2
--   789  catalogC  6
--   789  catalogD  2
--   789  catalogA  1
--   99   catalogA  8
--   99   catalogC  6
--   99   catalogD  2
--   99   catalogX  2
row_number: 行号
-- row_number(): consecutive numbering 1..n inside each user_id partition,
-- ordered by weight descending. Tied weights still get distinct numbers, and
-- which tied row comes first is not determined by this ORDER BY.
-- user_id is a STRING, so `< '33'` is a lexicographic comparison; on the
-- sample data it keeps users 11 and 22.
-- The literal uses ASCII single quotes; typographic quotes do not parse.
SELECT
    user_id,
    catalog,
    weight,
    row_number() OVER (PARTITION BY user_id ORDER BY weight DESC) AS row_num
FROM usr_cat_weight
WHERE user_id < '33';
-- Sample output:
--   11  catalogX  8  1
--   11  catalogB  7  2
--   11  catalogA  5  3
--   11  catalogD  5  4
--   11  catalogC  4  5
--   22  catalogA  9  1
--   22  catalogX  6  2
--   22  catalogD  4  3
--   22  catalogC  3  4
--   22  catalogB  3  5
rank: 相同的值排名相同,其后的排名会跳过并列所占用的名次(例如 1、2、3、3、5)
-- rank(): tied weights share the same rank and the following rank is skipped
-- (in the sample: 3, 3, then 5).
-- user_id is a STRING, so `< '33'` is a lexicographic comparison; on the
-- sample data it keeps users 11 and 22.
-- The literal uses ASCII single quotes; typographic quotes do not parse.
SELECT
    user_id,
    catalog,
    weight,
    rank() OVER (PARTITION BY user_id ORDER BY weight DESC) AS rnk
FROM usr_cat_weight
WHERE user_id < '33';
-- Sample output:
--   11  catalogX  8  1
--   11  catalogB  7  2
--   11  catalogA  5  3
--   11  catalogD  5  3
--   11  catalogC  4  5
--   22  catalogA  9  1
--   22  catalogX  6  2
--   22  catalogD  4  3
--   22  catalogC  3  4
--   22  catalogB  3  4
dense_rank: 相同的值排名相同,其后的排名不会跳号(例如 1、2、3、3、4)
-- dense_rank(): tied weights share the same rank and no rank is skipped
-- afterwards (in the sample: 3, 3, then 4).
-- user_id is a STRING, so `< '33'` is a lexicographic comparison; on the
-- sample data it keeps users 11 and 22.
-- The literal uses ASCII single quotes; typographic quotes do not parse.
SELECT
    user_id,
    catalog,
    weight,
    dense_rank() OVER (PARTITION BY user_id ORDER BY weight DESC) AS drnk
FROM usr_cat_weight
WHERE user_id < '33';
-- Sample output:
--   11  catalogX  8  1
--   11  catalogB  7  2
--   11  catalogA  5  3
--   11  catalogD  5  3
--   11  catalogC  4  4
--   22  catalogA  9  1
--   22  catalogX  6  2
--   22  catalogD  4  3
--   22  catalogC  3  4
--   22  catalogB  3  4
-- Materialise each user's catalogs ranked by purchased quantity:
-- row_num = 1 is the catalog with the highest weight for that user.
-- Subquery w recomputes the same (user_id, catalog, weight) aggregation that
-- usr_cat_weight holds, directly from f_orders and d_items.
-- NOTE(review): the inner ORDER BY looks redundant, since row_number() applies
-- its own ordering per partition; it is kept so behaviour stays unchanged.
-- Rows with equal weight get distinct row numbers in an order this query
-- does not pin down.
CREATE TABLE usr_cat AS
SELECT
    user_id,
    catalog,
    row_number() OVER (PARTITION BY user_id ORDER BY weight DESC) AS row_num
FROM (
    SELECT
        ol.user_id,
        ic.catalog,
        sum(ol.amount) AS weight
    FROM (
        SELECT
            user_id,
            item,
            amount
        FROM f_orders
        LATERAL VIEW explode(items) item_rows AS item, amount
    ) ol
    INNER JOIN (
        SELECT
            item_sku,
            catalog
        FROM d_items
        LATERAL VIEW explode(catalogs) catalog_rows AS catalog
    ) ic
        ON (ol.item = ic.item_sku)
    GROUP BY
        ol.user_id,
        ic.catalog
    ORDER BY
        user_id,
        weight
) w
ORDER BY
    user_id,
    row_num;
-- Inspect the ranked catalogs per user (all three columns of usr_cat).
SELECT
    user_id,
    catalog,
    row_num
FROM usr_cat;
-- Sample output:
--   11   catalogX  1
--   11   catalogB  2
--   11   catalogA  3
--   11   catalogD  4
--   11   catalogC  5
--   22   catalogA  1
--   22   catalogX  2
--   22   catalogD  3
--   22   catalogC  4
--   22   catalogB  5
--   33   catalogC  1
--   33   catalogB  2
--   33   catalogX  3
--   33   catalogD  4
--   33   catalogA  5
--   35   catalogX  1
--   35   catalogB  2
--   35   catalogA  3
--   35   catalogD  4
--   77   catalogD  1
--   77   catalogX  2
--   77   catalogC  3
--   77   catalogA  4
--   77   catalogB  5
--   789  catalogC  1
--   789  catalogD  2
--   789  catalogA  3
--   99   catalogA  1
--   99   catalogC  2
--   99   catalogD  3
--   99   catalogX  4
创建用户表:
-- User dimension table: one row per user.
-- '\073' is the octal escape for ';' (ASCII 59), so fields in the data file
-- are separated by semicolons. Presumably the escape is used because a
-- literal ';' would end the statement in the Hive CLI; verify against your
-- client.
-- String literals must use plain ASCII single quotes; typographic quotes
-- are not valid HiveQL string delimiters.
CREATE EXTERNAL TABLE d_users (
    user_id  STRING,
    gender   STRING,
    birthday STRING,
    email    STRING,
    regday   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\073';
加载数据:
-- Load the local sample file into d_users; OVERWRITE replaces any rows
-- already in the table. The path literal uses ASCII single quotes.
LOAD DATA LOCAL INPATH '/home/spark/software/data/d_users.txt'
OVERWRITE INTO TABLE d_users;
查询:
-- Show every user (all five columns of d_users).
SELECT
    user_id,
    gender,
    birthday,
    email,
    regday
FROM d_users;
-- Sample output:
--   11   m  1981-01-01  张三@gmail.com     2014-04-21
--   22   w  1982-01-01  user22@abcn.net   2014-04-22
--   33   m  1983-01-01  user33@fxlive.de  2014-04-23
--   77   w  1977-01-01  user77@fxlive.fr  2014-05-01
--   88   m  1988-01-01  user88@fxlive.eu  2014-05-02
--   99   w  1999-01-01  user99@abcn.net   2014-05-03
--   789  m  2008-01-01  admin@abcn.net    2014-05-03
Hive转换函数translate进行简单数据保护
-- Simple masking with translate(input, from, to): every character of `input`
-- found at position i of `from` is replaced by the character at position i
-- of `to`; characters not listed in `from` pass through unchanged.
--   birthday: each digit is shifted by one (0->1, 1->2, ..., 9->0).
--   email:    the 21 characters 'userfxgmail1234567890' are mapped onto
--             '1234567890userfxgmail' (u->1, s->2, ..., 1->s, ..., 0->l).
-- This is a fixed one-to-one substitution, so anyone who knows the two
-- strings can reverse it. It obscures values but is not encryption.
-- All literals use ASCII single quotes; typographic quotes do not parse.
SELECT
    user_id,
    birthday,
    translate(birthday, '0123456789', '1234567890'),
    email,
    translate(email, 'userfxgmail1234567890', '1234567890userfxgmail')
FROM d_users;
-- Sample output:
--   11   1981-01-01  2092-12-12  user11@gmail.com  1234ss@7890u.co8
--   22   1982-01-01  2093-12-12  user22@abcn.net   1234ee@9bcn.n3t
--   33   1983-01-01  2094-12-12  user33@fxlive.de  1234rr@56u0v3.d3
--   77   1977-01-01  2088-12-12  user77@fxlive.fr  1234mm@56u0v3.54
--   88   1988-01-01  2099-12-12  user88@fxlive.eu  1234aa@56u0v3.31
--   99   1999-01-01  2000-12-12  user99@abcn.net   1234ii@9bcn.n3t
--   789  2008-01-01  3119-12-12  admin@abcn.net    9d80n@9bcn.n3t
-- NOTE(review): this sample shows user11@gmail.com for user 11, while the
-- d_users listing earlier in this document shows 张三@gmail.com. The two
-- outputs were presumably captured from different versions of the data file.
时间: 2024-10-10 10:21:44