hive lab competition

create table user(
userid STRING,
sex STRING,
age INT,
career INT,
code STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';

LOAD DATA LOCAL INPATH '/home/hadoop03/rating/users.dat' overwrite into table user;

create table rating(
userid STRING,
movieid STRING,
rate INT,
tmpe BIGINT -- epoch seconds; from_unixtime() in the queries below expects a BIGINT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';

LOAD DATA LOCAL INPATH '/home/hadoop03/rating/ratings.dat' overwrite into table rating;

create table movie(
movieid STRING,
moviename STRING,
movietype ARRAY<STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
COLLECTION ITEMS TERMINATED BY '|';

LOAD DATA LOCAL INPATH '/home/hadoop03/rating/movies.dat' overwrite into table movie;
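
A quick sanity check that the three loads worked (the counts should roughly match the README: 6,040 users, 1,000,209 ratings, ~3,900 movies):

select count(*) from user;
select count(*) from rating;
select count(*) from movie;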

-- Demand 1: for each age group, the movie type with the most 5-star ratings
select aa.age, aa.mymovietype, aa.c
from (
  select jun.age, jun.mymovietype, jun.c,
         row_number() over (distribute by age sort by c desc) rownum
  from (
    select xiong.age, mymovietype, count(mymovietype) as c
    from (
      select bear.age, mymovietype
      from (
        select usr.age, mov.movietype
        from rating rat
        left join user usr on rat.userid = usr.userid
        left join movie mov on rat.movieid = mov.movieid
        where rat.rate = 5
      ) bear
      lateral view explode(bear.movietype) movietype as mymovietype
    ) xiong
    group by xiong.age, xiong.mymovietype
  ) jun
) aa
where aa.rownum = 1;

-- day-of-week trick: days since a fixed anchor date, modulo 7
-- pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2001-01-01'), 7)
select pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2014-09-21'), 7) from rating limit 3;
select weekofyear(from_unixtime(tmpe,'yyyy-MM-dd')), day(from_unixtime(tmpe,'yyyy-MM-dd')) from rating limit 3;
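
Since 2014-09-21 is a Sunday, pmod(datediff(d, '2014-09-21'), 7) returns 0 for Sundays, 1 for Mondays, up to 6 for Saturdays; the CASE in the next query then remaps 0 to 7 so the week runs Monday = 1 through Sunday = 7. A minimal check (newer Hive versions allow SELECT without a FROM clause):

select pmod(datediff('2014-09-22', '2014-09-21'), 7); -- 2014-09-22 is a Monday, so this returns 1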

-- Demand 2: for each day of the week and each movie type, the most 5-star-rated movie name
select case xx.day when 0 then 7 else xx.day end as day, xx.mymovietype, xx.moviename
from (
  select jun.day, jun.mymovietype, jun.moviename, jun.cou,
         row_number() over (distribute by jun.day, jun.mymovietype sort by jun.cou desc) rownum
  from (
    select bear.day, bear.mymovietype, bear.moviename, count(bear.tmpe) as cou
    from (
      select *
      from (
        select pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2014-09-21'), 7) as day,
               rat.tmpe, mov.movietype, mov.moviename
        from rating rat
        join movie mov on rat.movieid = mov.movieid
        where rat.rate = 5
      ) xiong
      lateral view explode(xiong.movietype) movietype as mymovietype
    ) bear
    group by bear.day, bear.mymovietype, bear.moviename
  ) jun
) xx
where xx.rownum = 1;

-- quick check: explode each movie's type array into one row per type
select movieid, moviename, mymovietype from movie lateral view explode(movietype) movietype as mymovietype limit 9;

-- distinct movie types that appear among 5-star ratings
select distinct(mymovietype)
from (
  select day(from_unixtime(tmpe,'yyyy-MM-dd')) as day, rat.tmpe, mov.movietype, mov.moviename
  from rating rat
  join movie mov on rat.movieid = mov.movieid
  where rat.rate = 5
) xiong
lateral view explode(xiong.movietype) movietype as mymovietype;

===============================================================================================================================================================================

create table wifi(
phone STRING, -- first raw field; in the sample data this is actually the epoch timestamp, unused below
year STRING,
month STRING,
day STRING,
hour STRING,
minute STRING,
second STRING,
timezone STRING,
host STRING,
facility STRING,
service STRING,
mac STRING,
protocol STRING,
message STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

load data local inpath '/home/hadoop03/rating/wifi_data.txt' overwrite into table wifi;

create  table wifi2(
tt BIGINT,
mac STRING,
message STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

insert overwrite table wifi2
select unix_timestamp(concat(year,'-',month,'-',day,' ',hour,':',minute,':',second)),
       mac, message
from wifi;
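
A quick spot check of the conversion (assuming the load above succeeded):

select tt, from_unixtime(tt), mac from wifi2 limit 3;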

-- 1. How many people visited the store (unique visitors)?
select count(distinct mac) from wifi;

-- 2. How many visits did we have in total? Pair each 'association OK'
--    event with the nearest later 'deauthenticated' event for the same
--    MAC, then count the resulting sessions.
FROM (
  SELECT A.mac, A.tt, MIN(B.tt - A.tt) AS tt
  FROM wifi2 A, wifi2 B
  WHERE A.mac = B.mac
    AND A.tt <= B.tt
    AND A.message LIKE '%association OK%'
    AND B.message LIKE '%deauthenticated%'
  GROUP BY A.mac, A.tt
) e
SELECT count(e.tt);

-- 3. What is the average visit duration, in seconds? Same session
--    pairing as above; e.g. in the raw sample below, 1358757010 - 1358756939
--    gives a 71-second visit.
FROM (
  SELECT A.mac, A.tt, MIN(B.tt - A.tt) AS tt
  FROM wifi2 A, wifi2 B
  WHERE A.mac = B.mac
    AND A.tt <= B.tt
    AND A.message LIKE '%association OK%'
    AND B.message LIKE '%deauthenticated%'
  GROUP BY A.mac, A.tt
) e
SELECT avg(e.tt);

-- 4. How many people are new vs. returning? We were not sure; one possible approach is sketched below.
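
A minimal sketch for question 4, assuming each 'association OK' event marks one visit: a MAC address seen in exactly one visit counts as a new visitor, one seen in more than one visit counts as returning.

SELECT CASE WHEN visits > 1 THEN 'returning' ELSE 'new' END AS visitor_type,
       COUNT(*) AS people
FROM (
  SELECT mac, COUNT(*) AS visits
  FROM wifi2
  WHERE message LIKE '%association OK%'
  GROUP BY mac
) v
GROUP BY CASE WHEN visits > 1 THEN 'returning' ELSE 'new' END;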

The competition includes two parts; you have to work with your group members to finish both tests.

Hive 1:

We have a DVD shop providing a DVD rental service. Based on different age groups and tastes, the DVD shop needs to make recommendations to customers according to the movie ratings below.

Here is the rating file, containing 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 users.

For the file format, please check the README included in the zip package.

Here are the demands:

1. For each age group, find the movie type with the highest rating.

2. For each day of the week and each movie type, find the most highly recommended movie name.

Hive 2:

Background:

The retail customer wanted us to do some research, driven by an idea: it must be possible to bring the concepts of tracking users in the online world to retail stores. One of the most important key performance indicators is revenue per square metre, and we thought about bringing in some new metrics. From a wider perspective, data is produced by various sensors. With a real store in mind, we figured out possible sensors stores could use: customer frequency counters at the doors, the cashier system, free WiFi access points, video capturing, temperature, background music, smells, and many more. While many of those sensors require additional hardware and software, for a few sensors solutions already exist, e.g. video capturing with face or even eye recognition. We talked about our ideas with executives and consultants from the retail industry, and they confirmed the idea is interesting to pursue.

Solution:

We thought the most interesting sensor data (that doesn't require additional hardware or software) could come from the WiFi access points, especially given that many visitors carry WiFi-enabled mobile phones. With the access points' log files we should be able to answer at least the following questions for a particular store:

  • How many people visited the store (unique visits)?
  • How many visits did we have in total?
  • What is the average visit duration?
  • How many people are new vs. returning?

Here is a sample of the raw data:

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-AUTHENTICATE.indication(98:0c:82:dc:8b:15, OPEN_SYSTEM)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-DELETEKEYS.request(98:0c:82:dc:8b:15)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,authenticated

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,association OK (aid 2)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,associated (aid 2)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-ASSOCIATE.indication(98:0c:82:dc:8b:15)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-DELETEKEYS.request(98:0c:82:dc:8b:15)

1358757010,2013,1,21,9,30,10,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,deauthenticated

The columns are described as follows:

iso_8601 year month day hour minute second timezone host facility_level service_level mac_address protocol message

We are interested in the "association OK" and "deauthenticated" messages only. The messages from the router are not standardized (unlike protocols such as TCP). We found that those two status messages are the closest to our understanding of a "login"/"logout" on the router.

Now we have the data we need to answer the following questions:

  • How many people visited the store (unique visitors)? Note: unlike the traditional customer frequency counter at the doors, the log files give us MAC addresses, which are unique per mobile phone. Provided people do not change their mobile phones, we can recognize unique visitors and not just visits.
  • How many visits did we have?
  • What is the average visit duration?
  • How many people are new vs. returning?

Conclusion:
