hive lab competition

create table user(
userid STRING,
sex STRING,
age INT,
career INT,
code STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';

LOAD DATA LOCAL INPATH '/home/hadoop03/rating/users.dat' overwrite into table user;

create table rating(
userid STRING,
movieid STRING,
rate INT,
tmpe BIGINT -- epoch seconds; from_unixtime() in the queries below expects a BIGINT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';

LOAD DATA LOCAL INPATH '/home/hadoop03/rating/ratings.dat' overwrite into table rating;

create table movie(
movieid STRING,
moviename STRING,
movietype ARRAY<STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
COLLECTION ITEMS TERMINATED BY '|';

LOAD DATA LOCAL INPATH '/home/hadoop03/rating/movies.dat' overwrite into table movie;
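
A quick sanity check that the three loads worked (the counts should roughly match the README: 6,040 users, 1,000,209 ratings, ~3,900 movies):

select count(*) from user;
select count(*) from rating;
select count(*) from movie;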

-- Demand 1: for each age group, the movie type with the most 5-star ratings
select aa.age, aa.mymovietype, aa.c
from (
  select jun.age, jun.mymovietype, jun.c,
         row_number() over (distribute by age sort by c desc) rownum
  from (
    select xiong.age, mymovietype, count(mymovietype) as c
    from (
      select bear.age, mymovietype
      from (
        select usr.age, mov.movietype
        from rating rat
        left join user usr on rat.userid = usr.userid
        left join movie mov on rat.movieid = mov.movieid
        where rat.rate = 5
      ) bear
      lateral view explode(bear.movietype) movietype as mymovietype
    ) xiong
    group by xiong.age, xiong.mymovietype
  ) jun
) aa
where aa.rownum = 1;

-- day-of-week trick: days since a fixed anchor date, modulo 7
-- pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2001-01-01'), 7)
select pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2014-09-21'), 7) from rating limit 3;
select weekofyear(from_unixtime(tmpe,'yyyy-MM-dd')), day(from_unixtime(tmpe,'yyyy-MM-dd')) from rating limit 3;
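
Since 2014-09-21 is a Sunday, pmod(datediff(d, '2014-09-21'), 7) returns 0 for Sundays, 1 for Mondays, up to 6 for Saturdays; the CASE in the next query then remaps 0 to 7 so the week runs Monday = 1 through Sunday = 7. A minimal check (newer Hive versions allow SELECT without a FROM clause):

select pmod(datediff('2014-09-22', '2014-09-21'), 7); -- 2014-09-22 is a Monday, so this returns 1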

-- Demand 2: for each day of the week and each movie type, the most 5-star-rated movie name
select case xx.day when 0 then 7 else xx.day end as day, xx.mymovietype, xx.moviename
from (
  select jun.day, jun.mymovietype, jun.moviename, jun.cou,
         row_number() over (distribute by jun.day, jun.mymovietype sort by jun.cou desc) rownum
  from (
    select bear.day, bear.mymovietype, bear.moviename, count(bear.tmpe) as cou
    from (
      select *
      from (
        select pmod(datediff(from_unixtime(tmpe,'yyyy-MM-dd'), '2014-09-21'), 7) as day,
               rat.tmpe, mov.movietype, mov.moviename
        from rating rat
        join movie mov on rat.movieid = mov.movieid
        where rat.rate = 5
      ) xiong
      lateral view explode(xiong.movietype) movietype as mymovietype
    ) bear
    group by bear.day, bear.mymovietype, bear.moviename
  ) jun
) xx
where xx.rownum = 1;

-- quick check: explode each movie's type array into one row per type
select movieid, moviename, mymovietype from movie lateral view explode(movietype) movietype as mymovietype limit 9;

-- distinct movie types that appear among 5-star ratings
select distinct(mymovietype)
from (
  select day(from_unixtime(tmpe,'yyyy-MM-dd')) as day, rat.tmpe, mov.movietype, mov.moviename
  from rating rat
  join movie mov on rat.movieid = mov.movieid
  where rat.rate = 5
) xiong
lateral view explode(xiong.movietype) movietype as mymovietype;

===============================================================================================================================================================================

create table wifi(
phone STRING, -- first raw field; in the sample data this is actually the epoch timestamp, unused below
year STRING,
month STRING,
day STRING,
hour STRING,
minute STRING,
second STRING,
timezone STRING,
host STRING,
facility STRING,
service STRING,
mac STRING,
protocol STRING,
message STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

load data local inpath '/home/hadoop03/rating/wifi_data.txt' overwrite into table wifi;

create  table wifi2(
tt BIGINT,
mac STRING,
message STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

insert overwrite table wifi2
select unix_timestamp(concat(year,'-',month,'-',day,' ',hour,':',minute,':',second)),
       mac, message
from wifi;
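
A quick spot check of the conversion (assuming the load above succeeded):

select tt, from_unixtime(tt), mac from wifi2 limit 3;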

-- 1. How many people visited the store (unique visitors)?
select count(distinct mac) from wifi;

-- 2. How many visits did we have in total? Pair each 'association OK'
--    event with the nearest later 'deauthenticated' event for the same
--    MAC, then count the resulting sessions.
FROM (
  SELECT A.mac, A.tt, MIN(B.tt - A.tt) AS tt
  FROM wifi2 A, wifi2 B
  WHERE A.mac = B.mac
    AND A.tt <= B.tt
    AND A.message LIKE '%association OK%'
    AND B.message LIKE '%deauthenticated%'
  GROUP BY A.mac, A.tt
) e
SELECT count(e.tt);

-- 3. What is the average visit duration, in seconds? Same session
--    pairing as above; e.g. in the raw sample below, 1358757010 - 1358756939
--    gives a 71-second visit.
FROM (
  SELECT A.mac, A.tt, MIN(B.tt - A.tt) AS tt
  FROM wifi2 A, wifi2 B
  WHERE A.mac = B.mac
    AND A.tt <= B.tt
    AND A.message LIKE '%association OK%'
    AND B.message LIKE '%deauthenticated%'
  GROUP BY A.mac, A.tt
) e
SELECT avg(e.tt);

-- 4. How many people are new vs. returning? We were not sure; one possible approach is sketched below.
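
A minimal sketch for question 4, assuming each 'association OK' event marks one visit: a MAC address seen in exactly one visit counts as a new visitor, one seen in more than one visit counts as returning.

SELECT CASE WHEN visits > 1 THEN 'returning' ELSE 'new' END AS visitor_type,
       COUNT(*) AS people
FROM (
  SELECT mac, COUNT(*) AS visits
  FROM wifi2
  WHERE message LIKE '%association OK%'
  GROUP BY mac
) v
GROUP BY CASE WHEN visits > 1 THEN 'returning' ELSE 'new' END;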

The competition includes two parts; you have to work with your group members to finish both tests.

Hive 1:

We have a DVD shop providing a DVD rental service. Based on different age groups and tastes, the DVD shop needs to make recommendations to customers according to the movie ratings below.

Here is the rating file, containing 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 users.

For the file format, please check the README included in the zip package.

Here are the demands:

1. For each age group, find the movie type with the highest rating.

2. For each day of the week and each movie type, find the most highly recommended movie name.

Hive 2:

Background:

The retail customer wanted us to do some research, driven by an idea: it must be possible to bring the concepts of tracking users in the online world to retail stores. One of the most important key performance indicators is revenue per square metre, and we thought about bringing in some new metrics. From a wider perspective, data is produced by various sensors. With a real store in mind, we figured out possible sensors stores could use: customer frequency counters at the doors, the cashier system, free WiFi access points, video capturing, temperature, background music, smells, and many more. While many of those sensors require additional hardware and software, for a few sensors solutions already exist, e.g. video capturing with face or even eye recognition. We talked about our ideas with executives and consultants from the retail industry, and they confirmed the idea is interesting to pursue.

Solution:

We thought the most interesting sensor data (that doesn't require additional hardware or software) could come from the WiFi access points, especially given that many visitors carry WiFi-enabled mobile phones. With the access points' log files we should be able to answer at least the following questions for a particular store:

  • How many people visited the store (unique visits)?
  • How many visits did we have in total?
  • What is the average visit duration?
  • How many people are new vs. returning?

Here is a sample of the raw data:

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-AUTHENTICATE.indication(98:0c:82:dc:8b:15, OPEN_SYSTEM)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-DELETEKEYS.request(98:0c:82:dc:8b:15)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,authenticated

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,association OK (aid 2)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,associated (aid 2)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-ASSOCIATE.indication(98:0c:82:dc:8b:15)

1358756939,2013,1,21,9,28,59,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,MLME,MLME-DELETEKEYS.request(98:0c:82:dc:8b:15)

1358757010,2013,1,21,9,30,10,+01:00,buffalo,hostapd,wlan0,98:0c:82:dc:8b:15,IEEE 802.11,deauthenticated

The columns are described as follows:

iso_8601 year month day hour minute second timezone host facility_level service_level mac_address protocol message

We are interested in the "association OK" and "deauthenticated" messages only. The messages from the router are not standardized (unlike protocols such as TCP). We found that those two status messages are the closest to our understanding of a "login"/"logout" on the router.

Now we have the data we need to answer the following questions:

  • How many people visited the store (unique visitors)? Note: unlike the traditional customer frequency counter at the doors, the log files give us MAC addresses, which are unique per mobile phone. Provided people do not change their mobile phones, we can recognize unique visitors and not just visits.
  • How many visits did we have?
  • What is the average visit duration?
  • How many people are new vs. returning?

Conclusion:
