#Variables
#Referencing a variable: given_dayno="'20190601'" ... dayno=${given_dayno}
#Exit with the job status: exit ${v_job_stat}
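A minimal shell sketch tying the two notes above together (the query and the script skeleton are hypothetical; only given_dayno, dayno, and v_job_stat come from the notes, and dw.f_agt_user_tag is reused from the MAPJOIN note below):
#!/bin/bash
# Splice a dayno into an HQL string, run it, and exit with the job status.
given_dayno="'20190601'"   # quoted so it can be spliced into HQL as a string literal
dayno=${given_dayno}
hql="select count(*) from dw.f_agt_user_tag where dayno = ${dayno};"
hive -e "${hql}"
v_job_stat=$?              # capture hive's exit code
exit ${v_job_stat}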
#Parameters
#Too many maps, job won't run (number of mappers: 230691; number of reducers: 1099): set mapred.max.split.size=1000000000;
#Parallel execution (for queries with many UNION ALLs): set hive.exec.parallel=true; set hive.exec.parallel.thread.number=8;
#A Hive tuning case worth sharing: a job today had little data (a few columns) but ~30 million rows, and one join stage ran for over an hour without finishing.
--Cause: too few maps; the stage was allocated only 11 maps.
--Fix: increase the number of maps by shrinking the split size:
set mapreduce.input.fileinputformat.split.maxsize=20000000;
set mapreduce.input.fileinputformat.split.minsize.per.node=10000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=10000000;
After tuning, the script finished in about 20 minutes.
#Commonly used settings:
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.merge.smallfiles.avgsize=536870912;
set mapred.max.split.size=134217728;
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=6144;
set hive.auto.convert.join=true;
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set hive.exec.reducers.bytes.per.reducer=134217728;
#HQL
#count() treats 0 as a record too (count counts non-NULL values, not truth values).
Wrong: count(if(act_code in ('20090031','20180031','20090080'),1,0))
Right: count(if(act_code in ('20090031','20180031','20090080'),1,null))
Or:    sum(if(act_code in ('20090031','20180031','20090080'),1,0))
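A throwaway demo of the difference (the inline rows are made up; explode/array just fabricates three rows, only one of which matches):
select count(if(act_code in ('20090031'),1,0))    as counts_everything, -- 3: 0 is still a non-NULL value
       count(if(act_code in ('20090031'),1,null)) as counts_matches,    -- 1: NULLs are skipped by count
       sum(if(act_code in ('20090031'),1,0))      as sums_matches       -- 1
from (select explode(array('20090031','x','y')) as act_code) t;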
#Add a column: alter table table_name add columns (col_name bigint);
#Renaming and other column DDL:
ALTER TABLE name RENAME TO new_name
ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
ALTER TABLE name DROP [COLUMN] column_name
ALTER TABLE name CHANGE column_name new_name new_type
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])
#Drop a partition by its partition key: ALTER TABLE my_partition_test_table DROP IF EXISTS PARTITION (p_loctype='MHA');
#substring: note that positions start at 1, not 0.
#Take 2 characters starting from the 4th character: select substring('example.com', 4, 2); -- returns 'mp'
#Take the last two characters with substring: select substring('abc', length('abc')-1, 2) -- returns 'bc'
#Create a table:
CREATE TABLE tableName (
col1 string,
col2 bigint
)
partitioned by (dayno string)
row format delimited fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
stored as orcfile;
#A real MAPJOIN tuning case I ran into: before tuning the job ran 5 hours, with MAPJOIN about 1 hour. When joining a large table against small tables, MAPJOIN noticeably improves running time. Excerpt of the tuned code:
select /*+ MAPJOIN(t2,t3) */
t1.imei,t1.gender,t1.age,t1.city_grade_name,t1.model,t3.series
from dw.f_agt_user_tag t1
inner join
(
select max(dayno) as dayno from dw.f_agt_user_tag
) t2
on t1.dayno = t2.dayno
inner join
(
select model_level_1 as model,series from dw.f_dim_model_info group by model_level_1,series
) t3
on t1.model = t3.model
where t1.imei is not null
group by t1.imei,t1.gender,t1.age,t1.city_grade_name,t1.model,t3.series
Further reading:
https://blog.csdn.net/kwu_ganymede/article/details/51365002
https://blog.csdn.net/liuxiao723846/article/details/78739097
https://help.aliyun.com/knowledge_detail/40270.html
#PARTITION BY ... ORDER BY for running totals
SELECT cookieid, createtime, pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1,                                                   -- default: from the partition start to the current row
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2,  -- partition start to current row, same as pv1
SUM(pv) OVER(PARTITION BY cookieid) AS pv3,                                                                       -- all rows in the partition
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,          -- current row + 3 preceding
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,          -- current row + 3 preceding + 1 following
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6   -- current row + all following rows
FROM lxw1234;

cookieid createtime   pv  pv1  pv2  pv3  pv4  pv5  pv6
-------------------------------------------------------
cookie1  2015-04-10   1   1    1    26   1    6    26
cookie1  2015-04-11   5   6    6    26   6    13   25
cookie1  2015-04-12   7   13   13   26   13   16   20
cookie1  2015-04-13   3   16   16   26   16   18   13
cookie1  2015-04-14   2   18   18   26   17   21   10
cookie1  2015-04-15   4   22   22   26   16   20   8
cookie1  2015-04-16   4   26   26   26   13   13   4

pv1: running pv total within the partition, e.g. pv1 on the 11th = pv on the 10th + pv on the 11th; the 12th = 10th + 11th + 12th
pv2: same as pv1
pv3: total pv of all rows in the partition (cookie1)
pv4: current row + 3 preceding, e.g. 11th = 10th + 11th; 12th = 10th + 11th + 12th; 13th = 10th + 11th + 12th + 13th; 14th = 11th + 12th + 13th + 14th
pv5: current row + 3 preceding + 1 following, e.g. 14th = 11th + 12th + 13th + 14th + 15th = 5+7+3+2+4 = 21
pv6: current row + all following rows, e.g. 13th = 13th + 14th + 15th + 16th = 3+2+4+4 = 13; 14th = 14th + 15th + 16th = 2+4+4 = 10
#Index not being used: check whether an index exists; if not, create one; if it does, check whether the query defeats it. E.g. with dim declared int, the query can be written dim=1 or dim='1', but dim='1' defeats the index (the implicit type conversion disables it).
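A quick sketch of the pitfall (the table name is hypothetical; dim is an int column as in the note):
select * from t_hypothetical where dim = 1;   -- type matches, an index on dim can be used
select * from t_hypothetical where dim = '1'; -- implicit cast, the index is bypassed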
#rlike (one column matches a pattern held in another column, e.g. a title containing a banned word): a.title rlike b.word
#Cartesian product: cross join (pair every row against a lookup table, e.g. for banned-word matching; see the sketch below)
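A sketch combining the two notes above (titles and dirty_words are made-up table names):
-- flag every title that contains any banned word
-- note: strict mode may block cartesian products and require nonstrict settings
select a.title, b.word
from titles a
cross join dirty_words b
where a.title rlike b.word;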
#Window/analytic functions: OVER(PARTITION BY) usage
Analytic functions compute aggregates over groups; unlike aggregate functions, they return multiple rows per group rather than one row per group.
1. What follows over:
over(order by salary) accumulates in salary order; order by on its own already opens a window
over(partition by deptno) partitions by department
2. Window frame:
over(order by salary range between 5 preceding and 5 following): the frame covers rows whose value lies within the current row's value minus 5 to plus 5.
3. Functions commonly combined with over:
a) rank() over() and dense_rank() over() can return ties for first place; row_number() over() returns only one row per rank
b) rank() skips ranks: with two second places the next rank is fourth; dense_rank() is consecutive: with two second places the next rank is still third
select * from
(
select name, class, s, rank() over(partition by class order by s desc) mm from t2
) t
where mm = 1;
c) sum() over()
d) first_value() over() and last_value() over()
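The notes stop at naming first_value/last_value; a small sketch of what they return, reusing the lxw1234 table from the running-total note:
select cookieid, createtime, pv,
first_value(createtime) over(partition by cookieid order by createtime) as first_day, -- earliest day in the partition
last_value(createtime) over(partition by cookieid order by createtime
rows between unbounded preceding and unbounded following) as last_day -- whole-partition frame; with the default frame last_value only sees up to the current row
from lxw1234;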
#concat_ws: join values with |
concat_ws('|',collect_set(host)) as host
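In context, a hedged sketch (access_log and uid are made-up names):
-- one row per uid, with its distinct hosts joined by |
select uid, concat_ws('|', collect_set(host)) as host
from access_log
group by uid;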
#Where a filter goes matters:
select * from t1 left outer join t2 on t1.id = t2.id and t2.cnt >= 40
-- t2 is filtered to cnt >= 40 first, then joined; unmatched t1 rows are kept with NULLs
select * from t1 left outer join t2 on t1.id = t2.id where t2.cnt >= 40
-- filtered after the join; rows where t2.cnt is NULL are dropped too, so the left join degenerates into an inner join
#Multi-level aggregation with CUBE
SELECT month, day, COUNT(DISTINCT cookieid) AS uv, GROUPING__ID
FROM cookie5
GROUP BY month, day
WITH CUBE
ORDER BY GROUPING__ID;
is equivalent to
SELECT NULL,NULL,COUNT(DISTINCT cookieid) AS uv,0 AS GROUPING__ID FROM cookie5
UNION ALL
SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM cookie5 GROUP BY month
UNION ALL
SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM cookie5 GROUP BY day
UNION ALL
SELECT month,day,COUNT(DISTINCT cookieid) AS uv,3 AS GROUPING__ID FROM cookie5 GROUP BY month,day
Reference: https://www.cnblogs.com/qingyunzong/p/8798987.html
#UNION ALL merges two or more tables. Every union subquery must have the same columns, and the corresponding column types must match. UNION ALL keeps duplicate rows; if duplicates should be removed, use UNION instead.
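A minimal illustration (t1/t2 and the src tag are placeholders):
-- same column list and types in every branch; duplicates are kept
select id, 'a' as src from t1
union all
select id, 'b' as src from t2;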
#cast type conversion: cast(dayno as string). BINARY can only be cast to STRING; if the value is known to be numeric, it can be converted further by nesting cast() calls.
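The nesting the note refers to, sketched (b is a hypothetical BINARY column on a hypothetical table t):
-- BINARY -> STRING first, then STRING -> DOUBLE
select cast(cast(b as string) as double) from t;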
#ORDER BY and SORT BY
ORDER BY: a global sort over the whole result set.
SORT BY: sorts only within each reducer, i.e. a local sort.
- ASC (default, ascending); DESC (descending)
DISTRIBUTE BY with SORT BY: DISTRIBUTE BY controls which reducer each row goes to; combined with SORT BY it yields sorted output within each reducer.
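A sketch of the combination, reusing the lxw1234 table from the window-function note:
-- rows for the same cookieid land on the same reducer, sorted by time within it
select cookieid, createtime, pv
from lxw1234
distribute by cookieid
sort by cookieid, createtime;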
#JOIN
#INNER JOIN: only rows from both tables that match the join condition are kept.
#OUTER JOIN
#LEFT OUTER JOIN: every record from the left table that satisfies the WHERE clause is returned; when the right table has no record matching the ON condition, the selected right-table columns come back NULL.
#RIGHT OUTER JOIN: returns all right-table records that satisfy the WHERE clause; unmatched left-table columns are filled with NULL.
#FULL OUTER JOIN: returns all records from both tables that satisfy the WHERE clause; wherever either table has no matching value for a selected column, NULL is used instead.
#LIMIT restricts the number of rows returned
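For completeness, the usual form (t is any table):
select * from t limit 10; -- return at most 10 rows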
#CASE ... WHEN ... THEN: like an if chain, used to bucket a single column in a query
case when substring(imei,length(imei)-1,2) between '00' and '09' then '00-09'
when substring(imei,length(imei)-1,2) between '10' and '19' then '10-19'
when substring(imei,length(imei)-1,2) between '20' and '29' then '20-29'
when substring(imei,length(imei)-1,2) between '30' and '39' then '30-39'
when substring(imei,length(imei)-1,2) between '40' and '49' then '40-49'
when substring(imei,length(imei)-1,2) between '50' and '59' then '50-59'
when substring(imei,length(imei)-1,2) between '60' and '69' then '60-69'
when substring(imei,length(imei)-1,2) between '70' and '79' then '70-79'
when substring(imei,length(imei)-1,2) between '80' and '89' then '80-89'
when substring(imei,length(imei)-1,2) between '90' and '99' then '90-99'
end as imei
,sum(1)
#INSERT OVERWRITE/INTO: insert overwrite table t partition (dt=${YYYYMMDD})
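The fragment above lacks its SELECT; a completed sketch (src and the column names are placeholders; ${YYYYMMDD} is substituted by the calling shell script, as in the variable notes at the top):
insert overwrite table t partition (dt=${YYYYMMDD})
select col1, col2
from src;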
#Load data: LOAD DATA LOCAL INPATH <path>
hql="
load data inpath '/tmp/erciyuan' overwrite into table dc_tmp.temp_20190612_magazine_download
;"
Original post: https://www.cnblogs.com/syj-love-dy/p/11130194.html