-- Pig 内置了 TOP 函数,但我这里不知道为什么用不了,有时间再去看看 Pig 的源代码。
-- Job configuration, UDF registration and script parameters for the top-k job.
-- String constants in Pig must be wrapped in plain ASCII single quotes.
SET job.name 'top_k';
SET job.priority HIGH;

-- The loader comes from the project's UDF jar. The stock alternative is
-- piggybank's org.apache.pig.piggybank.storage.SequenceFileLoader, which
-- would need piggybank.jar registered instead of this jar.
REGISTER wizad-etl-udf-0.1.jar;
DEFINE SequenceFileLoader com.vpon.wizad.etl.pig.SequenceFileCSVLoader();

-- The cleanedLog path embeds the date parameter, which has no default in this
-- script and therefore must be supplied by the caller
-- (for example: pig -param date=2014-07-30 ...).
%default cleanedLog /user/wizad/data/wizad/cleaned/$date/*/part*
%default output_path /user/wizad/tmp/hour_count
-- Load the cleaned log records from the path given by the cleanedLog parameter,
-- using the SequenceFileLoader alias defined earlier in the script.
-- Every column is declared as chararray; no numeric casting happens at load time.
origin_cleaned_data = LOAD '$cleanedLog' USING SequenceFileLoader
    AS (
        ad_network_id:chararray,
        wizad_ad_id:chararray,
        guid:chararray,
        id:chararray,
        create_time:chararray,
        action_time:chararray,
        log_type:chararray,
        ad_id:chararray,
        positioning_method:chararray,
        location_accuracy:chararray,
        lat:chararray,
        lon:chararray,
        cell_id:chararray,
        lac:chararray,
        mcc:chararray,
        mnc:chararray,
        ip:chararray,
        connection_type:chararray,
        imei:chararray,
        android_id:chararray,
        android_advertising_id:chararray,
        udid:chararray,
        openudid:chararray,
        idfa:chararray,
        mac_address:chararray,
        uid:chararray,
        density:chararray,
        screen_height:chararray,
        screen_width:chararray,
        user_agent:chararray,
        app_id:chararray,
        app_category_id:chararray,
        device_model_id:chararray,
        carrier_id:chararray,
        os_id:chararray,
        device_type:chararray,
        os_version:chararray,
        country_region_id:chararray,
        province_region_id:chararray,
        city_region_id:chararray,
        ip_lat:chararray,
        ip_lon:chararray,
        quadkey:chararray
    );
-- Keep only the records whose log_type equals the string '1'
-- (presumably ad impressions, judging by the alias name; confirm against the log spec).
show_log = FILTER origin_cleaned_data BY log_type == '1';

-- Project the columns needed downstream and derive an hour field from create_time.
-- SUBSTRING uses a 0-based start index and an exclusive stop index, so (11, 13)
-- returns the two characters at positions 11 and 12.
-- NOTE(review): this assumes create_time is formatted like 'yyyy-MM-dd HH:mm:ss'; confirm.
-- Resulting fields: (ad_network_id, wizad_ad_id, guid, app_category_id, log_type, hour)
original_hour = FOREACH show_log GENERATE
    ad_network_id,
    wizad_ad_id,
    guid,
    app_category_id,
    log_type,
    SUBSTRING(create_time, 11, 13) AS hour;
-- Bucket the projected records by (hour, app_category_id).
hour_group = GROUP original_hour BY (hour, app_category_id);

-- For every bucket, emit one row per distinct ad_network_id found in it.
-- pv is the record count of the whole (hour, app_category_id) bucket, so the
-- same pv value is repeated on each of the bucket's output rows.
hour_count = FOREACH hour_group {
    -- To count unique guids instead, project original_hour.guid, apply
    -- DISTINCT, and COUNT the resulting bag.
    ad_network_ids = original_hour.ad_network_id;
    uniq_ad_network_ids = DISTINCT ad_network_ids;
    -- FLATTEN turns the bag of distinct ids into separate records: if
    -- uniq_ad_network_ids is {3,5}, the bucket yields two rows, (xx,3) and (xx,5).
    GENERATE
        FLATTEN(group),
        COUNT_STAR(original_hour) AS pv,
        FLATTEN(uniq_ad_network_ids);
}

DESCRIBE hour_count;
-- Schema reported by DESCRIBE:
-- hour_count: {group::hour: chararray,group::app_category_id: chararray,pv: long,uniq_ad_network_ids::ad_network_id: chararray}
-- Regroup the per-bucket counts by (hour, ad_network_id) so the top rows can be
-- picked inside each group.
group_hour_count = GROUP hour_count BY (hour, ad_network_id);

-- Top-k via ORDER + LIMIT inside a nested FOREACH: sort each group's bag by pv
-- descending, then keep the first 2 tuples.
-- NOTE(review): the built-in TOP expects (topN, column index, bag). The earlier
-- attempt passed hour_count.pv as the second argument, which likely explains
-- why it failed; with pv at position 2 of hour_count, TOP(2, 2, hour_count)
-- would be the call to try.
-- NOTE(review): nothing in the visible script STOREs or DUMPs top_2_data.
top_2_data = FOREACH group_hour_count {
    order_hour_count = ORDER hour_count BY pv DESC;
    top2_hour_count = LIMIT order_hour_count 2;
    -- Generating top2_hour_count.pv and top2_hour_count.app_category_id
    -- separately would produce two independent bags, so the whole bag is
    -- flattened into rows instead.
    GENERATE FLATTEN(top2_hour_count);
}
-- 版权声明:本文为博主原创文章,未经博主同意,不得转载。