茄子快传数据分析(一)----数据清理
2018年09月03日 18:41:44 amin_hui 阅读数:117
茄子快传原理
流程图:
数据
“events”: “1473367236143\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000027\u0001\n1473367261933\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000028\u0001\n1473367280349\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000029\u0001\n1473367331326\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000030\u0001\n1473367353310\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000031\u0001\n1473367387087\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000032\u0001\n1473367402167\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000033\u0001\n1473367451994\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000034\u0001\n1473367474316\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000035\u0001\n1473367564181\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000036\u0001\n1473367589527\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000037\u0001\n1473367610310\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001
\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000038\u0001\n1473367624647\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000039\u0001\n1473368004298\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000040\u0001\n1473368017851\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000041\u0001\n1473369599067\u00010\u0001AppLaunch\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000042\u0001\n1473369622274\u00010\u0001connectByQRCode\u0001\u00010\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u0001\u00011609072239570000043\u0001\n”,
数据解析
使用json在线解析器解析为:
“header”: {
“cid_sn”: “1501004207EE98AA”, sdn码
“mobile_data_type”: “”,
“os_ver”: “9”, 操作系统
“mac”: “88:1f:a1:03:7d:a8”, 物理地址
“resolution”: “2560x1337”, 分辨率
“commit_time”: “1473399829041”, 提交时间
“sdk_ver”: “103”, sdk版本
“device_id_type”: “mac”, 设备ID类型(此处表示设备ID取自mac地址)
“city”: “江门市”, 城市
“android_id”: “”, 安卓设备的安卓id
“device_model”: “MacBookPro11,1”,设备型号
“carrier”: “中国xx”, 运营商
“promotion_channel”: “1”, 推广渠道
“app_ver_name”: “1.7”, app版本号
“imei”: “”, 入网标识(IMEI)
“app_ver_code”: “23”, 公司内部版本码
“pid”: “pid”,
“net_type”: “3”, 网络类型
“device_id”: “m.88:1f:a1:03:7d:a8”, 设备id
“app_device_id”: “m.88:1f:a1:03:7d:a8”,
“release_channel”: “appstore”, 发布渠道
“country”: “CN”,
“time_zone”: “28800000”, 时区偏移(毫秒,28800000 即 UTC+8)
“os_name”: “ios”, 操作系统类型
“manufacture”: “apple”, 生产厂家
“commit_id”: “fde7ee2e48494b24bf3599771d7c2a78”, 事件标识
“app_token”: “XIAONIU_I”, app标识
“account”: “none”, 登陆账号
“app_id”: “com.appid.xiaoniu”, app组名
“build_num”: “YVF6R16303000403”, 编译号
“language”: “zh” 系统所使用语言
}
}
数据清理
1、 数据清理需求分析
release_channel,device_id,city,device_id_type,app_ver_name 这几个字段如果缺失,则过滤(下面的代码还额外要求 os_name、mac 不为空)
将数据整理成 字段,字段,字段,…… 这种形式
在每条数据中添加一个字段:user_id(值就是mac)
2、数据清理代码
/**
 * Map-only MapReduce job that cleans raw app-log records.
 *
 * <p>Each input line is parsed as a JSON object; its {@code "header"} object is
 * flattened into one comma-separated output line, with an extra trailing
 * {@code user_id} column whose value is the record's {@code mac}. Records that
 * lack any required field are dropped.
 */
public class AppLogClean {

    /**
     * Mapper that filters incomplete records and emits the flattened header as
     * the output key (the value is always {@link NullWritable}).
     */
    public static class MapTask extends Mapper<LongWritable, Text, Text, NullWritable> {

        /** Header fields that must be non-blank, otherwise the record is dropped. */
        private static final String[] REQUIRED_FIELDS = {
            "release_channel", "device_id", "city", "device_id_type",
            "app_ver_name", "os_name", "mac"
        };

        /** Header fields written to the output line, in column order. */
        private static final String[] OUTPUT_FIELDS = {
            "cid_sn", "mobile_data_type", "os_ver", "mac", "resolution",
            "commit_time", "sdk_ver", "device_id_type", "city", "android_id",
            "device_model", "carrier", "promotion_channel", "app_ver_name",
            "imei", "app_ver_code", "pid", "net_type", "device_id",
            "app_device_id", "release_channel", "country", "time_zone",
            "os_name", "manufacture", "commit_id", "app_token", "account",
            "app_id", "build_num", "language"
        };

        // Reused across map() calls to avoid allocating per record.
        private final StringBuilder sb = new StringBuilder();
        private final Text k = new Text();

        /**
         * Cleans one input line and writes it out when it passes the filters.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one raw log line, expected to be a JSON object with a
         *                {@code "header"} object
         * @param context Hadoop context used to emit the cleaned line
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (StringUtils.isBlank(line)) {
                return;
            }

            JSONObject record = JSON.parseObject(line);
            if (record == null) {
                return;
            }
            // A record without a header has none of the required fields: drop it
            // instead of failing the task with a NullPointerException.
            JSONObject header = record.getJSONObject("header");
            if (header == null) {
                return;
            }

            for (String field : REQUIRED_FIELDS) {
                if (StringUtils.isBlank(header.getString(field))) {
                    return;
                }
            }

            // Android records must also carry an android_id. The platform is in
            // os_name (app_ver_name holds a version string such as "1.7").
            if ("android".equals(header.getString("os_name"))
                    && StringUtils.isBlank(header.getString("android_id"))) {
                return;
            }

            // Reset up front so leftovers from an earlier call can never leak in.
            sb.setLength(0);
            for (String field : OUTPUT_FIELDS) {
                // An absent optional field is appended as the text "null",
                // exactly as StringBuilder.append(String) does for a null argument.
                sb.append(header.getString(field)).append(",");
            }
            // Extra trailing column: user_id, whose value is the mac address.
            sb.append(header.getString("mac"));

            k.set(sb.toString());
            context.write(k, NullWritable.get());
        }
    }

    /**
     * Configures and runs the map-only cleaning job with hard-coded local
     * input and output paths, then prints whether it succeeded.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job cannot be configured or run
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Wire up the driver class, the mapper, and the output types.
        job.setJarByClass(AppLogClean.class);
        job.setMapperClass(MapTask.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path("D:\\a\\appuserdata\\input\\20170101"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\a\\appuserdata\\out"));

        // No reduce phase is needed, so run with zero reducers.
        job.setNumReduceTasks(0);

        boolean ret = job.waitForCompletion(true);
        System.out.println(ret?"你很优秀":"滚去调bug!");
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
3、数据清理完的文件
4、清理完成的数据格式
1501004207EE98AA,,22,1c:77:f6:78:f5:75,1080x1920,1473396818952,103,mac,江门市,867830021735040,
字段与字段间以逗号隔开
原文地址:https://www.cnblogs.com/timxgb/p/10659955.html