"""Explore a line-delimited JSON log: time-zone counts, browser counts, OS split.

The script reads one JSON object per line, loads the records into a
DataFrame, and then:

1. counts the values of the ``tz`` column (labelling missing and empty ones),
2. draws a horizontal bar chart of the 20 most frequent time zones,
3. counts the first whitespace-separated token of the ``a`` column
   (presumably the browser part of a user-agent string -- confirm against
   the dataset),
4. cross-tabulates ``tz`` against "Windows" / "Not Windows" based on ``a``.
"""

import json

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Location of the input file (one JSON document per line).
DATA_PATH = 'D:\\操作软件\\pydata-book-2nd-edition\\pydata-book-2nd-edition\\datasets\\bitly_usagov\\example.txt'


def load_records(path):
    """Parse a file holding one JSON document per line.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one decoded JSON value per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle; the original left it open.
    # NOTE(review): UTF-8 is assumed for the input -- confirm for your copy.
    with open(path, encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def count_time_zones(frame):
    """Count occurrences of each value in the ``tz`` column.

    Missing values are counted under ``'MISS'`` and empty strings under
    ``'KONG'``.

    Args:
        frame: DataFrame with a ``tz`` column.

    Returns:
        A Series of counts indexed by time-zone label, most frequent first.
    """
    zones = frame['tz'].fillna('MISS')
    # The original wrote `zones[zones == ''] == 'KONG'`, a comparison whose
    # result was thrown away, so empty strings were never relabelled.
    zones = zones.mask(zones == '', 'KONG')
    return zones.value_counts()


def plot_top_time_zones(tz_count, top=20, show=False):
    """Draw a horizontal bar chart of the leading time-zone counts.

    Args:
        tz_count: Series of counts, as returned by ``count_time_zones``.
        top: Number of leading entries to draw.
        show: When true, open the plot window via ``matplotlib.pyplot.show``.
            Off by default because the original left the show call disabled.

    Returns:
        The matplotlib Axes holding the chart.
    """
    axes = tz_count[:top].plot(kind='barh', rot=20)
    axes.set_xlabel('count')
    axes.set_ylabel('type')
    axes.set_title('tz_count')
    if show:
        # Imported lazily so the counting helpers work without a GUI backend.
        import matplotlib.pyplot as plt

        plt.show()
    return axes


def count_browsers(frame):
    """Count the first whitespace-separated token of each ``a`` value.

    Rows where ``a`` is missing are ignored.

    Args:
        frame: DataFrame with an ``a`` column of strings.

    Returns:
        A Series of counts indexed by that first token, most frequent first.
    """
    agents = frame['a'].dropna()
    return Series([agent.split()[0] for agent in agents]).value_counts()


def count_by_os(frame):
    """Cross-tabulate ``tz`` against whether ``a`` mentions "Windows".

    Rows where ``a`` is missing are ignored.

    Args:
        frame: DataFrame with ``tz`` and ``a`` columns.

    Returns:
        A DataFrame indexed by ``tz`` with one column per label
        (``'Windows'`` / ``'Not Windows'``); absent combinations are 0.
    """
    with_agent = frame[frame['a'].notnull()]
    # Grouping key: one label per remaining row.
    os_label = np.where(
        with_agent['a'].str.contains('Windows'), 'Windows', 'Not Windows'
    )
    grouped = with_agent.groupby(['tz', os_label])
    return grouped.size().unstack().fillna(0)


def main():
    """Run the whole analysis against ``DATA_PATH``."""
    records = load_records(DATA_PATH)
    # Optional check (disabled in the original): print(records[0])

    # Built once and reused; the original constructed the same frame twice.
    frame = DataFrame(records)

    # Time-zone counts.
    tz_count = count_time_zones(frame)
    # Optional check (disabled in the original): print(tz_count[:20])

    # Chart of the top 20 time zones; pass show=True to display the window.
    plot_top_time_zones(tz_count)

    # Browser counts.
    liu_count = count_browsers(frame)
    # Optional check (disabled in the original): print(liu_count[:20])

    # Second grouping: Windows versus everything else, per time zone.
    by_count = count_by_os(frame)
    print(by_count[:20])


if __name__ == "__main__":
    main()
# Original article: https://www.cnblogs.com/niushichong/p/10331219.html
# Retrieved: 2024-11-05 11:19:36