from datetime import datetime

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# ------------------------------------------------
# Basics of time series operations (disabled tutorial section)
"""
t = datetime(2016, 9, 10)
print(t)
# 2016-09-10 00:00:00

data_list = [
    datetime(2016, 9, 1),
    datetime(2016, 9, 10),
    datetime(2017, 9, 1),
    datetime(2017, 9, 20),
    datetime(2017, 10, 1),
]
print(data_list)
# [datetime.datetime(2016, 9, 1, 0, 0), datetime.datetime(2016, 9, 10, 0, 0), datetime.datetime(2017, 9, 1, 0, 0), datetime.datetime(2017, 9, 20, 0, 0), datetime.datetime(2017, 10, 1, 0, 0)]
s1 = Series(np.random.rand(5), index=data_list)
print(s1)
# 2016-09-01    0.437216
# 2016-09-10    0.002021
# 2017-09-01    0.990085
# 2017-09-20    0.635123
# 2017-10-01    0.504584
# dtype: float64
print(s1.values)
# [0.74523743 0.67846232 0.33464572 0.66881491 0.34169192]
print(s1.index)
# DatetimeIndex(['2016-09-01', '2016-09-10', '2017-09-01', '2017-09-20',
#                '2017-10-01'],
#               dtype='datetime64[ns]', freq=None)
print(s1[1])
# 0.3247607714134729
print(s1[datetime(2016, 9, 10)])
# 0.3247607714134729
print(s1['2016-09-10'])
# 0.3247607714134729
print(s1['20160910'])
# 0.3247607714134729
print(s1['2016-09'])
# 2016-09-01    0.713300
# 2016-09-10    0.265708
# dtype: float64
print(s1['2016'])
# 2016-09-01    0.139233
# 2016-09-10    0.595806
# dtype: float64
"""

"""
# Generate one timestamp every 5 hours, 100 timestamps in total.
data_list_new = pd.date_range("2016-01-01", periods=100, freq="5H")
print(data_list_new)
# DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 05:00:00',
#                '2016-01-01 10:00:00', '2016-01-01 15:00:00',
#                '2016-01-01 20:00:00', '2016-01-02 01:00:00',
#                '2016-01-02 06:00:00', '2016-01-02 11:00:00',
#                '2016-01-02 16:00:00', '2016-01-02 21:00:00',
#                '2016-01-03 02:00:00', '2016-01-03 07:00:00',
#                '2016-01-03 12:00:00', '2016-01-03 17:00:00',
#                '2016-01-03 22:00:00', '2016-01-04 03:00:00',
#                '2016-01-04 08:00:00', '2016-01-04 13:00:00',
#                '2016-01-04 18:00:00', '2016-01-04 23:00:00',
#                '2016-01-05 04:00:00', '2016-01-05 09:00:00',
#                '2016-01-05 14:00:00', '2016-01-05 19:00:00',
#                '2016-01-06 00:00:00', '2016-01-06 05:00:00',
#                '2016-01-06 10:00:00', '2016-01-06 15:00:00',
#                '2016-01-06 20:00:00', '2016-01-07 01:00:00',
#                '2016-01-07 06:00:00', '2016-01-07 11:00:00',
#                '2016-01-07 16:00:00', '2016-01-07 21:00:00',
#                '2016-01-08 02:00:00', '2016-01-08 07:00:00',
#                '2016-01-08 12:00:00', '2016-01-08 17:00:00',
#                '2016-01-08 22:00:00', '2016-01-09 03:00:00',
#                '2016-01-09 08:00:00', '2016-01-09 13:00:00',
#                '2016-01-09 18:00:00', '2016-01-09 23:00:00',
#                '2016-01-10 04:00:00', '2016-01-10 09:00:00',
#                '2016-01-10 14:00:00', '2016-01-10 19:00:00',
#                '2016-01-11 00:00:00', '2016-01-11 05:00:00',
#                '2016-01-11 10:00:00', '2016-01-11 15:00:00',
#                '2016-01-11 20:00:00', '2016-01-12 01:00:00',
#                '2016-01-12 06:00:00', '2016-01-12 11:00:00',
#                '2016-01-12 16:00:00', '2016-01-12 21:00:00',
#                '2016-01-13 02:00:00', '2016-01-13 07:00:00',
#                '2016-01-13 12:00:00', '2016-01-13 17:00:00',
#                '2016-01-13 22:00:00', '2016-01-14 03:00:00',
#                '2016-01-14 08:00:00', '2016-01-14 13:00:00',
#                '2016-01-14 18:00:00', '2016-01-14 23:00:00',
#                '2016-01-15 04:00:00', '2016-01-15 09:00:00',
#                '2016-01-15 14:00:00', '2016-01-15 19:00:00',
#                '2016-01-16 00:00:00', '2016-01-16 05:00:00',
#                '2016-01-16 10:00:00', '2016-01-16 15:00:00',
#                '2016-01-16 20:00:00', '2016-01-17 01:00:00',
#                '2016-01-17 06:00:00', '2016-01-17 11:00:00',
#                '2016-01-17 16:00:00', '2016-01-17 21:00:00',
#                '2016-01-18 02:00:00', '2016-01-18 07:00:00',
#                '2016-01-18 12:00:00', '2016-01-18 17:00:00',
#                '2016-01-18 22:00:00', '2016-01-19 03:00:00',
#                '2016-01-19 08:00:00', '2016-01-19 13:00:00',
#                '2016-01-19 18:00:00', '2016-01-19 23:00:00',
#                '2016-01-20 04:00:00', '2016-01-20 09:00:00',
#                '2016-01-20 14:00:00', '2016-01-20 19:00:00',
#                '2016-01-21 00:00:00', '2016-01-21 05:00:00',
#                '2016-01-21 10:00:00', '2016-01-21 15:00:00'],
#               dtype='datetime64[ns]', freq='5H')
s2 = Series(np.random.rand(100), index=data_list_new)
print(s2)
"""

"""
# -----------------
# Resampling and plotting time series data
#t_range = pd.date_range("2016-01-01", "2016-12-31")
#print(t_range)
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
#                '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
#                '2016-01-09', '2016-01-10',
#                ...
#                '2016-12-22', '2016-12-23', '2016-12-24', '2016-12-25',
#                '2016-12-26', '2016-12-27', '2016-12-28', '2016-12-29',
#                '2016-12-30', '2016-12-31'],
#               dtype='datetime64[ns]', length=366, freq='D')
#s1 = Series(np.random.randn(len(t_range)), index=t_range)
#print(s1)
#print(s1["2016-01"].mean())
#0.05316056209771481
# s1_month = s1.resample("M").mean()  # resample: one value per month, the monthly mean
# print(s1_month)
# 2016-01-31    0.175917
# 2016-02-29   -0.018886
# 2016-03-31   -0.131760
# 2016-04-30   -0.134704
# 2016-05-31    0.147767
# 2016-06-30    0.382015
# 2016-07-31    0.163278
# 2016-08-31   -0.079203
# 2016-09-30    0.184607
# 2016-10-31    0.055851
# 2016-11-30    0.284106
# 2016-12-31   -0.030083
# Freq: M, dtype: float64

#print(s1.resample("H").ffill())
# 2016-01-01 00:00:00   -2.031085
# 2016-01-01 01:00:00   -2.031085
# 2016-01-01 02:00:00   -2.031085
# ........

t_range = pd.date_range("2016-01-01", "2016-12-31", freq="H")
print(t_range)
# DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
#                '2016-01-01 02:00:00', '2016-01-01 03:00:00',
#                '2016-01-01 04:00:00', '2016-01-01 05:00:00',
#                '2016-01-01 06:00:00', '2016-01-01 07:00:00',
#                '2016-01-01 08:00:00', '2016-01-01 09:00:00',
#                ...
#                '2016-12-30 15:00:00', '2016-12-30 16:00:00',
#                '2016-12-30 17:00:00', '2016-12-30 18:00:00',
#                '2016-12-30 19:00:00', '2016-12-30 20:00:00',
#                '2016-12-30 21:00:00', '2016-12-30 22:00:00',
#                '2016-12-30 23:00:00', '2016-12-31 00:00:00'],
#               dtype='datetime64[ns]', length=8761, freq='H')
stock_df = DataFrame(index=t_range)
print(stock_df.head())
# Empty DataFrame
# Columns: []
# Index: [2016-01-01 00:00:00, 2016-01-01 01:00:00, 2016-01-01 02:00:00, 2016-01-01 03:00:00, 2016-01-01 04:00:00]
stock_df["BABA"] = np.random.randint(80, 160, size=len(t_range))
stock_df["TENCENT"] = np.random.randint(30, 50, size=len(t_range))
print(stock_df.head())
#                      BABA  TENCENT
# 2016-01-01 00:00:00   147       47
# 2016-01-01 01:00:00    88       40
# 2016-01-01 02:00:00   143       33
# 2016-01-01 03:00:00   132       47
# 2016-01-01 04:00:00    93       44
# stock_df.plot()
import matplotlib.pyplot as plt
# plt.show()

weekly_df = DataFrame()
weekly_df["BABA"] = stock_df["BABA"].resample("W").mean()
weekly_df["TENCENT"] = stock_df["TENCENT"].resample("W").mean()
print(weekly_df.head())
#                   BABA    TENCENT
# 2016-01-03  113.819444  39.597222
# 2016-01-10  122.696429  39.029762
# 2016-01-17  120.458333  38.845238
# 2016-01-24  119.196429  39.690476
# 2016-01-31  118.315476  38.690476
weekly_df.plot()
plt.show()
"""

"""
# ------------------------------
# Data binning
score_list = np.random.randint(25, 100, size=20)
print(score_list)
# [41 88 82 66 83 84 77 29 72 97 77 81 80 45 30 74 84 46 95 54]
bins = [0, 59, 70, 80, 100]
score_cut = pd.cut(score_list, bins)
print(score_cut)
# [(0, 59], (0, 59], (80, 100], (70, 80], (59, 70], ..., (80, 100], (0, 59], (0, 59], (59, 70], (80, 100]]
# Length: 20
# Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
print(pd.value_counts(score_cut))
# (0, 59]      11
# (80, 100]     4
# (59, 70]      3
# (70, 80]      2
# dtype: int64
df = DataFrame()
df["score"] = score_list
df["student"] = [pd.util.testing.rands(3) for i in range(20)]
df["Categories"] = pd.cut(df["score"], bins, labels=["Low", "Ok", "Good", "Great"])
print(df)
#     score student Categories
# 0      71     sCO       Good
# 1      40     AgI        Low
# 2      61     ubC         Ok
# 3      65     P1K         Ok
# 4      78     ebd       Good
# 5      75     oxG       Good
# 6      81     JN0      Great
# 7      35     LpS        Low
# 8      53     L7l        Low
# 9      60     puw         Ok
# 10     27     3KJ        Low
# 11     77     2ID       Good
# 12     63     D26         Ok
# 13     96     jA7      Great
# 14     46     txB        Low
# 15     85     8NF      Great
# 16     96     jne      Great
# 17     71     xBX       Good
# 18     75     3HP       Good
# 19     93     Svl      Great
"""

# ------------------------------------------------
# Grouping data with GroupBy (disabled tutorial section)
"""
df = pd.read_csv("city_weather.csv")
# print(df)
#           data city  temperature  wind
# 0   03/01/2016   BJ            8     5
# 1   17/01/2016   BJ           12     2
# 2   31/01/2016   BJ           19     2
# 3   03/02/2016   BJ           -3     3
# 4   14/02/2016   BJ           19     2
# 5   13/03/2016   BJ            5     3
# 6   10/03/2016   SH           -4     4
# 7   03/04/2016   SH           19     3
# 8   24/04/2016   SH           20     3
# 9   08/05/2016   SH           17     3
# 10  22/05/2016   SH            4     2
# 11  05/06/2016   SH          -10     4
# 12  19/06/2016   SH            0     5
# 13  03/07/2016   SH            9     5
# 14  17/07/2016   GZ           10     2
# 15  31/07/2016   GZ           -1     5
# 16  14/08/2016   GZ            1     5
# 17  28/08/2016   GZ           25     4
# 18  11/09/2016   SZ           20     1
# 19  25/09/2016   SZ          -10     4

g = df.groupby(df["city"])
print(g)
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000297D780>
print(g.groups)
# {'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'), 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'), 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'), 'SZ': Int64Index([18, 19], dtype='int64')}
print(g.get_group("BJ"))
#          data city  temperature  wind
# 0  03/01/2016   BJ            8     5
# 1  17/01/2016   BJ           12     2
# 2  31/01/2016   BJ           19     2
# 3  03/02/2016   BJ           -3     3
# 4  14/02/2016   BJ           19     2
# 5  13/03/2016   BJ            5     3
df_bj = g.get_group("BJ")
print(df_bj.mean())
# temperature    10.000000
# wind            2.833333
# dtype: float64
print(g.mean())
#       temperature      wind
# city
# BJ         10.000  2.833333
# GZ          8.750  4.000000
# SH          6.875  3.625000
# SZ          5.000  2.500000
"""


# ------------------------------------------------
# Data aggregation
def inspect_group(attr):
    """Print the type and contents of one aggregation input, then return NaN.

    Passed to ``GroupBy.agg`` to show what pandas hands to a custom
    aggregator: the sample output in ``main`` shows one ``Series`` per
    (group, column) pair.

    Args:
        attr: The object handed over by ``agg``.

    Returns:
        ``numpy.nan``, so every cell of the aggregated frame is NaN.
    """
    print(type(attr))
    print(attr)
    return np.nan


def foo(attr):
    """Return the spread (maximum minus minimum) of ``attr``.

    Args:
        attr: Any object exposing ``max()`` and ``min()`` whose results
            support subtraction, e.g. a numeric ``pandas.Series``.

    Returns:
        ``attr.max() - attr.min()``.
    """
    return attr.max() - attr.min()


def main(csv_path="city_weather.csv"):
    """Run the aggregation demo on the city weather CSV file.

    Args:
        csv_path: Path of the CSV file to load. The sample output below
            shows the columns ``data``, ``city``, ``temperature`` and
            ``wind``.
    """
    df = pd.read_csv(csv_path)
    g = df.groupby("city")
    print(g.agg("min"))
    #             data  temperature  wind
    # city
    # BJ    03/01/2016           -3     2
    # GZ    14/08/2016           -1     2
    # SH    03/04/2016          -10     2
    # SZ    11/09/2016          -10     1

    print(g.agg(inspect_group))
    # <class 'pandas.core.series.Series'>
    # 0    03/01/2016
    # 1    17/01/2016
    # 2    31/01/2016
    # 3    03/02/2016
    # 4    14/02/2016
    # 5    13/03/2016
    # Name: data, dtype: object
    # <class 'pandas.core.series.Series'>
    # 14    17/07/2016
    # 15    31/07/2016
    # 16    14/08/2016
    # 17    28/08/2016
    # Name: data, dtype: object
    # <class 'pandas.core.series.Series'>
    # 6     10/03/2016
    # 7     03/04/2016
    # 8     24/04/2016
    # 9     08/05/2016
    # 10    22/05/2016
    # 11    05/06/2016
    # 12    19/06/2016
    # 13    03/07/2016
    # Name: data, dtype: object
    # <class 'pandas.core.series.Series'>
    # 18    11/09/2016
    # 19    25/09/2016
    # Name: data, dtype: object
    # <class 'pandas.core.series.Series'>
    # 0     8
    # 1    12
    # 2    19
    # 3    -3
    # 4    19
    # 5     5
    # Name: temperature, dtype: int64
    # <class 'pandas.core.series.Series'>
    # 14    10
    # 15    -1
    # 16     1
    # 17    25
    # Name: temperature, dtype: int64
    # <class 'pandas.core.series.Series'>
    # 6     -4
    # 7     19
    # 8     20
    # 9     17
    # 10     4
    # 11   -10
    # 12     0
    # 13     9
    # Name: temperature, dtype: int64
    # <class 'pandas.core.series.Series'>
    # 18    20
    # 19   -10
    # Name: temperature, dtype: int64
    # <class 'pandas.core.series.Series'>
    # 0    5
    # 1    2
    # 2    2
    # 3    3
    # 4    2
    # 5    3
    # Name: wind, dtype: int64
    # <class 'pandas.core.series.Series'>
    # 14    2
    # 15    5
    # 16    5
    # 17    4
    # Name: wind, dtype: int64
    # <class 'pandas.core.series.Series'>
    # 6     4
    # 7     3
    # 8     3
    # 9     3
    # 10    2
    # 11    4
    # 12    5
    # 13    5
    # Name: wind, dtype: int64
    # <class 'pandas.core.series.Series'>
    # 18    1
    # 19    4
    # Name: wind, dtype: int64
    #       data  temperature  wind
    # city
    # BJ     NaN          NaN   NaN
    # GZ     NaN          NaN   NaN
    # SH     NaN          NaN   NaN
    # SZ     NaN          NaN   NaN

    # Subtraction is undefined for the string column "data". Older pandas
    # silently dropped such columns (see the output below); pandas >= 2.0
    # raises TypeError instead, so the numeric columns are selected
    # explicitly.
    print(g[["temperature", "wind"]].agg(foo))
    #       temperature  wind
    # city
    # BJ             22     3
    # GZ             26     3
    # SH             30     3
    # SZ             30     3

    g_new = df.groupby(["city", "wind"])
    print(g_new.groups)
    # {('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
    #  ('BJ', 3): Int64Index([3, 5], dtype='int64'),
    #  ('BJ', 5): Int64Index([0], dtype='int64'),
    #  ('GZ', 2): Int64Index([14], dtype='int64'),
    #  ('GZ', 4): Int64Index([17], dtype='int64'),
    #  ('GZ', 5): Int64Index([15, 16], dtype='int64'),
    #  ('SH', 2): Int64Index([10], dtype='int64'),
    #  ('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
    #  ('SH', 4): Int64Index([6, 11], dtype='int64'),
    #  ('SH', 5): Int64Index([12, 13], dtype='int64'),
    #  ('SZ', 1): Int64Index([18], dtype='int64'),
    #  ('SZ', 4): Int64Index([19], dtype='int64')}
    print(g_new.get_group(("BJ", 3)))
    #          data city  temperature  wind
    # 3  03/02/2016   BJ           -3     3
    # 5  13/03/2016   BJ            5     3


if __name__ == "__main__":
    main()
# Source: https://www.cnblogs.com/nikecode/p/11130932.html
# Time: 2024-11-02 22:44:18