pandas
基于numpy模块,用于处理文本或表格数据,支持文件存取操作,支持数据库
import pandas as pd
import numpy as np
Series
只能放一维数组
# A Series holds one-dimensional data; this one is built from a plain list.
# (The recorded outputs below show int32 as the inferred dtype; presumably
# that is platform-dependent and other setups may print int64.)
print(pd.Series(data=[1, 2, 3, 4, 5]))
'''
0 1
1 2
2 3
3 4
4 5
dtype: int32
'''
# The same values, supplied as a NumPy array instead of a list.
print(pd.Series(data=np.array([1, 2, 3, 4, 5])))
'''
0 1
1 2
2 3
3 4
4 5
dtype: int32
'''
# An explicit dtype overrides whatever integer type would be inferred.
print(pd.Series(data=np.array([1, 2, 3, 4, 5]), dtype='int64'))
'''
0 1
1 2
2 3
3 4
4 5
dtype: int64
'''
pd.DataFrame
用于二维数据(DataFrame 只支持二维,更高维的数组不能直接传入),生成一个带行索引和列标签的表格
# A 2x4 array becomes a table whose rows and columns are labeled 0, 1, 2, ...
two_by_four = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print(pd.DataFrame(data=two_by_four))
'''
0 1 2 3
0 1 2 3 4
1 5 6 7 8
'''
pd.date_range()
生成一个时间索引的一维数组
# Thirty consecutive daily timestamps; the partial date '2019-06' is taken
# as the first day of that month.
dates = pd.date_range(start='2019-06', periods=30)
print(dates)
'''
DatetimeIndex(['2019-06-01', '2019-06-02', '2019-06-03', '2019-06-04',
'2019-06-05', '2019-06-06', '2019-06-07', '2019-06-08',
'2019-06-09', '2019-06-10', '2019-06-11', '2019-06-12',
'2019-06-13', '2019-06-14', '2019-06-15', '2019-06-16',
'2019-06-17', '2019-06-18', '2019-06-19', '2019-06-20',
'2019-06-21', '2019-06-22', '2019-06-23', '2019-06-24',
'2019-06-25', '2019-06-26', '2019-06-27', '2019-06-28',
'2019-06-29', '2019-06-30'],
dtype='datetime64[ns]', freq='D')
'''
# Build a table of random prices (np.random.rand) with one row per day and
# one column per product; pd.date_range supplies the date index.
date = pd.date_range(start='2019-6-12', periods=7)
good_list = ['book', 'bike', 'drunk', 'huawei', 'car']
good_price = np.random.rand(len(date), len(good_list))  # 7 rows x 5 columns
df = pd.DataFrame(data=good_price, index=date, columns=good_list)
print(df)
'''
book bike drunk huawei car
2019-06-12 0.043552 0.111986 0.323475 0.930866 0.838791
2019-06-13 0.249247 0.138921 0.409256 0.182755 0.715179
2019-06-14 0.285038 0.355326 0.093277 0.187183 0.820155
2019-06-15 0.003262 0.372441 0.925479 0.988700 0.242206
2019-06-16 0.551080 0.560331 0.326195 0.944092 0.504605
2019-06-17 0.473796 0.596993 0.807937 0.616850 0.824657
2019-06-18 0.762190 0.491334 0.040981 0.204430 0.866449
'''
在生成的表格中查看各类数据
# Column labels of the table, i.e. ['book', 'bike', 'drunk', 'huawei', 'car'].
column_labels = df.columns
print(column_labels)
'''
dtypes 查看各列的数据类型
index 查看行序列或者索引
columns 查看各列的标签
values 查看数据框内的数据,也即不含表头索引的数据
describe 查看每一列的计数,均值,标准差,极值和分位数(50%即中位数),默认只统计数值型的列
transpose 转置,也可用T来操作
sort_index 排序,可按行或列index排序输出
sort_values() 按数据值排序
'''
处理缺失数据
# CSV sample for the missing-data examples: a header row followed by six
# records, three of which have one empty field. The text deliberately starts
# and ends with a newline.
_csv_lines = [
    '0,1,2,3',
    '5.1,,1.4,0.2',
    '4.9,3.0,1.4,0.2',
    '4.7,3.2,,0.2',
    '7.0,3.2,4.7,1.4',
    '6.4,3.2,4.5,1.5',
    '6.9,3.1,4.9,',
]
test_data = '\n' + '\n'.join(_csv_lines) + '\n'
导入StringIO
from io import StringIO
# Wrap the CSV text in an in-memory file-like object; note that this rebinds
# test_data from the raw string to the StringIO buffer.
test_data = StringIO(initial_value=test_data)
pd.read_csv 默认把第一行(非空行)当作列标签,并自动生成从0开始的行索引
# Parse the buffer into a table: the header line supplies the column labels
# and the rows receive a default 0-based index. Empty fields show up as NaN.
df = pd.read_csv(filepath_or_buffer=test_data)
print(df)
'''
0 1 2 3
0 5.1 NaN 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 NaN 0.2
3 7.0 3.2 4.7 1.4
4 6.4 3.2 4.5 1.5
5 6.9 3.1 4.9 NaN
'''
dropna 只要出现NaN(无效值)的行就删除
- 以及该方法中的参数控制
# dropna() with no arguments is the same as df.dropna(axis=0):
# every row that contains at least one NaN is removed.
print(df.dropna())
'''
0 1 2 3
1 4.9 3.0 1.4 0.2
3 7.0 3.2 4.7 1.4
4 6.4 3.2 4.5 1.5
'''
# axis parameter
# axis='columns' (equivalent to axis=1) removes every column containing a NaN.
print(df.dropna(axis='columns'))
'''
0
0 5.1
1 4.9
2 4.7
3 7.0
4 6.4
5 6.9
'''
# thresh parameter
# A row is kept only when it has at least `thresh` valid (non-NaN) values;
# rows with fewer valid values are dropped.
print(df.dropna(axis='index', thresh=3))
'''
0 1 2 3
0 5.1 NaN 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 NaN 0.2
3 7.0 3.2 4.7 1.4
4 6.4 3.2 4.5 1.5
5 6.9 3.1 4.9 NaN
'''
print(df.dropna(axis='index', thresh=4))
'''
0 1 2 3
1 4.9 3.0 1.4 0.2
3 7.0 3.2 4.7 1.4
4 6.4 3.2 4.5 1.5
'''
print(df.dropna(axis='index', thresh=5))
'''
Empty DataFrame
Columns: [0, 1, 2, 3]
Index: []
'''
# subset parameter
# Only the listed columns are checked for NaN (here the column labeled '3').
print(df.dropna(subset=['3']))
'''
0 1 2 3
0 5.1 NaN 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 NaN 0.2
3 7.0 3.2 4.7 1.4
4 6.4 3.2 4.5 1.5
'''
# fillna
# Replace every NaN (invalid value) with 0.
print(df.fillna(0))
合并数据
DataFrame 如果不输入index和columns参数,则行索引和列标签都默认从0开始
# Two 3x4 tables, one filled with zeros and one with ones. No index or
# columns are given, so both axes are labeled 0, 1, 2, ...
df1 = pd.DataFrame(data=np.zeros(shape=(3, 4)))
print(df1)
'''
0 1 2 3
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
'''
df2 = pd.DataFrame(data=np.ones(shape=(3, 4)))
print(df2)
'''
0 1 2 3
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
'''
pd.concat()
# axis='index' (equivalent to axis=0) stacks the tables vertically; each
# table keeps its own row labels, so 0..2 appears twice.
print(pd.concat(objs=[df1, df2], axis='index'))
'''
0 1 2 3
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
'''
# axis='columns' (equivalent to axis=1) joins the tables side by side.
print(pd.concat(objs=[df1, df2], axis='columns'))
'''
0 1 2 3 0 1 2 3
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
'''
数据写入excel文件和读取excel文件
# Write the price table to an Excel workbook, then read it back.
excel_path = r'C:\Users\联想\Desktop\d.xlsx'
df = pd.DataFrame(data=good_price, index=date, columns=good_list)
df.to_excel(excel_path)
# header=0 takes the column labels from the first sheet row;
# index_col=0 uses the first sheet column (the dates) as the row index.
df0 = pd.read_excel(excel_path, header=0, index_col=0)
print(df0)
'''
book bike drunk huawei car
2019-06-12 0.322727 0.029768 0.826520 0.589433 0.347090
2019-06-13 0.428079 0.415530 0.809133 0.746058 0.931913
2019-06-14 0.333673 0.594986 0.346576 0.920752 0.225355
2019-06-15 0.712803 0.449877 0.551796 0.302778 0.782467
2019-06-16 0.004480 0.398669 0.296981 0.746109 0.534168
2019-06-17 0.615625 0.976240 0.780116 0.227414 0.666836
2019-06-18 0.161665 0.873286 0.490482 0.960265 0.967142
'''
按索引(或key)取值
1.df.loc[]
# .loc selects by label: pass the row's index label (key) inside the brackets.
row_label = '2019-06-12'
print(df0.loc[row_label])
'''
book 0.322727
bike 0.029768
drunk 0.826520
huawei 0.589433
car 0.347090
Name: 2019-06-12 00:00:00, dtype: float64
'''
2.df.iloc[]
# .iloc selects by integer position, much like indexing a two-dimensional
# NumPy array: [row position, column position].
top_left = df0.iloc[0, 0]
print(top_left)
'''
0.322727442903458
'''
# The same indexer can be used for assignment: set the whole first row to 0.
df0.iloc[0, :] = 0
print(df0)
'''
book bike drunk huawei car
2019-06-12 0.000000 0.000000 0.000000 0.000000 0.000000
2019-06-13 0.428079 0.415530 0.809133 0.746058 0.931913
2019-06-14 0.333673 0.594986 0.346576 0.920752 0.225355
2019-06-15 0.712803 0.449877 0.551796 0.302778 0.782467
2019-06-16 0.004480 0.398669 0.296981 0.746109 0.534168
2019-06-17 0.615625 0.976240 0.780116 0.227414 0.666836
2019-06-18 0.161665 0.873286 0.490482 0.960265 0.967142
'''
原文地址:https://www.cnblogs.com/itboy-newking/p/11041707.html
时间: 2024-10-07 18:55:49