第三十八篇 pandas模块

pandas

基于numpy模块，用于处理文本或表格数据，支持文件存取操作，支持数据库

import pandas as pd

import numpy as np

Series

只能放一维数组

print(pd.Series([1,2,3,4,5]))
'''
0    1
1    2
2    3
3    4
4    5
dtype: int32
'''
print(pd.Series(np.array([1,2,3,4,5])))
'''
0    1
1    2
2    3
3    4
4    5
dtype: int32
'''
print(pd.Series(np.array([1,2,3,4,5]),dtype='int64'))
'''
0    1
1    2
2    3
3    4
4    5
dtype: int64
'''

pd.DataFrame

二维数组及以上使用，生成一个表格

 print(pd.DataFrame(np.array([[1,2,3,4],[5,6,7,8]])))
'''
   0  1  2  3
0  1  2  3  4
1  5  6  7  8
'''

pd.date_range()

生成一个时间索引的一维数组

dates = pd.date_range('2019-06',periods=30)
print(dates)
'''
DatetimeIndex(['2019-06-01', '2019-06-02', '2019-06-03', '2019-06-04',
               '2019-06-05', '2019-06-06', '2019-06-07', '2019-06-08',
               '2019-06-09', '2019-06-10', '2019-06-11', '2019-06-12',
               '2019-06-13', '2019-06-14', '2019-06-15', '2019-06-16',
               '2019-06-17', '2019-06-18', '2019-06-19', '2019-06-20',
               '2019-06-21', '2019-06-22', '2019-06-23', '2019-06-24',
               '2019-06-25', '2019-06-26', '2019-06-27', '2019-06-28',
               '2019-06-29', '2019-06-30'],
              dtype='datetime64[ns]', freq='D')
'''

# np.random.rand生成随机数(价格)，利用pd.date_range生成日期索引
good_list = ['book','bike','drunk','huawei','car']
good_price = np.random.rand(7,5)
date = pd.date_range('2019-6-12',periods=7)
df = pd.DataFrame(good_price,index=date,columns=good_list)
print(df)
'''
                book      bike     drunk    huawei       car
2019-06-12  0.043552  0.111986  0.323475  0.930866  0.838791
2019-06-13  0.249247  0.138921  0.409256  0.182755  0.715179
2019-06-14  0.285038  0.355326  0.093277  0.187183  0.820155
2019-06-15  0.003262  0.372441  0.925479  0.988700  0.242206
2019-06-16  0.551080  0.560331  0.326195  0.944092  0.504605
2019-06-17  0.473796  0.596993  0.807937  0.616850  0.824657
2019-06-18  0.762190  0.491334  0.040981  0.204430  0.866449
'''

在生成的表格中查看各类数据

print(df.columns) # ['book','bike','drunk','huawei','car']
'''
dtype  查看数据类型
index  查看行序列或者索引
columns    查看各列的标签
values 查看数据框内的数据，也即不含表头索引的数据
describe   查看数据每一列的极值，均值，中位数，只可用于数值型数据
transpose  转置，也可用Ｔ来操作
sort_index 排序，可按行或列index排序输出
sort_values() 按数据值排序
'''

处理缺失数据

test_data = '''
0,1,2,3
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
'''

导入StringIO

from io import StringIO
test_data = StringIO(test_data)   # office把数据读入内存

pd.read_csv 把第一行当作行索引，在第一列填充列索引

df = pd.read_csv(test_data)  # 添加行和列的索引，打印cheng表格格式
print(df)
'''
     0    1    2    3
0  5.1  NaN  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  NaN  0.2
3  7.0  3.2  4.7  1.4
4  6.4  3.2  4.5  1.5
5  6.9  3.1  4.9  NaN
'''

dropna 只要出现NaN（无效值）的行就删除

以及该方法中的参数控制

 print(df.dropna())  默认是 df.dropna(axis=0)
'''
     0    1    2    3
1  4.9  3.0  1.4  0.2
3  7.0  3.2  4.7  1.4
4  6.4  3.2  4.5  1.5
'''

# axis参数
# dropna(axis=1)只要出现NaN（无效值）的列就删除
 print(df.dropna(axis=1))
'''
     0
0  5.1
1  4.9
2  4.7
3  7.0
4  6.4
5  6.9
'''

# thresh参数
# dropna中的thresh参数:保留具有thresh参数指定个数有效值的行，没有这么多有效数据的行就删除
print(df.dropna(thresh=3,axis=0))
'''
     0    1    2    3
0  5.1  NaN  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  NaN  0.2
3  7.0  3.2  4.7  1.4
4  6.4  3.2  4.5  1.5
5  6.9  3.1  4.9  NaN

'''
print(df.dropna(thresh=4,axis=0))
'''
    0    1    2    3
1  4.9  3.0  1.4  0.2
3  7.0  3.2  4.7  1.4
4  6.4  3.2  4.5  1.5
'''
print(df.dropna(thresh=5,axis=0))
'''
Empty DataFrame
Columns: [0, 1, 2, 3]
Index: []
'''

# dropna中的subset参数
print(df.dropna(subset=['3']))
'''
     0    1    2    3
0  5.1  NaN  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  NaN  0.2
3  7.0  3.2  4.7  1.4
4  6.4  3.2  4.5  1.5
'''

# fillna参数
# 将NaN（无效值）都赋值为0
print(df.fillna(value=0))

合并数据

DataFrame 如果不输入index和column参数，则都默认从0开始

df1 = pd.DataFrame(np.zeros((3,4)))
print(df1)
'''
     0    1    2    3
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
'''
df2 = pd.DataFrame(np.ones((3,4)))
print(df2)
'''
     0    1    2    3
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
'''

pd.concat()

# pd.concat(，axis=0)垂直合并数组
print(pd.concat((df1,df2),axis=0))
'''
     0    1    2    3
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
'''

# pd.concat(，axis=1)   水平合并数组
print(pd.concat((df1,df2),axis=1))
'''
     0    1    2    3    0    1    2    3
0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
'''

数据写入excel文件和读取excel文件

 df = pd.DataFrame(good_price,index=date,columns=good_list)
df.to_excel(r'C:\Users\联想\Desktop\d.xlsx')
df0 = pd.read_excel(r'C:\Users\联想\Desktop\d.xlsx',header=0,index_col=0)
print(df0)
'''
                book      bike     drunk    huawei       car
2019-06-12  0.322727  0.029768  0.826520  0.589433  0.347090
2019-06-13  0.428079  0.415530  0.809133  0.746058  0.931913
2019-06-14  0.333673  0.594986  0.346576  0.920752  0.225355
2019-06-15  0.712803  0.449877  0.551796  0.302778  0.782467
2019-06-16  0.004480  0.398669  0.296981  0.746109  0.534168
2019-06-17  0.615625  0.976240  0.780116  0.227414  0.666836
2019-06-18  0.161665  0.873286  0.490482  0.960265  0.967142
'''

按索引（或key）取值

1.df.loc[]

# df0.loc[] 里面放key
print(df0.loc['2019-06-12'])
'''
book      0.322727
bike      0.029768
drunk     0.826520
huawei    0.589433
car       0.347090
Name: 2019-06-12 00:00:00, dtype: float64
'''

2.df.iloc[]

# df0.iloc[]里面放索引值，类似于numpy模块中的二维数组的索引取值
print(df0.iloc[0,0])
'''
0.322727442903458
'''
# 可以用这种方法赋值
df0.iloc[0,:] = 0
print(df0)
'''
                book      bike     drunk    huawei       car
2019-06-12  0.000000  0.000000  0.000000  0.000000  0.000000
2019-06-13  0.428079  0.415530  0.809133  0.746058  0.931913
2019-06-14  0.333673  0.594986  0.346576  0.920752  0.225355
2019-06-15  0.712803  0.449877  0.551796  0.302778  0.782467
2019-06-16  0.004480  0.398669  0.296981  0.746109  0.534168
2019-06-17  0.615625  0.976240  0.780116  0.227414  0.666836
2019-06-18  0.161665  0.873286  0.490482  0.960265  0.967142
'''

原文地址：https://www.cnblogs.com/itboy-newking/p/11041707.html

时间： 2024-10-07 18:55:49

第三十八篇 pandas模块的相关文章

第三十六篇 hashlib模块、hmac模块和logging模块

目录第三十七篇 hashlib模块.hmac模块和logging模块一.hashlib模块 1.hash是什么 2.撞库破解hash算法加密二.hmac模块三.logging模块 1.日志的五个级别 2.V3 3.日志配置文件 4.总结第三十七篇 hashlib模块.hmac模块和logging模块一.hashlib模块 1.hash是什么 1.hashlib模块一般用于明文加密 2.hash是一种算法,在hashlib模块中主要提供了md5 等算法,传入的内容通过这些算法,会得到一

Python之路(第十八篇)shutil 模块、zipfile模块、configparser模块

一.shutil 模块 1.shutil.copyfileobj(fsrc, fdst[, length]) 将文件内容拷贝到另一个文件中,需要打开文件 import shutil shutil.copyfileobj(open("old_test.txt","r"),open("new_test.txt","w")) 输出结果 2.shutil.copyfile(src,dst) 复制文件内容到另外一个文件,不需要打开文件,

Python之路(第三十八篇) 并发编程：进程同步锁/互斥锁、信号量、事件、队列、生产者消费者模型

一.进程锁(同步锁/互斥锁) 进程之间数据不共享,但是共享同一套文件系统,所以访问同一个文件,或同一个打印终端,是没有问题的, 而共享带来的是竞争,竞争带来的结果就是错乱,如何控制,就是加锁处理. 例子 #并发运行,效率高,但竞争同一打印终端,带来了打印错乱 from multiprocessing import Process import os,time def work(): print('%s is running' %os.getpid()) time.sleep(2) print('

第三十五篇 os模块、sys模块、json模块、pickle模块

目录一.os模块二.sys模块三.json模块 dump和load 四.pickle模块一.os模块 os模块和操作系统交互,主要用于文件操作 import os # test.py文件中 # os.mkdir() 一个路径参数和一个字符串参数.如果有路径参数,则在该路径下创建一个新的文件夹:如果无路径参数,则在当前文件的同级路径下创建一个新的文件夹 os.mkdir('king') # 创建了和test.py文件同级目录下的名为king的文件夹 # os.removedirs() 一个

第三十九篇 matplotlib模块

matplotlib模块绘图库,可以创建常用的统计图(条形图.箱型图.折线图.散点图和直方图) bar() 条形图 # 由于该模块不识别中文,所以我们需要导入一个中文简体字文件 import matplotlib.pyplot as plt from matplotlib.font_manager import FontProperties font = FontProperties(fname='B:\\msyh.ttc') # 在文件B中找字体文件 # 修改背景为条纹 plt.style.

C++第三十八篇 -- 研究一下Windows驱动开发（二）--WDM式驱动的加载

基于Windows驱动开发技术详解这本书一.简单的INF文件剖析 INF文件是一个文本文件,由若干个节(Section)组成.每个节的名称用一个方括号指示,紧接着方括号后面的就是节内容.每一行就是一项内容,其形式都是类似SomeEntry=SomwValue.每个项的顺序是可以颠倒的,但系统分析INF文件的时候,是顺序解析的.INF中注释语句是用分号开头的. 二.WDM设备安装在注册表中的变化 WDM式驱动程序的安装会在三个方面修改注册表,分别是硬件子键(Hardware).类子键(Class

第三十八篇、给UITabBar按钮的动画效果

- (void)tabBar:(UITabBar *)tabBar didSelectItem:(UITabBarItem *)item { NSInteger index = [self.tabBar.items indexOfObject:item]; if (self.indexFlag != index) { [self animationWithIndex:index]; } } // 动画 - (void)animationWithIndex:(NSInteger) index {

【黑金原创教程】【FPGA那些事儿-驱动篇I 】实验十八：SDRAM模块① — 单字读写

实验十八:SDRAM模块① — 单字读写笔者与SDRAM有段不短的孽缘,它作为冤魂日夜不断纠缠笔者.笔者尝试过许多方法将其退散,不过屡试屡败的笔者,最终心情像橘子一样橙.<整合篇>之际,笔者曾经大战几回儿,不过内容都是点到即止.最近它破蛊而出,日夜不停:“好~痛苦!好~痛苦!”地呻吟着,吓得笔者不敢半夜如厕.疯狂之下,誓要歪它不可 ... 可恶的东西,笔者要它血债血还! 图18.1 数据读取(理想时序左,物理时序右). 首先,让我们来了解一下,什么才是数据读取的最佳状态?如图18.1所示,红

Python之路【第十八篇】：Web框架们

Python之路[第十八篇]:Web框架们 Python的WEB框架 Bottle Bottle是一个快速.简洁.轻量级的基于WSIG的微型Web框架,此框架只由一个 .py 文件,除了Python的标准库外,其不依赖任何其他模块. 1 2 3 4 pip install bottle easy_install bottle apt-get install python-bottle wget http://bottlepy.org/bottle.py Bottle框架大致可以分为以下部分: 路