python数据分析-03pandas库

# Series --------------------------------------------------------------
# NOTE(review): in the original paste these two imports were fused into
# the "#Series" comment line, so they never executed and every later
# reference to pd/np would fail with NameError. Restored as real imports.
import numpy as np
import pandas as pd

# s1 = pd.Series([1,2,3,4])# print(s1)# # 0    1# # 1    2# # 2    3# # 3    4# # dtype: int64# print(s1.values) #[1 2 3 4]# print(s1.index) #RangeIndex(start=0, stop=4, step=1)

#传入数组# s2 = pd.Series(np.arange(10))# print(s2)# 0    0# 1    1# 2    2# 3    3# 4    4# 5    5# 6    6# 7    7# 8    8# 9    9# dtype: int32

#传入字典# s3 = pd.Series({"1":1,"2":2,"3":3})# print(s3)# # 1    1# # 2    2# # 3    3# # dtype: int64# print(s3.values) #[1 2 3]# print(s3.index) #Index([‘1‘, ‘2‘, ‘3‘], dtype=‘object‘)

# s4 = pd.Series([1,2,3,4],index=[‘A‘,‘B‘,‘C‘,‘D‘])# print(s4)# # A    1# # B    2# # C    3# # D    4# # dtype: int64# print(s4["A"]) #1# print(s4[s4>2])# # C    3# # D    4# # dtype: int64# print(s4.to_dict()) #{‘A‘: 1, ‘B‘: 2, ‘C‘: 3, ‘D‘: 4}

# s4 = pd.Series([1,2,3,4],index=[‘A‘,‘B‘,‘C‘,‘D‘])# print(s4)# A    1# B    2# C    3# D    4# dtype: int64# index_1 = [‘A‘,‘B‘,‘C‘,‘D‘,‘E‘]# s5 = pd.Series(s4,index=index_1)#print(s5)# A    1.0# B    2.0# C    3.0# D    4.0# E    NaN# dtype: float64#print(s5.isnull())# A    False# B    False# C    False# D    False# E     True# dtype: bool# print(s5.notnull())# A     True# B     True# C     True# D     True# E    False# dtype: bool

# s5.name = ‘demo‘# print(s5)# A    1.0# B    2.0# C    3.0# D    4.0# E    NaN# Name: demo, dtype: float64# s5.index.name = "demo index"# print(s5)# Name: demo, dtype: float64# demo index# A    1.0# B    2.0# C    3.0# D    4.0# E    NaN# Name: demo, dtype: float64

# Bring Series and DataFrame into the module namespace for the demos below.
from pandas import Series, DataFrame

# import webbrowser# link = "https://www.tiobe.com/tiobe-index/"# webbrowser.open(link) #打开这个网页,然后使用鼠标进行复制操作# df = pd.read_clipboard()#这里是读取鼠标复制的数据#print(df)#     Apr 2019  Apr 2018  Change  Programming Language  Ratings Change.1# 0          1         1     NaN                  Java  15.035%   -0.74%# 1          2         2     NaN                     C  14.076%   +0.49%# 2          3         3     NaN                   C++   8.838%   +1.62%# 3          4         4     NaN                Python   8.166%   +2.36%# 4          5         6  change     Visual Basic .NET   5.795%   +0.85%# 5          6         5  change                    C#   3.515%   -1.75%# 6          7         8  change            JavaScript   2.507%   -0.99%# 7          8         9  change                   SQL   2.272%   -0.38%# 8          9         7  change                   PHP   2.239%   -1.98%# 9         10        14  change     Assembly language   1.710%   +0.05%# 10        11        18  change           Objective-C   1.505%   +0.25%# 11        12        17  change                MATLAB   1.285%   -0.17%# 12        13        10  change                  Ruby   1.277%   -0.74%# 13        14        16  change                  Perl   1.269%   -0.26%# 14        15        11  change  Delphi/Object Pascal   1.264%   -0.70%# 15        16        12  change                     R   1.181%   -0.63%# 16        17        13  change          Visual Basic   1.060%   -0.74%# 17        18        19  change                    Go   1.009%   -0.17%# 18        19        15  change                 Swift   0.978%   -0.56%# 19        20        68  change                Groovy      NaN      NaN#print(type(df)) #<class ‘pandas.core.frame.DataFrame‘>#print(df.columns)# Index([‘Apr 2019‘, ‘Apr 2018‘, ‘Change‘, ‘Programming Language‘, ‘Ratings‘,#        ‘Change.1‘],#       dtype=‘object‘)#print(df.Ratings) #直接获取Ratings列 或者print(df["Ratings"])# 0    15.035%# 1    14.076%# 2     8.838%# 3     8.166%# 4     5.795%# 5     
3.515%# 6     2.507%# 7     2.272%# 8     2.239%# Name: Ratings, dtype: object#print(DataFrame(df,columns=["Programming Language","Ratings"]))#获取多列#   Programming Language  Ratings# 0                 Java  15.035%# 1                    C  14.076%# 2                  C++   8.838%# 3               Python   8.166%# 4    Visual Basic .NET   5.795%# 5                   C#   3.515%# 6           JavaScript   2.507%# 7                  SQL   2.272%# 8                  PHP   2.239%

#填充一列新列:Apr 2020,数据是用NaN填充的#df_new = DataFrame(df,columns=["Programming Language","Ratings","Apr 2020"])# print(df_new)#   Programming Language  Ratings  Apr 2020# 0                 Java  15.035%       NaN# 1                    C  14.076%       NaN# 2                  C++   8.838%       NaN# 3               Python   8.166%       NaN# 4    Visual Basic .NET   5.795%       NaN# 5                   C#   3.515%       NaN# 6           JavaScript   2.507%       NaN# 7                  SQL   2.272%       NaN# 8                  PHP   2.239%       NaN

# df_new["Apr 2020"] = range(9)# print(df_new)#给新的一列赋值#   Programming Language  Ratings  Apr 2020# 0                 Java  15.035%         0# 1                    C  14.076%         1# 2                  C++   8.838%         2# 3               Python   8.166%         3# 4    Visual Basic .NET   5.795%         4# 5                   C#   3.515%         5# 6           JavaScript   2.507%         6# 7                  SQL   2.272%         7# 8                  PHP   2.239%         8

#把数组赋值过来# df_new["Apr 2020"] = np.arange(9)# print(df_new)#   Programming Language  Ratings  Apr 2020# 0                 Java  15.035%         0# 1                    C  14.076%         1# 2                  C++   8.838%         2# 3               Python   8.166%         3# 4    Visual Basic .NET   5.795%         4# 5                   C#   3.515%         5# 6           JavaScript   2.507%         6# 7                  SQL   2.272%         7# 8                  PHP   2.239%         8

#由于其本身每列都是Series,所以可以用Series赋值# df_new["Apr 2020"] = pd.Series(np.arange(9))# print(df_new)#   Programming Language  Ratings  Apr 2020# 0                 Java  15.035%         0# 1                    C  14.076%         1# 2                  C++   8.838%         2# 3               Python   8.166%         3# 4    Visual Basic .NET   5.795%         4# 5                   C#   3.515%         5# 6           JavaScript   2.507%         6# 7                  SQL   2.272%         7# 8                  PHP   2.239%         8

#只赋值给某一行# df_new["Apr 2020"] = pd.Series([100,200],index=[1,2])# print(df_new)#   Programming Language  Ratings  Apr 2020# 0                 Java  15.035%       NaN# 1                    C  14.076%     100.0# 2                  C++   8.838%     200.0# 3               Python   8.166%       NaN# 4    Visual Basic .NET   5.795%       NaN# 5                   C#   3.515%       NaN# 6           JavaScript   2.507%       NaN# 7                  SQL   2.272%       NaN# 8                  PHP   2.239%       NaN

# Sample data for the DataFrame examples: three parallel column lists
# (country name, capital city, population).
# NOTE(review): the scraped version used curly "smart quotes" (U+2018)
# around every string, which is a SyntaxError in Python 3; replaced with
# plain ASCII quotes and reformatted onto one line per key.
data = {
    'country': ['belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
    'Population': [11190846, 1303171035, 207847528],
}
# s1 = pd.Series(data['country'])
# print(s1)
# 0    belgium
# 1      India
# 2     Brazil
# dtype: object
# print(s1.values)  # ['belgium' 'India' 'Brazil']
# print(s1.index)   # RangeIndex(start=0, stop=3, step=1)
# s1 = pd.Series(data['country'], index=['A', 'B', 'C'])
# print(s1)
# A    belgium
# B      India
# C     Brazil
# dtype: object

#DataFrame# df1 = DataFrame(data)# print(df1)#    country    Capital  Population# 0  belgium   Brussels    11190846# 1    India  New Delhi  1303171035# 2   Brazil   Brasilia   207847528# cou = df1["country"]# print(type(cou)) #<class ‘pandas.core.series.Series‘># print(df1.iterrows()) #<generator object DataFrame.iterrows at 0x000000000B6F4DE0># for row in df1.iterrows():#     print(row)#     # (0, country        belgium#     # Capital       Brussels#     # Population    11190846#     # Name: 0, dtype: object)#     print(row[0])#0#     print(row[1])#     # country belgium#     # Capital Brussels#     # Population  11190846#     # Name: 0, dtype: object#     print(type(row[1])) #<class ‘pandas.core.series.Series‘>#     break

# s1 = pd.Series(data["country"])# s2 = pd.Series(data["Capital"])# s3 = pd.Series(data["Population"])#print(s1)# 0    belgium# 1      India# 2     Brazil# dtype: object# df_new = DataFrame([s1,s2,s3],index=[‘country‘,‘Capital‘,‘Population‘])# df_new = df_new.T# print(df_new)#    country    Capital  Population# 0  belgium   Brussels    11190846# 1    India  New Delhi  1303171035# 2   Brazil   Brasilia   207847528

#DataFrame的IO操作# import webbrowser# link = "http://pandas.pydata.org/pandas-docs/version/0.20/io.html"# webbrowser.open(link)## df1 = pd.read_clipboard()#print(df1)#   Format Type      Data Description          Reader        Writer# 0         text                   CSV        read_csv        to_csv# 1         text                  JSON       read_json       to_json# 2         text                  HTML       read_html       to_html# 3         text       Local clipboard  read_clipboard  to_clipboard# 4       binary              MS Excel      read_excel      to_excel# 5       binary           HDF5 Format        read_hdf        to_hdf# 6       binary        Feather Format    read_feather    to_feather# 7       binary               Msgpack    read_msgpack    to_msgpack# 8       binary                 Stata      read_stata      to_stata# 9       binary                   SAS        read_sas# 10      binary  Python Pickle Format     read_pickle     to_pickle# 11         SQL                   SQL        read_sql        to_sql# 12         SQL      Google Big Query        read_gbq        to_gbq

# df1.to_csv("df1.csv",index=False)#去掉前面的index# df2 = pd.read_csv("df1.csv")# print(df2)  # Format Type      Data Description          Reader        Writer# 0         text                   CSV        read_csv        to_csv# 1         text                  JSON       read_json       to_json# 2         text                  HTML       read_html       to_html# 3         text       Local clipboard  read_clipboard  to_clipboard# 4       binary              MS Excel      read_excel      to_excel# 5       binary           HDF5 Format        read_hdf        to_hdf# 6       binary        Feather Format    read_feather    to_feather# 7       binary               Msgpack    read_msgpack    to_msgpack# 8       binary                 Stata      read_stata      to_stata# 9       binary                   SAS        read_sas# 10      binary  Python Pickle Format     read_pickle     to_pickle# 11         SQL                   SQL        read_sql        to_sql# 12         SQL      Google Big Query        read_gbq        to_gbq

# print(df1.to_json())# {"Format":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Type":{"0":"CSV","1":"JSON","2":"HTML","3":"Local","4":"MS","5":"HDF5","6":"Feather","7":"Msgpack","8":"Stata","9":"SAS","10":"Python","11":"SQL","12":"Google"},"Data":{"0":"read_csv","1":"read_json","2":"read_html","3":"clipboard","4":"Excel","5":"Format","6":"Format","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"Pickle","11":"read_sql","12":"Big"},"Description":{"0":"to_csv","1":"to_json","2":"to_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"Format","11":"to_sql","12":"Query"},"Reader":{"0":null,"1":null,"2":null,"3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":null,"8":null,"9":null,"10":"read_pickle","11":null,"12":"read_gbq"},"Writer":{"0":null,"1":null,"2":null,"3":null,"4":null,"5":null,"6":null,"7":null,"8":null,"9":null,"10":"to_pickle","11":null,"12":"to_gbq"}}

# print(pd.read_json(df1.to_json()))#    Format Type      Data Description          Reader        Writer# 0         text                   CSV        read_csv        to_csv# 1         text                  JSON       read_json       to_json# 10      binary  Python Pickle Format     read_pickle     to_pickle# 11         SQL                   SQL        read_sql        to_sql# 12         SQL      Google Big Query        read_gbq        to_gbq# 2         text                  HTML       read_html       to_html# 3         text       Local clipboard  read_clipboard  to_clipboard# 4       binary              MS Excel      read_excel      to_excel# 5       binary           HDF5 Format        read_hdf        to_hdf# 6       binary        Feather Format    read_feather    to_feather# 7       binary               Msgpack    read_msgpack    to_msgpack# 8       binary                 Stata      read_stata      to_stata# 9       binary                   SAS        read_sas

#假设有movie_metadata.csv 文件# imbd = pd.read_csv("movie_metadata.csv")# print(imbd.shape)#(5043,28)# print(imbd.head())# print(imbd[["color","director_name"]])#取出两列数据

# sub_df = imbd[["director_name","movie_title","imbd_score"]]#取多列要传入列名的列表(双层方括号),单层方括号会报KeyError# print(sub_df.head(5))# print(sub_df.iloc[10:20,:])# print(sub_df.iloc[10:20,0:2])# print(sub_df.loc[10:20,:])#和iloc类似,只是多了第20行# print(sub_df.loc[10:20,:"director_name"]) #可以使用key键

#Series Reindex#s1 = Series([1,2,3,4],index=[‘A‘,‘B‘,‘C‘,‘D‘])# print(s1)# A    1# B    2# C    3# D    4# dtype: int64

#print(s1.reindex(index=[‘A‘,‘B‘,‘C‘,‘D‘,‘E‘]))# A    1.0# B    2.0# C    3.0# D    4.0# E    NaN# dtype: float64

#print(s1.reindex(index=[‘A‘,‘B‘,‘C‘,‘D‘,‘E‘],fill_value=10))# A     1# B     2# C     3# D     4# E    10# dtype: int64

#s2 = Series([‘A‘,‘B‘,‘C‘],index=[1,5,10])#print(s2)# 1     A# 5     B# 10    C# dtype: object

#print(s2.reindex(index=range(15)))# 1     A# 5     B# 10    C# dtype: object# 0     NaN# 1       A# 2     NaN# 3     NaN# 4     NaN# 5       B# 6     NaN# 7     NaN# 8     NaN# 9     NaN# 10      C# 11    NaN# 12    NaN# 13    NaN# 14    NaN# dtype: object

#print(s2.reindex(index=range(15),method="ffill"))# 0     NaN# 1       A# 2       A# 3       A# 4       A# 5       B# 6       B# 7       B# 8       B# 9       B# 10      C# 11      C# 12      C# 13      C# 14      C# dtype: object

#Reindex dataframe# df1 = DataFrame(np.random.rand(25).reshape(5,5))# print(df1)#           0         1         2         3         4# 0  0.150685  0.741189  0.642348  0.625132  0.318640# 1  0.781998  0.793684  0.434840  0.053550  0.076352# 2  0.657116  0.261819  0.089875  0.298170  0.035670# 3  0.408057  0.550972  0.298262  0.734598  0.920229# 4  0.707607  0.163687  0.861138  0.553325  0.439473

# df2 = DataFrame(np.random.rand(25).reshape(5,5),index=[‘A‘,‘B‘,‘D‘,‘E‘,‘F‘],columns=[‘c1‘,‘c2‘,‘c3‘,‘c4‘,‘c5‘])# print(df2)#         c1        c2        c3        c4        c5# A  0.096956  0.687012  0.242486  0.106347  0.951611# B  0.534206  0.555345  0.743860  0.156659  0.228296# D  0.963385  0.648523  0.603671  0.904279  0.161911# E  0.549797  0.987869  0.048364  0.706606  0.820717# F  0.003817  0.923006  0.611485  0.986054  0.160444

# print(df2.reindex(index=[‘A‘,‘B‘,‘D‘,‘C‘,‘E‘,‘F‘]))#          c1        c2        c3        c4        c5# A  0.745011  0.621461  0.288680  0.177793  0.013119# B  0.431538  0.170305  0.780363  0.007156  0.139781# D  0.663396  0.807862  0.732135  0.347896  0.959864# C       NaN       NaN       NaN       NaN       NaN# E  0.145247  0.191087  0.811372  0.648703  0.697846# F  0.742532  0.439197  0.612185  0.114661  0.221951

# print(df2.reindex(columns=[‘c1‘,‘c2‘,‘c3‘,‘c4‘,‘c5‘,‘c6‘]))#          c1        c2        c3        c4        c5  c6# A  0.287383  0.910655  0.418470  0.613704  0.200391 NaN# B  0.942793  0.389105  0.619344  0.076861  0.474860 NaN# D  0.945629  0.308200  0.165710  0.152989  0.552817 NaN# E  0.876477  0.138687  0.838985  0.656992  0.773661 NaN# F  0.866165  0.539998  0.500313  0.540542  0.002450 NaN

# print(df2.reindex(index=[‘A‘,‘B‘,‘D‘,‘C‘,‘E‘,‘F‘],columns=[‘c1‘,‘c2‘,‘c3‘,‘c4‘,‘c5‘,‘c6‘]))#          c1        c2        c3        c4        c5  c6# A  0.978832  0.807321  0.366297  0.148317  0.308838 NaN# B  0.905668  0.114278  0.368676  0.428269  0.162910 NaN# D  0.930796  0.963658  0.902773  0.584296  0.295554 NaN# C       NaN       NaN       NaN       NaN       NaN NaN# E  0.101119  0.000268  0.301075  0.697321  0.121599 NaN# F  0.402271  0.660168  0.477529  0.590062  0.459596 NaN

# print(df2.reindex(index=[‘A‘,‘B‘]))#          c1        c2        c3        c4        c5# A  0.855483  0.462398  0.282791  0.454249  0.027320# B  0.223694  0.827418  0.368981  0.867265  0.471167

# print(df2.drop("A"))#          c1        c2        c3        c4        c5# B  0.047756  0.880659  0.744061  0.012340  0.216161# D  0.603093  0.769085  0.526477  0.187897  0.991472# E  0.159034  0.909088  0.765743  0.428868  0.972190# F  0.239292  0.982104  0.802697  0.848463  0.503050

# print(df2.drop("A",axis=0))#         c1        c2        c3        c4        c5# B  0.474883  0.859859  0.594369  0.077369  0.616871# D  0.562033  0.190256  0.882217  0.810458  0.855765# E  0.545617  0.872125  0.406509  0.544556  0.718795# F  0.944125  0.268808  0.070181  0.351121  0.040010

# print(df2.drop("c1",axis=1))#          c2        c3        c4        c5# A  0.404537  0.646484  0.319498  0.818558# B  0.231232  0.132706  0.851948  0.061789# D  0.067037  0.789874  0.368729  0.761373# E  0.176873  0.294302  0.818214  0.284220# F  0.378809  0.835109  0.124004  0.857353

原文地址:https://www.cnblogs.com/nikecode/p/11130876.html

时间: 2024-10-17 16:24:20

python数据分析-03pandas库的相关文章

python数据分析-02numpy库

#数学基础回顾之矩阵运算#基本概念 #矩阵:矩形的数组,即二维数组.其中向量和标量都是矩阵的特例 #向量:是指1*n或者n*1的矩阵 #标量:1*1的矩阵 #数组:N维的数组,是矩阵的延伸 #特殊矩阵: #全0全1矩阵 #单位矩阵 #矩阵加减运算: #相加,减的两个矩阵必须要有相同的行和列 #行和列对应元素相加减 #数组乘法(点乘) #数组乘法(点乘)是对应元素之间的乘法 #矩阵乘法: #设A为m*p的矩阵,B为p*n的矩阵,m*n的矩阵为C为A*B的乘积,即为C=AB# [a1 b1 * [c

python数据分析numpy库学习

import numpy as np def asum(a_list,b_list,n1=2,n2=3): a = np.array(a_list) b = np.array(b_list) c = pow(a,n1) + pow(b,n2) return c a_lst = [1,2,3,4] b_lst = [2,3,4,5] print(asum(a_lst,b_lst)) #np.array()生成数据对象ndarray a = np.array([[1,2,3,4],[1,2,3,4]

基于Python数据分析与机器学习案例实战教程

课程--基于Python数据分析与机器学习案例实战教程 分享网盘下载--https://pan.baidu.com/s/1jHSaRAY 密码: xk37 课程背景基于数据分析与机器学习领域,使用python作为课程的实战语言,随着大数据与人工智能领域日益火爆,数据分析和机器学习建模成了当下最热门的技术,课程旨在帮助同学们快速掌握python数据分析包以及经典机器学习算法并通过对真实数据集分析进行实战演示. 课程风格通俗易懂,基于真实数据集案例实战. 主体课程分成三个大模块 (1)python数

高端实战 Python数据分析与机器学习实战 Numpy/Pandas/Matplotlib等常用库

课程简介:? ? 课程风格通俗易懂,真实案例实战.精心挑选真实的数据集为案例,通过Python数据科学库numpy,pandas,matplot结合机器学习库scikit-learn完成一些列的机器学习案例.课程以实战为基础,所有课时都结合代码演示如何使用这些python库来完成一个真实的数据案例.算法与项目相结合,选择经典kaggle项目,从数据预处理开始一步步代码实战带大家快速入门机器学习.旨在帮助同学们快速上手如何使用python库来完整机器学习案例. ------------------

《Python数据分析常用手册》一、NumPy和Pandas篇

一.常用链接: 1.Python官网:https://www.python.org/ 2.各种库的whl离线安装包:http://www.lfd.uci.edu/~gohlke/pythonlibs/#scikit-learn 3.数据分析常用库的离线安装包(pip+wheels)(百度云):http://pan.baidu.com/s/1dEMXbfN 密码:bbs2 二.常用库 1.NumPy NumPy是高性能科学计算和数据分析的基础包.部分功能如下: ndarray, 具有矢量算术运算和

转:python 的开源库

Python在科学计算领域,有两个重要的扩展模块:Numpy和Scipy.其中Numpy是一个用python实现的科学计算包.包括: 一个强大的N维数组对象Array: 比较成熟的(广播)函数库: 用于整合C/C++和Fortran代码的工具包: 实用的线性代数.傅里叶变换和随机数生成函数. SciPy是一个开源的Python算法库和数学工具包,SciPy包含的模块有最优化.线性代数.积分.插值.特殊函数.快速傅里叶变换.信号处理和图像处理.常微分方程求解和其他科学与工程中常用的计算.其功能与软

[Python数据分析]新股破板买入,赚钱几率如何?

这是本人一直比较好奇的问题,网上没搜到,最近在看python数据分析,正好自己动手做一下试试.作者对于python是零基础,需要从头学起. 在写本文时,作者也没有完成这个小分析目标,边学边做吧. ================================================================ Python基础: 中国大学Mooc,南京大学,张莉老师 -<用Python玩转数据> 了解基本的语法和常用函数就行了,其他的用的时候再搜. 财经数据源: TuShare

总是在起头可是能怎么办呢 Python数据分析

目录 前言1 第1章准备工作5 本书主要内容5 为什么要使用Python进行数据分析6 重要的Python库7 安装和设置10 社区和研讨会16 使用本书16 致谢18 第2章引言20 来自bit.ly的1.usa.gov数据21 MovieLens1M数据集29 1880—2010年间全美婴儿姓名35 小结及展望47 第3章IPython:一种交互式计算和开发环境48 IPython基础49 内省51 使用命令历史60 与操作系统交互63 软件开发工具66 IPythonHTMLNoteboo

Python数据分析入门

Python数据分析入门 最近,Analysis with Programming加入了Planet Python.作为该网站的首批特约博客,我这里来分享一下如何通过Python来开始数据分析.具体内容如下: 数据导入 导入本地的或者web端的CSV文件: 数据变换: 数据统计描述: 假设检验 单样本t检验: 可视化: 创建自定义函数. 数据导入 这是很关键的一步,为了后续的分析我们首先需要导入数据.通常来说,数据是CSV格式,就算不是,至少也可以转换成CSV格式.在Python中,我们的操作如