"""Pandas basics walkthrough: Series, DataFrame, DataFrame I/O and reindexing.

Only the imports and the ``data`` dict are live code.  Every example is kept
commented out together with the output it printed, because several of them
need the system clipboard, a web browser, or a local CSV file.  Uncomment a
section to run it.

Layout of the commented examples: a line starting with ``# `` followed by code
is a statement; a line starting with ``#`` followed by five spaces is the
output recorded for the statement above it.
"""
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# ---------------------------------------------------------------------------
# Series
# ---------------------------------------------------------------------------
# s1 = pd.Series([1, 2, 3, 4])
# print(s1)
#     0    1
#     1    2
#     2    3
#     3    4
#     dtype: int64
# print(s1.values)  # [1 2 3 4]
# print(s1.index)   # RangeIndex(start=0, stop=4, step=1)

# Build a Series from a NumPy array.
# s2 = pd.Series(np.arange(10))
# print(s2)
#     0    0
#     1    1
#     2    2
#     3    3
#     4    4
#     5    5
#     6    6
#     7    7
#     8    8
#     9    9
#     dtype: int32

# Build a Series from a dict.
# s3 = pd.Series({"1": 1, "2": 2, "3": 3})
# print(s3)
#     1    1
#     2    2
#     3    3
#     dtype: int64
# print(s3.values)  # [1 2 3]
# print(s3.index)   # Index(['1', '2', '3'], dtype='object')

# s4 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
# print(s4)
#     A    1
#     B    2
#     C    3
#     D    4
#     dtype: int64
# print(s4["A"])  # 1
# print(s4[s4 > 2])
#     C    3
#     D    4
#     dtype: int64
# print(s4.to_dict())  # {'A': 1, 'B': 2, 'C': 3, 'D': 4}

# s4 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
# print(s4)
#     A    1
#     B    2
#     C    3
#     D    4
#     dtype: int64
# index_1 = ['A', 'B', 'C', 'D', 'E']
# s5 = pd.Series(s4, index=index_1)
# print(s5)
#     A    1.0
#     B    2.0
#     C    3.0
#     D    4.0
#     E    NaN
#     dtype: float64
# print(s5.isnull())
#     A    False
#     B    False
#     C    False
#     D    False
#     E     True
#     dtype: bool
# print(s5.notnull())
#     A     True
#     B     True
#     C     True
#     D     True
#     E    False
#     dtype: bool

# s5.name = 'demo'
# print(s5)
#     A    1.0
#     B    2.0
#     C    3.0
#     D    4.0
#     E    NaN
#     Name: demo, dtype: float64
# s5.index.name = "demo index"
# print(s5)
#     demo index
#     A    1.0
#     B    2.0
#     C    3.0
#     D    4.0
#     E    NaN
#     Name: demo, dtype: float64

# ---------------------------------------------------------------------------
# DataFrame built from clipboard data
# ---------------------------------------------------------------------------
# import webbrowser
# link = "https://www.tiobe.com/tiobe-index/"
# webbrowser.open(link)  # open this page, then copy the table with the mouse
# df = pd.read_clipboard()  # read the data that was just copied
# print(df)
#         Apr 2019  Apr 2018  Change  Programming Language  Ratings  Change.1
#     0   1   1   NaN     Java                  15.035%  -0.74%
#     1   2   2   NaN     C                     14.076%  +0.49%
#     2   3   3   NaN     C++                   8.838%   +1.62%
#     3   4   4   NaN     Python                8.166%   +2.36%
#     4   5   6   change  Visual Basic .NET     5.795%   +0.85%
#     5   6   5   change  C#                    3.515%   -1.75%
#     6   7   8   change  JavaScript            2.507%   -0.99%
#     7   8   9   change  SQL                   2.272%   -0.38%
#     8   9   7   change  PHP                   2.239%   -1.98%
#     9   10  14  change  Assembly language     1.710%   +0.05%
#     10  11  18  change  Objective-C           1.505%   +0.25%
#     11  12  17  change  MATLAB                1.285%   -0.17%
#     12  13  10  change  Ruby                  1.277%   -0.74%
#     13  14  16  change  Perl                  1.269%   -0.26%
#     14  15  11  change  Delphi/Object Pascal  1.264%   -0.70%
#     15  16  12  change  R                     1.181%   -0.63%
#     16  17  13  change  Visual Basic          1.060%   -0.74%
#     17  18  19  change  Go                    1.009%   -0.17%
#     18  19  15  change  Swift                 0.978%   -0.56%
#     19  20  68  change  Groovy                NaN      NaN
# print(type(df))  # <class 'pandas.core.frame.DataFrame'>
# print(df.columns)
#     Index(['Apr 2019', 'Apr 2018', 'Change', 'Programming Language', 'Ratings',
#            'Change.1'],
#           dtype='object')
# print(df.Ratings)  # select the Ratings column directly; same as print(df["Ratings"])
#     0    15.035%
#     1    14.076%
#     2     8.838%
#     3     8.166%
#     4     5.795%
#     5     3.515%
#     6     2.507%
#     7     2.272%
#     8     2.239%
#     Name: Ratings, dtype: object
# print(DataFrame(df, columns=["Programming Language", "Ratings"]))  # select several columns
#        Programming Language  Ratings
#     0  Java                  15.035%
#     1  C                     14.076%
#     2  C++                   8.838%
#     3  Python                8.166%
#     4  Visual Basic .NET     5.795%
#     5  C#                    3.515%
#     6  JavaScript            2.507%
#     7  SQL                   2.272%
#     8  PHP                   2.239%

# Add a new column "Apr 2020"; it does not exist in df, so it is filled with NaN.
# df_new = DataFrame(df, columns=["Programming Language", "Ratings", "Apr 2020"])
# print(df_new)
#        Programming Language  Ratings  Apr 2020
#     0  Java                  15.035%  NaN
#     1  C                     14.076%  NaN
#     2  C++                   8.838%   NaN
#     3  Python                8.166%   NaN
#     4  Visual Basic .NET     5.795%   NaN
#     5  C#                    3.515%   NaN
#     6  JavaScript            2.507%   NaN
#     7  SQL                   2.272%   NaN
#     8  PHP                   2.239%   NaN

# df_new["Apr 2020"] = range(9)
# print(df_new)  # assign values to the new column
#        Programming Language  Ratings  Apr 2020
#     0  Java                  15.035%  0
#     1  C                     14.076%  1
#     2  C++                   8.838%   2
#     3  Python                8.166%   3
#     4  Visual Basic .NET     5.795%   4
#     5  C#                    3.515%   5
#     6  JavaScript            2.507%   6
#     7  SQL                   2.272%   7
#     8  PHP                   2.239%   8

# Assign a NumPy array to the column.
# df_new["Apr 2020"] = np.arange(9)
# print(df_new)
#        Programming Language  Ratings  Apr 2020
#     0  Java                  15.035%  0
#     1  C                     14.076%  1
#     2  C++                   8.838%   2
#     3  Python                8.166%   3
#     4  Visual Basic .NET     5.795%   4
#     5  C#                    3.515%   5
#     6  JavaScript            2.507%   6
#     7  SQL                   2.272%   7
#     8  PHP                   2.239%   8

# Every column is itself a Series, so a Series can be assigned as well.
# df_new["Apr 2020"] = pd.Series(np.arange(9))
# print(df_new)
#        Programming Language  Ratings  Apr 2020
#     0  Java                  15.035%  0
#     1  C                     14.076%  1
#     2  C++                   8.838%   2
#     3  Python                8.166%   3
#     4  Visual Basic .NET     5.795%   4
#     5  C#                    3.515%   5
#     6  JavaScript            2.507%   6
#     7  SQL                   2.272%   7
#     8  PHP                   2.239%   8

# Assign only some rows: rows whose index is missing from the Series become NaN.
# df_new["Apr 2020"] = pd.Series([100, 200], index=[1, 2])
# print(df_new)
#        Programming Language  Ratings  Apr 2020
#     0  Java                  15.035%  NaN
#     1  C                     14.076%  100.0
#     2  C++                   8.838%   200.0
#     3  Python                8.166%   NaN
#     4  Visual Basic .NET     5.795%   NaN
#     5  C#                    3.515%   NaN
#     6  JavaScript            2.507%   NaN
#     7  SQL                   2.272%   NaN
#     8  PHP                   2.239%   NaN

# ---------------------------------------------------------------------------
# Series / DataFrame built from a dict of lists
# ---------------------------------------------------------------------------
# Sample data used by the examples below (the only live, non-import statement).
data = {
    'country': ['belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
    'Population': [11190846, 1303171035, 207847528],
}
# s1 = pd.Series(data['country'])
# print(s1)
#     0    belgium
#     1      India
#     2     Brazil
#     dtype: object
# print(s1.values)  # ['belgium' 'India' 'Brazil']
# print(s1.index)   # RangeIndex(start=0, stop=3, step=1)
# s1 = pd.Series(data['country'], index=['A', 'B', 'C'])
# print(s1)
#     A    belgium
#     B      India
#     C     Brazil
#     dtype: object

# DataFrame
# df1 = DataFrame(data)
# print(df1)
#        country  Capital    Population
#     0  belgium  Brussels   11190846
#     1  India    New Delhi  1303171035
#     2  Brazil   Brasilia   207847528
# cou = df1["country"]
# print(type(cou))  # <class 'pandas.core.series.Series'>
# print(df1.iterrows())  # <generator object DataFrame.iterrows at 0x000000000B6F4DE0>
# for row in df1.iterrows():
#     print(row)
#     #     (0, country       belgium
#     #     Capital          Brussels
#     #     Population       11190846
#     #     Name: 0, dtype: object)
#     print(row[0])  # 0
#     print(row[1])
#     #     country       belgium
#     #     Capital      Brussels
#     #     Population   11190846
#     #     Name: 0, dtype: object
#     print(type(row[1]))  # <class 'pandas.core.series.Series'>
#     break

# s1 = pd.Series(data["country"])
# s2 = pd.Series(data["Capital"])
# s3 = pd.Series(data["Population"])
# print(s1)
#     0    belgium
#     1      India
#     2     Brazil
#     dtype: object
# df_new = DataFrame([s1, s2, s3], index=['country', 'Capital', 'Population'])
# df_new = df_new.T
# print(df_new)
#        country  Capital    Population
#     0  belgium  Brussels   11190846
#     1  India    New Delhi  1303171035
#     2  Brazil   Brasilia   207847528

# ---------------------------------------------------------------------------
# DataFrame I/O
# ---------------------------------------------------------------------------
# import webbrowser
# link = "http://pandas.pydata.org/pandas-docs/version/0.20/io.html"
# webbrowser.open(link)
#
# df1 = pd.read_clipboard()
# print(df1)
# NOTE: the to_json() output further down shows that read_clipboard split
# multi-word cells (e.g. "Local clipboard") across columns, so the values of
# those rows are shifted relative to the header.
#     Format Type Data Description Reader Writer
#     0 text CSV read_csv to_csv
#     1 text JSON read_json to_json
#     2 text HTML read_html to_html
#     3 text Local clipboard read_clipboard to_clipboard
#     4 binary MS Excel read_excel to_excel
#     5 binary HDF5 Format read_hdf to_hdf
#     6 binary Feather Format read_feather to_feather
#     7 binary Msgpack read_msgpack to_msgpack
#     8 binary Stata read_stata to_stata
#     9 binary SAS read_sas
#     10 binary Python Pickle Format read_pickle to_pickle
#     11 SQL SQL read_sql to_sql
#     12 SQL Google Big Query read_gbq to_gbq

# df1.to_csv("df1.csv", index=False)  # do not write the index column
# df2 = pd.read_csv("df1.csv")
# print(df2)
#     Format Type Data Description Reader Writer
#     0 text CSV read_csv to_csv
#     1 text JSON read_json to_json
#     2 text HTML read_html to_html
#     3 text Local clipboard read_clipboard to_clipboard
#     4 binary MS Excel read_excel to_excel
#     5 binary HDF5 Format read_hdf to_hdf
#     6 binary Feather Format read_feather to_feather
#     7 binary Msgpack read_msgpack to_msgpack
#     8 binary Stata read_stata to_stata
#     9 binary SAS read_sas
#     10 binary Python Pickle Format read_pickle to_pickle
#     11 SQL SQL read_sql to_sql
#     12 SQL Google Big Query read_gbq to_gbq

# print(df1.to_json())
#     {"Format":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},
#      "Type":{"0":"CSV","1":"JSON","2":"HTML","3":"Local","4":"MS","5":"HDF5","6":"Feather","7":"Msgpack","8":"Stata","9":"SAS","10":"Python","11":"SQL","12":"Google"},
#      "Data":{"0":"read_csv","1":"read_json","2":"read_html","3":"clipboard","4":"Excel","5":"Format","6":"Format","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"Pickle","11":"read_sql","12":"Big"},
#      "Description":{"0":"to_csv","1":"to_json","2":"to_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"Format","11":"to_sql","12":"Query"},
#      "Reader":{"0":null,"1":null,"2":null,"3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":null,"8":null,"9":null,"10":"read_pickle","11":null,"12":"read_gbq"},
#      "Writer":{"0":null,"1":null,"2":null,"3":null,"4":null,"5":null,"6":null,"7":null,"8":null,"9":null,"10":"to_pickle","11":null,"12":"to_gbq"}}

# print(pd.read_json(df1.to_json()))
# The JSON keys are strings, so the rows come back in string order (0, 1, 10, 11, ...).
#     Format Type Data Description Reader Writer
#     0 text CSV read_csv to_csv
#     1 text JSON read_json to_json
#     10 binary Python Pickle Format read_pickle to_pickle
#     11 SQL SQL read_sql to_sql
#     12 SQL Google Big Query read_gbq to_gbq
#     2 text HTML read_html to_html
#     3 text Local clipboard read_clipboard to_clipboard
#     4 binary MS Excel read_excel to_excel
#     5 binary HDF5 Format read_hdf to_hdf
#     6 binary Feather Format read_feather to_feather
#     7 binary Msgpack read_msgpack to_msgpack
#     8 binary Stata read_stata to_stata
#     9 binary SAS read_sas

# Assume a movie_metadata.csv file exists in the working directory.
# imbd = pd.read_csv("movie_metadata.csv")
# print(imbd.shape)  # (5043,28)
# print(imbd.head())
# print(imbd[["color", "director_name"]])  # select two columns

# Selecting several columns needs a list inside the brackets; the original
# imbd["director_name","movie_title","imbd_score"] passed a tuple as one key.
# NOTE(review): "imbd_score" is probably a typo for "imdb_score" -- confirm
# against the CSV header.
# sub_df = imbd[["director_name", "movie_title", "imbd_score"]]
# print(sub_df.head(5))
# print(sub_df.iloc[10:20, :])
# print(sub_df.iloc[10:20, 0:2])
# print(sub_df.loc[10:20, :])  # like iloc, except that row 20 is included as well
# print(sub_df.loc[10:20, :"director_name"])  # column labels can be used as slice bounds

# ---------------------------------------------------------------------------
# Series reindex
# ---------------------------------------------------------------------------
# s1 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
# print(s1)
#     A    1
#     B    2
#     C    3
#     D    4
#     dtype: int64

# print(s1.reindex(index=['A', 'B', 'C', 'D', 'E']))
#     A    1.0
#     B    2.0
#     C    3.0
#     D    4.0
#     E    NaN
#     dtype: float64

# print(s1.reindex(index=['A', 'B', 'C', 'D', 'E'], fill_value=10))
#     A     1
#     B     2
#     C     3
#     D     4
#     E    10
#     dtype: int64

# s2 = Series(['A', 'B', 'C'], index=[1, 5, 10])
# print(s2)
#     1     A
#     5     B
#     10    C
#     dtype: object

# print(s2.reindex(index=range(15)))
#     0     NaN
#     1       A
#     2     NaN
#     3     NaN
#     4     NaN
#     5       B
#     6     NaN
#     7     NaN
#     8     NaN
#     9     NaN
#     10      C
#     11    NaN
#     12    NaN
#     13    NaN
#     14    NaN
#     dtype: object

# print(s2.reindex(index=range(15), method="ffill"))
#     0     NaN
#     1       A
#     2       A
#     3       A
#     4       A
#     5       B
#     6       B
#     7       B
#     8       B
#     9       B
#     10      C
#     11      C
#     12      C
#     13      C
#     14      C
#     dtype: object

# ---------------------------------------------------------------------------
# DataFrame reindex and drop
# ---------------------------------------------------------------------------
# df1 = DataFrame(np.random.rand(25).reshape(5, 5))
# print(df1)
#               0         1         2         3         4
#     0  0.150685  0.741189  0.642348  0.625132  0.318640
#     1  0.781998  0.793684  0.434840  0.053550  0.076352
#     2  0.657116  0.261819  0.089875  0.298170  0.035670
#     3  0.408057  0.550972  0.298262  0.734598  0.920229
#     4  0.707607  0.163687  0.861138  0.553325  0.439473

# NOTE: the numbers differ from one recorded output to the next, presumably
# because the random frame was regenerated on each run.
# df2 = DataFrame(np.random.rand(25).reshape(5, 5),
#                 index=['A', 'B', 'D', 'E', 'F'],
#                 columns=['c1', 'c2', 'c3', 'c4', 'c5'])
# print(df2)
#              c1        c2        c3        c4        c5
#     A  0.096956  0.687012  0.242486  0.106347  0.951611
#     B  0.534206  0.555345  0.743860  0.156659  0.228296
#     D  0.963385  0.648523  0.603671  0.904279  0.161911
#     E  0.549797  0.987869  0.048364  0.706606  0.820717
#     F  0.003817  0.923006  0.611485  0.986054  0.160444

# print(df2.reindex(index=['A', 'B', 'D', 'C', 'E', 'F']))
#              c1        c2        c3        c4        c5
#     A  0.745011  0.621461  0.288680  0.177793  0.013119
#     B  0.431538  0.170305  0.780363  0.007156  0.139781
#     D  0.663396  0.807862  0.732135  0.347896  0.959864
#     C       NaN       NaN       NaN       NaN       NaN
#     E  0.145247  0.191087  0.811372  0.648703  0.697846
#     F  0.742532  0.439197  0.612185  0.114661  0.221951

# print(df2.reindex(columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))
#              c1        c2        c3        c4        c5  c6
#     A  0.287383  0.910655  0.418470  0.613704  0.200391 NaN
#     B  0.942793  0.389105  0.619344  0.076861  0.474860 NaN
#     D  0.945629  0.308200  0.165710  0.152989  0.552817 NaN
#     E  0.876477  0.138687  0.838985  0.656992  0.773661 NaN
#     F  0.866165  0.539998  0.500313  0.540542  0.002450 NaN

# print(df2.reindex(index=['A', 'B', 'D', 'C', 'E', 'F'],
#                   columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))
#              c1        c2        c3        c4        c5  c6
#     A  0.978832  0.807321  0.366297  0.148317  0.308838 NaN
#     B  0.905668  0.114278  0.368676  0.428269  0.162910 NaN
#     D  0.930796  0.963658  0.902773  0.584296  0.295554 NaN
#     C       NaN       NaN       NaN       NaN       NaN NaN
#     E  0.101119  0.000268  0.301075  0.697321  0.121599 NaN
#     F  0.402271  0.660168  0.477529  0.590062  0.459596 NaN

# print(df2.reindex(index=['A', 'B']))
#              c1        c2        c3        c4        c5
#     A  0.855483  0.462398  0.282791  0.454249  0.027320
#     B  0.223694  0.827418  0.368981  0.867265  0.471167

# print(df2.drop("A"))
#              c1        c2        c3        c4        c5
#     B  0.047756  0.880659  0.744061  0.012340  0.216161
#     D  0.603093  0.769085  0.526477  0.187897  0.991472
#     E  0.159034  0.909088  0.765743  0.428868  0.972190
#     F  0.239292  0.982104  0.802697  0.848463  0.503050

# print(df2.drop("A", axis=0))
#              c1        c2        c3        c4        c5
#     B  0.474883  0.859859  0.594369  0.077369  0.616871
#     D  0.562033  0.190256  0.882217  0.810458  0.855765
#     E  0.545617  0.872125  0.406509  0.544556  0.718795
#     F  0.944125  0.268808  0.070181  0.351121  0.040010

# print(df2.drop("c1", axis=1))
#              c2        c3        c4        c5
#     A  0.404537  0.646484  0.319498  0.818558
#     B  0.231232  0.132706  0.851948  0.061789
#     D  0.067037  0.789874  0.368729  0.761373
#     E  0.176873  0.294302  0.818214  0.284220
#     F  0.378809  0.835109  0.124004  0.857353
# Original article: https://www.cnblogs.com/nikecode/p/11130876.html
# Time: 2024-10-17 16:24:20