吴裕雄--天生自然 PYTHON数据分析:人类发展报告——HDI、GDI、健康、全球人口数据分析

import pandas as pd  # Data analysis
import numpy as np #Data analysis
import seaborn as sns # Data visualization
import matplotlib.pyplot as plt # Data Visualization
import  matplotlib.gridspec as gridspec # subplots and grid
from wordcloud import WordCloud, STOPWORDS # Visualize text

import json
import folium # Map
import folium.plugins as plugins # Map
from mpl_toolkits.basemap import Basemap # Map

import warnings
warnings.filterwarnings('ignore')
import scipy.stats
import gc

# Plotting style and settings
plt.style.use('fivethirtyeight')  # plot style
#plt.style.use('bmh')
plt.rc('axes', labelsize=12)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
pd.options.display.max_rows = 100

# Script-safe form of the notebook magic `%matplotlib inline`: the bare magic
# is not valid Python, and get_ipython only exists under IPython/Jupyter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Kiva loan data
#path = 'file/'  # local file location
path = 'F:\\kaggleDataSet\\human-development\\'
loan = pd.read_csv(path + 'kiva_loans.csv')
mpi = pd.read_csv(path + 'kiva_mpi_region_locations.csv')
#loan_theme = pd.read_csv(path + 'loan_theme_ids.csv')
#loan_theme_region = pd.read_csv(path + 'loan_themes_by_region.csv')

# MPI
#mpi_world = pd.read_csv('file/MPI_national.csv')
#mpi_subnational = pd.read_csv('file/MPI_subnational.csv')

# HDI
hdi = pd.read_csv(path + 'HDI.csv')
continent_hdi = pd.read_csv(path + 'Continent_HDI.csv')
# Close the file deterministically and read it as UTF-8 (the GeoJSON
# encoding) rather than the platform default code page.
with open(path + 'countries.geojson', encoding='utf-8') as geojson_file:
    geo_world_data = json.load(geojson_file)
# Notebook previews: first rows and summary statistics of the loan and MPI tables.
loan.head()

loan.describe()

loan.describe(include=['O'])  # describe categorical (object) columns

mpi.head()

mpi.describe(include=['O'])  # describe categorical (object) columns

def _plot_amount_distribution(column, title_name, axis_label):
    """Draw three side-by-side views of one numeric column of ``loan``.

    Panels: the raw distribution, the distribution restricted to values
    strictly between the 1st and 99th percentile, and the sorted values
    plotted against their rank.

    column     -- column name in ``loan``
    title_name -- name used inside the panel titles
    axis_label -- x-axis label shared by the three panels
    """
    amounts = loan[column]
    fig, axes = plt.subplots(1, 3, figsize=(16, 6))

    # NOTE(review): distplot is deprecated in recent seaborn releases; kept
    # because the installed version cannot be confirmed from this file.
    sns.distplot(amounts, ax=axes[0])
    axes[0].set_title('Distribution of {}'.format(title_name))
    axes[0].set_xlabel(axis_label)

    lower, upper = np.percentile(amounts, [1, 99])
    trimmed = amounts[(lower < amounts) & (amounts < upper)]
    sns.distplot(trimmed, color='r', ax=axes[1])
    axes[1].set_title('Distribution of {} by removing outliers'.format(title_name))
    axes[1].set_xlabel(axis_label)

    axes[2].scatter(np.sort(amounts.values), np.arange(len(amounts)))
    axes[2].set_title('Distribution of {}'.format(title_name))
    axes[2].set_xlabel(axis_label)
    axes[2].set_ylabel('Index')
    plt.subplots_adjust(wspace=0.3)


_plot_amount_distribution('funded_amount', 'funded_amount', 'Funded Amount')
_plot_amount_distribution('loan_amount', 'Loan amount', 'Loan Amount')

# World map coloured by the number of loans listed per country code.
m = folium.Map(location=[0, 0], zoom_start=2)
# Count loan ids per country code. The original passed the set literal
# {'count', 'count'} as the aggregation spec, which only worked by accident.
loans_per_country = loan.groupby('country_code')['id'].count().reset_index(name='count')
# NOTE(review): Map.choropleth is the older folium API (folium.Choropleth in
# newer releases) — confirm against the installed folium version.
m.choropleth(geo_data=geo_world_data, data=loans_per_country,
             columns=['country_code', 'count'], key_on='feature.properties.wb_a2',
             name='Listed Country', fill_opacity=1, fill_color='YlOrBr',
             highlight=True, legend_name='Count')
folium.LayerControl().add_to(m)
m

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Left: the ten countries with the most listed loans.
country_counts = loan['country'].value_counts().head(10)
sns.barplot(x=country_counts.values, y=country_counts.index, palette='Wistia', ax=axes[0])
axes[0].set_title('Distribution of Top listed Countries')
axes[0].set_xlabel('Count')
for row, count in enumerate(country_counts.values):
    axes[0].text(.6, row, round(count, 2), fontsize=10, color='k')

# Right: the ten countries with the highest average loan amount.
# Select the column before mean() so only loan_amount is averaged; averaging
# the whole frame is wasteful and fails on non-numeric columns in pandas 2.
country_means = (loan.groupby('country')['loan_amount'].mean()
                 .sort_values(ascending=False).head(10))
sns.barplot(x=country_means.values, y=country_means.index, palette='cool', ax=axes[1])
axes[1].set_title('Distribution of Top Average loan amount by country')
axes[1].set_ylabel('')
axes[1].set_xlabel('Average Loan Amount')
for row, mean_amount in enumerate(country_means.values):
    axes[1].text(.6, row, round(mean_amount, 2), fontsize=10, color='k')

plt.subplots_adjust(wspace=0.5)

plt.figure(figsize=(16, 8))

# Countries ordered by descending average loan amount.
country_order = (loan.groupby('country')['loan_amount'].mean()
                 .sort_values(ascending=False).index)
# Both labels below promise log10, so use log10 rather than the natural log.
sns.boxplot(x=loan['country'], y=np.log10(loan['loan_amount']),
            palette='spring', order=country_order)
plt.xlabel('')
plt.ylabel('Loan amount ($log10$)')
plt.title('Boxplot of loan amount($log10$)')
plt.xticks(rotation=90)

# Raw loan amounts for two individual countries.
print("Cote D'Ivoire", loan[loan['country'] == "Cote D'Ivoire"]['loan_amount'])
print("Mauritania", loan[loan['country'] == "Mauritania"]['loan_amount'])

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Left: the ten regions with the most listed loans.
region_counts = loan['region'].value_counts().head(10)
sns.barplot(x=region_counts.values, y=region_counts.index, palette='Wistia', ax=axes[0])
axes[0].set_title('Distribution of Top listed Region')
axes[0].set_xlabel('Count')
for row, count in enumerate(region_counts.values):
    axes[0].text(.6, row, round(count, 2), fontsize=10, color='k')

# Right: the ten regions with the highest average loan amount
# (column selected before mean() so only loan_amount is aggregated).
region_means = (loan.groupby('region')['loan_amount'].mean()
                .sort_values(ascending=False).head(10))
sns.barplot(x=region_means.values, y=region_means.index, palette='cool', ax=axes[1])
axes[1].set_title('Distribution of Top Average loan amount by Region')
axes[1].set_ylabel('')
axes[1].set_xlabel('Average Loan Amount')
for row, mean_amount in enumerate(region_means.values):
    axes[1].text(.6, row, round(mean_amount, 2), fontsize=10, color='k')

plt.subplots_adjust(wspace=0.5)

plt.figure(figsize=(16, 8))

# Left: number of loans per sector.
plt.subplot2grid((1, 2), (0, 0))
sector_counts = loan['sector'].value_counts()
sns.barplot(x=sector_counts.values, y=sector_counts.index, palette='Wistia')
for row, count in enumerate(sector_counts.values):
    plt.text(.6, row, round(count, 2), fontsize=10, color='k')
plt.title('Distribution of listed sector')

# Right: average loan amount per sector, highest first
# (column selected before mean() so only loan_amount is aggregated).
plt.subplot2grid((1, 2), (0, 1))
sector_means = loan.groupby('sector')['loan_amount'].mean().sort_values(ascending=False)
sns.barplot(x=sector_means.values, y=sector_means.index, palette='cool')
plt.title('Distribution of Average loan amount by sector')
plt.xlabel('Average Loan Amount')
for row, mean_amount in enumerate(sector_means.values):
    plt.text(.6, row, round(mean_amount, 2), fontsize=10, color='k')

# Joy plot: one density of log(loan amount) per sector, stacked vertically.
# .copy() so the log transform does not write into a slice of `loan`.
tmp = loan[['loan_amount', 'sector']].copy()
tmp['loan_amount'] = np.log(tmp['loan_amount'])
# NOTE(review): `size=` (FacetGrid) and `shade=` / `bw=` (kdeplot) are the
# parameter names of older seaborn releases — confirm against the installed version.
g = sns.FacetGrid(tmp, row='sector', hue='sector', aspect=15, size=0.6)

# Draw the densities in a few steps: filled curve, white outline, baseline.
g.map(sns.kdeplot, "loan_amount", clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
g.map(sns.kdeplot, "loan_amount", clip_on=False, color="w", lw=2, bw=.2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)


def label(x, color, label):
    """Write the facet's label in its hue colour at the left of the current axes.

    Called through ``g.map``; ``x`` is unused, the text is placed in axes
    coordinates.
    """
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color, ha="left", va="center", transform=ax.transAxes)


g.map(label, "loan_amount")

# Set the subplots to overlap
g.fig.subplots_adjust(hspace=0)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.set(xlabel='loan amount (log)')
g.despine(bottom=True, left=True)
g.savefig('F:\\joy.png')

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Left: the ten most frequently listed activities.
activity_counts = loan['activity'].value_counts().head(10)
sns.barplot(x=activity_counts.values, y=activity_counts.index, palette='Wistia', ax=axes[0])
axes[0].set_title('Distribution of Top listed Activity')
axes[0].set_xlabel('Count')
for row, count in enumerate(activity_counts.values):
    axes[0].text(.6, row, round(count, 2), fontsize=10, color='k')

# Right: the ten activities with the highest average loan amount
# (column selected before mean() so only loan_amount is aggregated).
activity_means = (loan.groupby('activity')['loan_amount'].mean()
                  .sort_values(ascending=False).head(10))
sns.barplot(x=activity_means.values, y=activity_means.index, palette='cool', ax=axes[1])
axes[1].set_title('Distribution of Top Average loan amount by activity')
axes[1].set_ylabel('')
axes[1].set_xlabel('Average Loan Amount')
for row, mean_amount in enumerate(activity_means.values):
    axes[1].text(1, row, round(mean_amount, 2), fontsize=10, color='k')
plt.subplots_adjust(wspace=0.4)

plt.figure(figsize=(16, 8))

# Left: share of loans per repayment interval.
plt.subplot2grid((1, 2), (0, 0))
interval_counts = loan['repayment_interval'].value_counts()
plt.pie(interval_counts.values, labels=interval_counts.index, autopct='%1.1f%%',
        startangle=60, colors=sns.color_palette('cool', desat=.7))
plt.title('Distribution of listed repayment_interval')

# Right: average loan amount per repayment interval, highest first
# (column selected before mean() so only loan_amount is aggregated).
plt.subplot2grid((1, 2), (0, 1))
interval_means = (loan.groupby('repayment_interval')['loan_amount'].mean()
                  .sort_values(ascending=False))
sns.barplot(x=interval_means.values, y=interval_means.index, palette='Wistia')
plt.title('Distribution of Average loan amount by Repayment')
plt.xlabel('Average Loan Amount')
plt.ylabel('')
for row, mean_amount in enumerate(interval_means.values):
    plt.text(1, row, round(mean_amount, 2), fontsize=10, color='b')

# Joy plot: one density of log(loan amount) per repayment interval.
# .copy() so the log transform does not write into a slice of `loan`.
tmp = loan[['loan_amount', 'repayment_interval']].copy()
tmp['loan_amount'] = np.log(tmp['loan_amount'])
# NOTE(review): `size=` (FacetGrid) and `shade=` / `bw=` (kdeplot) are the
# parameter names of older seaborn releases — confirm against the installed version.
g = sns.FacetGrid(tmp, row='repayment_interval', hue='repayment_interval', aspect=15, size=0.6)

# Draw the densities in a few steps: filled curve, white outline, baseline.
g.map(sns.kdeplot, "loan_amount", clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
g.map(sns.kdeplot, "loan_amount", clip_on=False, color="w", lw=2, bw=.2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)


def label(x, color, label):
    """Write the facet's label in its hue colour at the left of the current axes.

    Called through ``g.map``; ``x`` is unused, the text is placed in axes
    coordinates.
    """
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color, ha="left", va="center", transform=ax.transAxes)


g.map(label, "loan_amount")

# Set the subplots to overlap
g.fig.subplots_adjust(hspace=0)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.set(xlabel='loan amount (log)')
g.despine(bottom=True, left=True)

plt.subplots_adjust(wspace=0.3)

# One panel per repayment interval: the ten countries with the highest
# average loan amount for that interval.
fig, grid = plt.subplots(2, 2, figsize=(16, 12))
panels = grid.ravel()
intervals = loan['repayment_interval'].unique()
# zip stops after the four available panels, so no aggregate is computed
# for an interval that would never be drawn.
for idx, (axis, interval) in enumerate(zip(panels, intervals)):
    subset = loan[loan['repayment_interval'] == interval]
    top_means = (subset.groupby('country')['loan_amount'].mean()
                 .sort_values(ascending=False).dropna().head(10))
    sns.barplot(x=top_means.values, y=top_means.index, ax=axis,
                palette=sns.color_palette('cool', n_colors=idx + 1))
    axis.set_title('Average loan amount for country by \n Repayment Interval: {}'.format(interval))
    axis.set_ylabel('')
    axis.set_xlabel('Average Loan amount')
    for row, mean_amount in enumerate(top_means.values):
        axis.text(1, row, round(mean_amount, 2), fontsize=10, color='k')
plt.subplots_adjust(wspace=0.4, hspace=0.3)

# The twenty most common loan terms (in months).
plt.figure(figsize=(16, 6))
term_counts = loan['term_in_months'].value_counts().iloc[:20]
sns.barplot(y=term_counts.values, x=term_counts.index, palette='cool', order=term_counts.index)
plt.xticks(rotation=90)
plt.xlabel('Month')
plt.ylabel('Count')
plt.title('Distribution of terms')

# The twenty most common lender counts.
plt.figure(figsize=(16, 6))
lender_counts = loan['lender_count'].value_counts().iloc[:20]
sns.barplot(y=lender_counts.values, x=lender_counts.index, palette='Wistia', order=lender_counts.index)
plt.xticks(rotation=90)
plt.xlabel('Lender Count')
plt.ylabel('Count')  # added to match the terms chart above
plt.title('Distribution of Lender count ')

# Lender count distribution: raw (left) and restricted to values strictly
# between the 1st and 99th percentile (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
lenders = loan['lender_count']
# NOTE(review): distplot is deprecated in recent seaborn releases; kept
# because the installed version cannot be confirmed from this file.
sns.distplot(lenders, ax=axes[0])
axes[0].set_title('Distribution of lender_count')

lower, upper = np.percentile(lenders, [1, 99])
trimmed = lenders[(lower < lenders) & (lenders < upper)]
sns.distplot(trimmed, color='r', ax=axes[1])
axes[1].set_title('Distribution of lender_count by removing outliers')

def _show_word_cloud(column, title):
    """Render a word cloud built from the text in ``loan[column]``.

    Missing values are dropped first; stringifying them would feed the
    literal word "nan" into the cloud.
    """
    text = " ".join(loan[column].dropna().astype(str))
    cloud = WordCloud(height=1000, width=1600, stopwords=STOPWORDS,
                      max_words=1000, background_color='white').generate(text)
    plt.figure(figsize=(16, 10))
    plt.imshow(cloud)
    plt.axis('off')
    plt.title(title)


# use
_show_word_cloud('use', 'Loan amount usage')

# The ten most frequent "use" descriptions.
plt.figure(figsize=(16, 10))
use_counts = loan['use'].value_counts().head(10)
sns.barplot(x=use_counts.values, y=use_counts.index, palette='Wistia')
plt.title('Distribution of listed Use of Loan amount')
plt.xlabel('Count')  # the bars are frequencies, not loan amounts
for row, count in enumerate(use_counts.values):
    plt.text(.6, row, round(count, 2), fontsize=10, color='k')
# The original also set labelsize=20 on every loop pass and then overwrote it
# here, so 10 was the only value that ever took effect; kept so later figures
# see the same setting.
plt.rc('ytick', labelsize=10)

# tags
_show_word_cloud('tags', 'Loan amount Tags')

# Each loan lists its borrowers as a comma-separated gender string.
# Flatten to one entry per borrower; missing values are dropped rather than
# counted as a "nan" gender.
gender = ",".join(loan['borrower_genders'].dropna().astype(str).str.replace(' ', ''))
cnt = pd.Series(gender.split(',')).value_counts()

fig, axes = plt.subplots(1, 2, figsize=(16, 8))
axes[0].pie(cnt.values, labels=cnt.index, autopct='%0.1f%%')
axes[0].set_title('Borrower Gender')

# Five most common gender combinations per loan, as a share of all loans.
combo_share = loan['borrower_genders'].value_counts().head(5) * 100 / loan.shape[0]
# Explicit ax: the original relied on axes[1] happening to be the current axes.
sns.barplot(x=combo_share.values, y=combo_share.index, palette='summer', ax=axes[1])
# Title and label now describe this chart (the originals were copied from the
# "use of loan" chart).
axes[1].set_title('Top borrower gender combinations per loan')
axes[1].set_xlabel('Share of loans (%)')
for row, pct in enumerate(combo_share.values):
    axes[1].text(1, row, '{} %'.format(round(pct, 2)), fontsize=12)
plt.subplots_adjust(wspace=0.4)

# Loans per (borrower gender string, repayment interval).
pair_counts = (loan.groupby(['borrower_genders', 'repayment_interval'])['id']
               .count().reset_index(name='count'))
# Anything other than a single 'female' or 'male' borrower is a group loan.
# One .loc call: the original chained indexing may assign to a temporary
# copy and leave the frame unchanged.
is_single = pair_counts['borrower_genders'].isin(['female', 'male'])
pair_counts.loc[~is_single, 'borrower_genders'] = 'Group'

plt.figure(figsize=(16, 4))
cnt = pair_counts.groupby(['borrower_genders', 'repayment_interval'])['count'].sum().reset_index()
cnt['count'] = cnt['count'] * 100 / cnt['count'].sum()
sns.barplot(y=cnt['count'], x=cnt['repayment_interval'], hue=cnt['borrower_genders'], palette='rainbow')
plt.title('Repayment interval by Gender %')
plt.ylabel('%')

# Parse the timestamp columns in place.
for time_column in ['date', 'disbursed_time', 'funded_time', 'posted_time']:
    loan[time_column] = pd.to_datetime(loan[time_column])
loan_ts = loan.set_index('date')

# Monthly totals of requested vs. funded amount, by posting time.
plt.figure(figsize=(16, 6))
by_posted = loan.set_index('posted_time')  # index once, reuse for both series
by_posted['loan_amount'].resample('M').sum().plot()
by_posted['funded_amount'].resample('M').sum().plot()
plt.legend()

# Ten most common activities in each of four sectors, one panel per sector.
plt.figure(figsize=(16, 10))
sector_panels = [
    ((0, 0), 'Agriculture', 'Wistia'),
    ((0, 1), 'Food', 'cool'),
    ((1, 0), 'Retail', 'cool'),
    ((1, 1), 'Entertainment', 'magma'),
]
for cell, sector, palette in sector_panels:
    plt.subplot2grid((2, 2), cell)
    activity_counts = loan.loc[loan['sector'] == sector, 'activity'].value_counts().head(10)
    sns.barplot(x=activity_counts.values, y=activity_counts.index, palette=palette)
    plt.ylabel('Activity')
    plt.xlabel('Count')
    plt.title('"{}" Sector'.format(sector))
    for row, count in enumerate(activity_counts.values):
        plt.text(.6, row, round(count, 2), fontsize=10, color='k')

plt.subplots_adjust(hspace=0.4, wspace=0.5)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left: number of MPI records per world region.
region_counts = mpi['world_region'].value_counts()
sns.barplot(x=region_counts.values, y=region_counts.index,
            palette=sns.color_palette('Wistia'), ax=axes[0])
axes[0].set_title('Distribtution of MPI by world region')
axes[0].set_xlabel('Count')
for row, count in enumerate(region_counts.values):
    axes[0].text(.6, row, round(count, 2), fontsize=10, color='k')

# Right: average MPI per world region, highest first.
region_mpi = mpi.groupby('world_region')['MPI'].mean().dropna().sort_values(ascending=False)
sns.barplot(x=region_mpi.values, y=region_mpi.index,
            palette=sns.color_palette('cool'), ax=axes[1])
axes[1].set_xlabel('Average MPI')
axes[1].set_title('Average MPI by world region')
# Annotate with the averages being plotted; the original looped over the
# record counts from the left panel.
for row, mean_mpi in enumerate(region_mpi.values):
    axes[1].text(0, row, round(mean_mpi, 2), fontsize=10, color='k')
plt.subplots_adjust(wspace=0.6)

# One panel per world region: the ten countries with the highest average MPI.
fig, grid = plt.subplots(2, 3, figsize=(16, 12))
panels = grid.ravel()
regions = mpi['world_region'].unique()
# zip stops after the six available panels.
for idx, (axis, region) in enumerate(zip(panels, regions)):
    in_region = mpi[mpi['world_region'] == region]
    top_mpi = (in_region.groupby('country')['MPI'].mean()
               .sort_values(ascending=False).dropna().head(10))
    sns.barplot(x=top_mpi.values, y=top_mpi.index, ax=axis,
                palette=sns.color_palette('cool', n_colors=idx + 1))
    axis.set_title('Region: \n {}'.format(region))
    axis.set_xlabel('Average MPI')
    axis.set_ylabel('')
    for row, mean_mpi in enumerate(top_mpi.values):
        axis.text(0, row, round(mean_mpi, 2), fontsize=10, color='k')

plt.subplots_adjust(wspace=0.5, hspace=0.3)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left: the ten countries with the highest average MPI.
country_mpi = (mpi.groupby('country')['MPI'].mean()
               .dropna().sort_values(ascending=False).head(10))
sns.barplot(x=country_mpi.values, y=country_mpi.index, palette='Wistia', ax=axes[0])
axes[0].set_title('Distribtution of MPI by country')
axes[0].set_xlabel('Average MPI')
for row, mean_mpi in enumerate(country_mpi.values):
    axes[0].text(0, row, round(mean_mpi, 2), fontsize=10, color='k')

# Right: the ten locations with the highest average MPI.
location_mpi = (mpi.groupby('LocationName')['MPI'].mean()
                .dropna().sort_values(ascending=False).head(10))
sns.barplot(x=location_mpi.values, y=location_mpi.index, palette='cool', ax=axes[1])
for row, mean_mpi in enumerate(location_mpi.values):
    axes[1].text(0, row, round(mean_mpi, 2), fontsize=10, color='k')

axes[1].set_title('Average MPI by Location Name')
# The original set this label on axes[0] a second time, leaving the right
# panel without an x label.
axes[1].set_xlabel('Average MPI')
plt.subplots_adjust(wspace=0.6)

# MPI: scatter every MPI location on a world map, coloured by its MPI value.
plt.figure(figsize=(16, 10))
m = Basemap(projection='cyl', resolution='c')
m.drawcoastlines(linewidth=0.1, color="white")
m.fillcontinents(color='#f2f2f2', lake_color='#46bcec')
m.drawmapboundary(fill_color='#A6CAE0', linewidth=0.1)
#m.bluemarble(alpha=0.4)
m.shadedrelief()

values = mpi['MPI']
# Calling the Basemap instance converts lon/lat into map x/y coordinates.
map_x, map_y = m(mpi['lon'], mpi['lat'])
m.scatter(map_x, map_y, c=values, zorder=20, cmap='hot_r')
m.colorbar()
plt.title('Distribution of MPI')
plt.show()
m
gc.collect()

# http://nbviewer.jupyter.org/github/python-visualization/folium/blob/master/examples/MarkerCluster.ipynb
# Clustered markers for every MPI location that has coordinates, region and MPI.
loc = mpi[['lon', 'lat', 'region', 'MPI']].dropna()
m1 = folium.Map(location=[0, 0], zoom_start=2)

locations = list(zip(loc['lat'], loc['lon']))
# Build the popups from the same filtered rows as `locations`; the original
# read the unfiltered `mpi`, so popup text drifted off its marker as soon as
# a row had been dropped.
popups = ['lat: {} lon: {} <br> MPI: {}'.format(round(lat, 2), round(lon, 2), mpi_value)
          for lat, lon, mpi_value in zip(loc['lat'], loc['lon'], loc['MPI'])]

marker = plugins.MarkerCluster(locations, popups=popups)
marker.add_to(m1)
m1

gc.collect()
hdi.head()

continent_hdi.head()

# Restrict the HDI table to countries that appear in the Kiva loans.
kiva_country = loan['country'].unique()
len(kiva_country)
# isin is the vectorised membership test (replaces a per-row lambda).
kiva_hdi = hdi[hdi['Country'].isin(kiva_country)]
kiva_hdi['Country'].isin(kiva_country)

# World map coloured by each country's HDI.
m = folium.Map(location=[0, 0], zoom_start=2)

# NOTE(review): Map.choropleth is the older folium API (folium.Choropleth in
# newer releases) — confirm against the installed folium version.
m.choropleth(geo_data=geo_world_data, data=hdi, columns=['Country', 'HDI'],
             key_on='feature.properties.name', name='HDI', fill_opacity=1,
             fill_color='GnBu', highlight=True, legend_name='HDI')
folium.LayerControl().add_to(m)
m

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
hdi_by_country = hdi[['HDI', 'Country']]

# Left: the ten countries with the lowest HDI.
lowest = hdi_by_country.sort_values(by='HDI').head(10)
sns.barplot(x=lowest['HDI'], y=lowest['Country'], palette='cool', ax=axes[0])
axes[0].set_title('Bottom 10 country by HDI')
for row, score in enumerate(lowest['HDI']):
    axes[0].text(0, row, round(score, 2), fontsize=10, color='k')

# Right: the ten countries with the highest HDI.
highest = hdi_by_country.sort_values(by='HDI', ascending=False).head(10)
sns.barplot(x=highest['HDI'], y=highest['Country'], palette='Wistia', ax=axes[1])
axes[1].set_title('Top 10 country by HDI')
for row, score in enumerate(highest['HDI']):
    axes[1].text(0, row, round(score, 2), fontsize=10, color='k')

def _plot_kde_with_means(columns, title):
    """Overlay a KDE curve for each ``hdi`` column and mark each column mean.

    Every column gets its own colour from the default cycle; its mean is
    drawn as a vertical line in the same colour and printed.
    """
    fig, axis = plt.subplots(figsize=(16, 6))
    for idx, column in enumerate(columns):
        colour = 'C{}'.format(idx)
        hdi[column].plot(kind='kde', ax=axis, color=colour)
        mean = hdi[column].mean()
        axis.axvline(mean, c=colour)
        print('Mean value of {}: {}'.format(column, mean))
    axis.legend()  # once, after every curve has been drawn
    plt.title(title)


### col = hdi.columns[hdi.columns.str.contains('HDI')]
_plot_kde_with_means(
    ['HDI', 'Human Development Index (HDI) Female', 'Human Development Index (HDI) Male'],
    'Human Development Index (HDI)')
#plt.savefig('hdi.png')

# HDI growth and HDI per human development group.
fig, axis = plt.subplots(figsize=(16, 6))
growth_columns = ['Average annual HDI growth 1990-2000', 'Average annual HDI growth 2000-2010',
                  'Average annual HDI growth 2010-2015', 'Average annual HDI growth 1990-2015', 'HDI']
continent_hdi[growth_columns].plot(ax=axis)
# One tick per row. The original hard-coded 14 here and 15 in the later
# charts of the same table.
plt.xticks(np.arange(len(continent_hdi)), continent_hdi['Human development groups'], rotation=90)

_plot_kde_with_means(hdi.columns[hdi.columns.str.startswith('Life expectancy')],
                     'Life expectancy')

_plot_kde_with_means(hdi.columns[hdi.columns.str.startswith('Mean years')],
                     'Mean value of Schooling')

# Charts over continent_hdi, one x tick per human development group.
group_labels = continent_hdi['Human development groups']
# Derived from the table instead of a hard-coded 15.
group_ticks = np.arange(len(continent_hdi))

# Mean years of schooling per group.
fig, axis = plt.subplots(figsize=(16, 6))
schooling_columns = continent_hdi.columns[continent_hdi.columns.str.startswith('Mean years')]
continent_hdi[schooling_columns].plot(ax=axis, kind='bar')
plt.xticks(group_ticks, group_labels, rotation=90)

# Share of parliament seats held by women per group, value written on each bar.
fig, axis = plt.subplots(figsize=(16, 6))
women_seats = continent_hdi['Share of seats in parliament (% held by women)']
women_seats.plot(kind='bar', ax=axis)
plt.xticks(group_ticks, group_labels, rotation=90)
for position, share in enumerate(women_seats):
    plt.text(position, 2, round(share, 2), fontsize=12, rotation=90)

# Population by age band, total population and population growth per group.
fig, grid = plt.subplots(3, 1, figsize=(16, 6), sharex=True)
panels = grid.ravel()
age_columns = ['Population Ages 15–64 (millions) 2015', 'Population Under age 5 (millions) 2015',
               'Population Ages 65 and older (millions) 2015']
continent_hdi[age_columns].plot(ax=panels[0], kind='line')
panels[0].set_title('Population by Age')

total_columns = ['Total Population (millions) 2015', 'Total Population (millions) 2030']
continent_hdi[total_columns].plot(ax=panels[1], kind='line')
panels[1].set_title('Total Population')

growth_columns = ['Population Average annual growth 2000/2005 (%) ',
                  'Population Average annual growth 2010/2015 (%) ']
continent_hdi[growth_columns].plot(ax=panels[2], kind='line')
panels[2].set_title('Population Growth %')
plt.xticks(group_ticks, group_labels, rotation=90)

def _plot_bottom_and_top(column, bottom_title, top_title):
    """Bar charts of the ten lowest (left) and ten highest (right) countries.

    column       -- numeric column of ``hdi`` to rank by
    bottom_title -- title of the left (lowest values) panel
    top_title    -- title of the right (highest values) panel
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    ranked = hdi[[column, 'Country']]
    panels = [
        (axes[0], ranked.sort_values(by=column).head(10), 'cool', bottom_title),
        (axes[1], ranked.sort_values(by=column, ascending=False).head(10), 'Wistia', top_title),
    ]
    for axis, rows, palette, title in panels:
        sns.barplot(x=rows[column], y=rows['Country'], palette=palette, ax=axis)
        axis.set_title(title)
        for row, value in enumerate(rows[column]):
            axis.text(0, row, round(value, 2), fontsize=10, color='k')


_plot_bottom_and_top('Employment in agriculture (% of total employment) 2010-2014',
                     'Bottom 10 country Employed in agriculture',
                     'Top 10 country Employed in agriculture')

_plot_bottom_and_top('Total Unemployment (% of labour force) 2015',
                     'Bottom 10 country by Unemployment',
                     'Top 10 country by Unemployed')

# World map coloured by each country's income inequality.
m = folium.Map(location=[0, 0], zoom_start=2)

# NOTE(review): Map.choropleth is the older folium API (folium.Choropleth in
# newer releases) — confirm against the installed folium version.
m.choropleth(geo_data=geo_world_data, data=hdi,
             columns=['Country', 'Inequality in income (%)'],
             key_on='feature.properties.name', name='Inequality in income (%)',
             fill_opacity=1, fill_color='GnBu', highlight=True,
             legend_name='Inequality in income (%)')
folium.LayerControl().add_to(m)
m

原文地址:https://www.cnblogs.com/tszr/p/11240220.html

时间: 2024-10-31 16:48:33

吴裕雄--天生自然 PYTHON数据分析:人类发展报告——HDI, GDI,健康,全球人口数据数据分析的相关文章

吴裕雄--天生自然python Google深度学习框架:Tensorflow实现迁移学习

import glob import os.path import numpy as np import tensorflow as tf from tensorflow.python.platform import gfile import tensorflow.contrib.slim as slim # 加载通过TensorFlow-Slim定义好的inception_v3模型. import tensorflow.contrib.slim.python.slim.nets.incepti

吴裕雄--天生自然 PYTHON数据分析:糖尿病视网膜病变数据分析(完整版)

# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np

吴裕雄--天生自然 PYTHON语言数据分析:ESA的火星快车操作数据集分析

import os import numpy as np import pandas as pd from datetime import datetime import matplotlib import matplotlib.pyplot as plt import seaborn as sns sns.set_style('white') %matplotlib inline %load_ext autoreload %autoreload 2 def to_utms(ut): retur

吴裕雄--天生自然python机器学习:决策树算法

我们经常使用决策树处理分类问题,近来的调查表明决策树也是最经常使用的数据挖掘算法. 它之所以如此流行,一个很重要的原因就是使用者基本上不用了解机器学习算法,也不用深究它是如何工作的. K-近邻算法可以完成很多分类任务,但是它最大的缺点就是无法给出数据的内在含义,决策树的主要优势就在于数据形式非常容易理解. 决策树很多任务都是为了数据中所蕴含的知识信息,因此决策树可以使用不熟悉的数据集合,并从中提取出一系列规则,机器学习算法最终将使用这些机器从数据集中创造的规则.专家系统中经常使用决策树,

吴裕雄--天生自然python编程:turtle模块绘图(3)

turtle(海龟)是Python重要的标准库之一,它能够进行基本的图形绘制.turtle图形绘制的概念诞生于1969年,成功应用于LOGO编程语言. turtle库绘制图形有一个基本框架:一个小海龟在坐标系中爬行,其爬行轨迹形成了绘制图形.刚开始绘制时,小海龟位于画布正中央,此处坐标为(0,0),前进方向为水平右方. Python——turtle库 turtle库包含100多个功能函数,主要包括窗体函数.画笔状态函数和画笔运动函数3类. 画笔运动函数 turtle通过一组函数控制画笔的行进动作

吴裕雄--天生自然python编程:正则表达式

re.match函数 re.match 尝试从字符串的起始位置匹配一个模式,如果不是起始位置匹配成功的话,match()就返回none. 函数语法: re.match(pattern, string, flags=0) 函数参数说明: 参数 描述 pattern 匹配的正则表达式 string 要匹配的字符串. flags 标志位,用于控制正则表达式的匹配方式,如:是否区分大小写,多行匹配等等. 匹配成功re.match方法返回一个匹配的对象,否则返回None. 我们可以使用group(num)

吴裕雄--天生自然python机器学习:朴素贝叶斯算法

分类器有时会产生错误结果,这时可以要求分类器给出一个最优的类别猜测结果,同 时给出这个猜测的概率估计值. 概率论是许多机器学习算法的基础 在计算 特征值取某个值的概率时涉及了一些概率知识,在那里我们先统计特征在数据集中取某个特定值 的次数,然后除以数据集的实例总数,就得到了特征取该值的概率. 首先从一个最简单的概率分类器开始,然后给 出一些假设来学习朴素贝叶斯分类器.我们称之为“朴素”,是因为整个形式化过程只做最原始.最简单的假设. 基于贝叶斯决策理论的分类方法 朴素贝叶斯是贝叶斯决策理论的一部

吴裕雄--天生自然python机器学习:支持向量机SVM

基于最大间隔分隔数据 import matplotlib import matplotlib.pyplot as plt from numpy import * xcord0 = [] ycord0 = [] xcord1 = [] ycord1 = [] markers =[] colors =[] fr = open('F:\\machinelearninginaction\\Ch06\\testSet.txt')#this file was generated by 2normalGen.

吴裕雄--天生自然python机器学习:使用K-近邻算法改进约会网站的配对效果

在约会网站使用K-近邻算法 准备数据:从文本文件中解析数据 海伦收集约会数据已经有了一段时间,她把这些数据存放在文本文件datingTestSet.txt中,每个样本数据占据一行,总共有1000行.海伦的样本主要包含以下3种特征: 每年获得的飞行常客里程数 玩视频游戏所耗时间百分比 每周消费的冰淇淋公升数 将文本记录到转换NumPy的解析程序 import operator from numpy import * from os import listdir def file2matrix(filenam