《Python机器学习kaggle案例》-- 网易云课堂

https://study.163.com/course/courseMain.htm?courseId=1003551009

LinearRegression
# -*- coding: utf-8 -*-
"""
Titanic survival prediction: LinearRegression with 3-fold cross-validation.

Reads the Kaggle Titanic training set from ``train.csv``, imputes/encodes
the basic features, fits a linear regression per fold, and thresholds the
continuous predictions at 0.5 to obtain 0/1 survival labels.

Created on Sat Dec  1 09:24:27 2018

@author: zh
"""

import pandas as pd
import numpy as np

titanic = pd.read_csv('train.csv')

# Impute missing ages with the column median.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Encode sex as an integer: male -> 0, female -> 1.
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Missing embarkation ports default to 'S' (the most common port),
# then encode S/C/Q as 0/1/2.
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2

from sklearn.linear_model import LinearRegression
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# the replacement KFold lives in sklearn.model_selection and takes
# n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic):
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic['Survived'].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)
# Threshold the regression output at 0.5 to get binary survival labels.
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
accuracy = sum(predictions == titanic['Survived']) / len(predictions)

#accuracy = 0.7833894500561167

LogisticRegression

# -*- coding: utf-8 -*-
"""
Titanic survival prediction: LogisticRegression scored with 3-fold CV.

Same preprocessing as the linear-regression variant; accuracy is the
mean of cross_val_score over three folds.

Created on Sat Dec  1 09:34:55 2018

@author: zh
"""

import pandas as pd
import numpy as np

titanic = pd.read_csv('train.csv')

# Impute missing ages with the column median.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Encode sex as an integer: male -> 0, female -> 1.
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Missing embarkation ports default to 'S', then encode S/C/Q as 0/1/2.
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
accuracy = scores.mean()

#accuracy = 0.7878787878787877

RandomForestClassifier

# -*- coding: utf-8 -*-
"""
Titanic survival prediction: RandomForestClassifier with 3-fold CV and a
brute-force grid search over n_estimators / min_samples_split /
min_samples_leaf.

Created on Sat Dec  1 09:37:31 2018

@author: zh
"""
import pandas as pd
import numpy as np

titanic = pd.read_csv('train.csv')

# Impute missing ages with the column median.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Encode sex as an integer: male -> 0, female -> 1.
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Missing embarkation ports default to 'S', then encode S/C/Q as 0/1/2.
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# KFold/cross_val_score now live in sklearn.model_selection, and KFold
# takes n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
accuracy = scores.mean()
#accuracy = 0.7856341189674523

# Larger forest with more conservative splits generalizes better here.
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
accuracy = scores.mean()
#accuracy = 0.8159371492704826

# Exhaustive grid search; print each hyper-parameter triple that improves
# on the best CV accuracy seen so far.  The splitter is deterministic, so
# it can be built once outside the loops.
max_acc = 0
kf = KFold(n_splits=3)
for n_estimators in range(1, 60, 10):
    for min_samples_split in range(2, 10):
        for min_samples_leaf in range(1, 10):
            alg = RandomForestClassifier(random_state=1, n_estimators=n_estimators,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf)
            scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
            accuracy = scores.mean()
            if accuracy > max_acc:
                print(n_estimators, min_samples_split, min_samples_leaf)
                max_acc = accuracy
print(max_acc)
#max_acc = 0.8316498316498316

feature_selection

# -*- coding: utf-8 -*-
"""
Titanic survival prediction: engineered features (FamilySize, NameLength,
Title), univariate feature scoring with SelectKBest/f_classif, then a
RandomForestClassifier on the four strongest predictors.

Created on Sat Dec  1 09:52:38 2018

@author: zh
"""

import pandas as pd
import numpy as np

titanic = pd.read_csv('train.csv')

# Impute missing ages with the column median.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Encode sex as an integer: male -> 0, female -> 1.
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Missing embarkation ports default to 'S', then encode S/C/Q as 0/1/2.
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2

# Engineered features: household size and length of the passenger's name.
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))

import re


def get_title(name):
    """Extract the honorific title (e.g. 'Mr', 'Mrs') from a passenger name.

    Titles appear as a space-prefixed word terminated by a period, e.g.
    "Braund, Mr. Owen Harris" -> "Mr".  Returns '' when no title is found.
    """
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''


titles = titanic['Name'].apply(get_title)
#pd.value_counts(titles)

# Map each known title to a small integer code.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6,
                 'Col': 7, 'Major': 8, 'Mlle': 9, 'Capt': 10, 'Ms': 11,
                 'Jonkheer': 12, 'Don': 13, 'Sir': 14, 'Countess': 15,
                 'Lady': 16, 'Mme': 17}
for k, v in title_mapping.items():
    titles[titles == k] = v
#pd.value_counts(titles)
titanic['Title'] = titles

from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'FamilySize', 'Title', 'NameLength']

# Univariate ANOVA F-test per feature; plot -log10(p) so taller bars mean
# stronger association with survival.
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# KFold/cross_val_score now live in sklearn.model_selection.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Keep only the four strongest predictors from the plot above.
predictors = ['Pclass', 'Sex', 'Fare', 'Title']

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
accuracy = scores.mean()

#accuracy=0.8114478114478114

GradientBoostingClassifier

# -*- coding: utf-8 -*-
"""
Titanic survival prediction: weighted ensemble of GradientBoostingClassifier
(weight 3) and LogisticRegression (weight 1), evaluated with manual 3-fold
cross-validation on the class-1 probabilities.

Created on Sat Dec  1 09:52:38 2018

@author: zh
"""
import pandas as pd
import numpy as np

titanic = pd.read_csv('train.csv')

# Impute missing ages with the column median.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Encode sex as an integer: male -> 0, female -> 1.
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Missing embarkation ports default to 'S', then encode S/C/Q as 0/1/2.
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2

# Engineered features: household size and length of the passenger's name.
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))

import re


def get_title(name):
    """Extract the honorific title (e.g. 'Mr', 'Mrs') from a passenger name.

    Titles appear as a space-prefixed word terminated by a period, e.g.
    "Braund, Mr. Owen Harris" -> "Mr".  Returns '' when no title is found.
    """
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''


titles = titanic['Name'].apply(get_title)
#pd.value_counts(titles)

# Map each known title to a small integer code.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6,
                 'Col': 7, 'Major': 8, 'Mlle': 9, 'Capt': 10, 'Ms': 11,
                 'Jonkheer': 12, 'Don': 13, 'Sir': 14, 'Countess': 15,
                 'Lady': 16, 'Mme': 17}
for k, v in title_mapping.items():
    titles[titles == k] = v
#pd.value_counts(titles)
titanic['Title'] = titles

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# the replacement KFold lives in sklearn.model_selection and takes
# n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold

# Each entry pairs an estimator with the feature columns it trains on.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']],
    [LogisticRegression(random_state=1),
     ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']],
]
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic):
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Probability of class 1 (survived) for the held-out fold.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Weighted average: gradient boosting counts 3x the logistic model.
    test_predictions = (full_test_predictions[0] * 3 + full_test_predictions[1]) / 4
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > 0.5] = 1
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic['Survived']) / len(predictions)

#accuracy=0.8204264870931538

原文地址:https://www.cnblogs.com/LearnFromNow/p/10048373.html

时间: 2024-10-09 06:52:11

《Python机器学习kaggle案例》-- 网易云课堂的相关文章

?《Python自动化测试修炼宝典》线上课程已经成功入驻网易云课堂......

<Python自动化测试修炼宝典>线上课程已经成功入驻网易云课堂...... IT测试老兵利用工作之余,亲自录制的<Python自动化测试修炼宝典>线上课程已经成功入驻网易云课堂,想要提高测试技术的小伙伴可以线上购买课程进行学习. 课程背景 测试人员进阶实战课程.本套课程以作者多年测试实战经验为背景,结合大量测试案例深入浅出讲解目前主流web端.app端自动化测试框架以及使用Python如何从0到1实现接口测试框架的搭建. 课程特色 系统教学+实战案例+开放源码.涵盖Python3

网易云课堂-软件工程(C编码实践篇)

网易云课堂-软件工程(C编程实践篇) 识别代码工程质量: 代码风格: 封装接口: 可重用代码: 可重入函数和线程安全: 代码风格的原则:简明.易读.无二义性:代码风格原则在团队交流过程中非常重要.

聊一聊【网易云课堂】

(首先说明,这不是广告,知识我个人的一点感受而已) 若干天之前注册了网易云课堂,并填写.邮寄申请讲师的材料. 若干天之后的今天,终于收到通知,我通过了讲师认证,给我名字上加了个大大的"V".以及<petshop4.0源码解读>教程通过了审核,正式发布了.下班之前看了看,竟然已经有6名学习者了,就是不知道他们有没有真的学.有想学的直接去那里看就行了,免费的. 课堂教程的网址是:http://study.163.com/course/introduction/655003.ht

网易云课堂_C++程序设计入门(上)_第2单元:丹青画松石– EGE图形库_第2节:一个简单的EGE程序

网易云课堂_C++程序设计入门(上)_第2单元:丹青画松石– EGE图形库_第2节:一个简单的EGE程序 #ifndef _GRAPHICS_H_ #define _GRAPHICS_H_ #ifndef __cplusplus #error You must use C++ compiler, or you need filename with '.cpp' suffix #endif #include "ege.h" using namespace ege; #endif #inc

网易云课堂程序设计入门--C语言第七周:指针与字符串学习笔记

====================================== 第七周:指针与字符串学习笔记 网易云课堂 程序设计入门–C语言 指针与字符串 学习笔记 ====================================== 7-1指针初步 7-1-1类型大小与取地址运算 sizeof是一个运算符 给出某个变量货类型在内存中所占据的字节数 sizeof(int); sizeif(i); double变量在内存中占据的空间是int变量的两倍 { int a; a =6; prin

Python实例之抓取网易云课堂搜索数据(post方式json型数据)并保存为TXT

本实例实现了抓取网易云课堂中以'java'为关键字的搜索结果,经详细查看请求的方式为post,请求的结果为JSON数据 具体实现代码如下: import requests import json finalstr = '' #初始化字符串 totlePage = 0 #初始化总页数 test = 0 #初始化数据总条数 url = 'http://study.163.com/p/search/studycourse.json' headers = {'content-type': 'applic

Android 网易云课堂第一周笔记16/5/10

Android 网易云课堂第一周笔记 首先第一周的主要任务回顾,第一是Android环境的搭建和Android studio软件的安装使用.第二是android的hello word项目的构建,其他的项目的功能包括button的功能基本使用,比如页面的跳转,Toast,Intent等等,还有我认为最为重要的是Activity的生命活动的讲解,虽然老师在课堂上只是简单展示一下生命周期的几个内容,但我认为如果想玩好Activity的话,生命周期这一块一定要理解和掌握的,废话不多说直接进入正题. An

《RabbitMQ 实战》 -- 网易云课堂

安装 RabbitMQ解决的问题     实例 RabbitMQ通信信道 交换器类型 以上 原文地址:https://www.cnblogs.com/LearnFromNow/p/10011556.html

【网易云课堂】【专知团队】深度学习:算法到实战——绪论

概述 人工智能很火,各国出台政策  有趣的图灵测试 人工智能包括——计算智能(“深蓝”对战卡斯帕罗夫,国际象棋,暴力搜索,本质是计算).感知智能(能听会说,能看会认,类似于人的视觉.听觉.触觉等感知能力 ).认知智能(逻辑推理.知识理解.决策思考,概念.意识.观念都是认知智能的表现 ) 原文地址:https://www.cnblogs.com/dapeng-bupt/p/11600932.html

网易云课堂资源合集百度云分享 下载

2019康复职称考试[全套四门] 资源 百度云 康复教育网校Python-机器学习-进阶实战 资源 百度云 唐宇迪全民一起VBA提高篇(Excel数据处理) 资源 百度云 杨洋老师全网稀缺好课--徐sir的PS超神课 资源 百度云 PS东方站经济学人的动态图表与交互设计 资源 百度云 刘万祥ExcelPro19中级师康复医学治疗技术考试 资源 百度云 康复医学网校PS+AI+ID平面设计入门精通必修课 资源 百度云 顾领中Excel Power Query教程_数据整理 资源 百度云 Power