Wu Yuxiong -- Tiansheng Ziran: Python Data Analysis: ESA Mars Express Operations Dataset Analysis

import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")

%matplotlib inline

%load_ext autoreload
%autoreload 2
def to_utms(ut):
    # Convert a DatetimeIndex (nanosecond precision) to Unix time in milliseconds.
    return (ut.astype(np.int64) * 1e-6).astype(int)
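
to_utms is defined but never called below; a minimal usage sketch, assuming a hypothetical predictions frame indexed by timestamps, showing how it would restore the dataset's ut_ms millisecond column:

# Hypothetical example: predictions_df and its column are assumptions, not part of the dataset.
predictions_df = pd.DataFrame({"power": [0.1, 0.2]},
                              index=pd.to_datetime(["2015-01-01 00:00", "2015-01-01 01:00"]))
predictions_df.insert(0, "ut_ms", to_utms(predictions_df.index))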

def read_merged_train_and_test_data(file_name):
    src_path = "F:\\kaggleDataSet\\hackathon-krakow\\hackathon-krakow-2017-05-27"
    train_path = os.path.join(src_path, "context--2014-01-01_2015-01-01--" + file_name + ".csv")
    train_df = pd.read_csv(train_path)
    test_path = os.path.join(src_path, "context--2015-01-01_2015-07-01--" + file_name + ".csv")
    test_df = pd.read_csv(test_path)
    df = pd.concat([train_df, test_df])

    return convert_timestamp_to_date(df)

def convert_timestamp_to_date(df, timestamp_column="ut_ms"):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], unit="ms")
    df = df.set_index(timestamp_column)
    df = df.dropna()
    return df

def parse_subsystems(dmop_data):
    dmop_frame = dmop_data.copy()
    # Keep only subsystem entries prefixed with "A", then split each into a
    # three-letter device code and the trailing command code.
    dmop_frame = dmop_frame[dmop_frame["subsystem"].str.startswith("A")]
    dmop_frame["device"] = dmop_frame["subsystem"].str[1:4]
    dmop_frame["command"] = dmop_frame["subsystem"].str[4:]
    dmop_frame = dmop_frame.drop("subsystem", axis=1)
    return dmop_frame

def generate_count_in_hour_from_raw_data(raw_data, column_name):
    raw_frame = raw_data.copy()
    # Truncate each timestamp to the start of its hour.
    raw_frame["timestamp_by_hour"] = raw_frame.index.map(lambda t: datetime(t.year, t.month, t.day, t.hour))
    events_by_hour = raw_frame.groupby(["timestamp_by_hour", column_name]).agg("count")
    events_by_hour = events_by_hour.reset_index()
    events_by_hour.columns = ["timestamp_by_hour", column_name, "count"]
    # Pivot to one column per category and one row per hour; absent combinations become 0.
    events_by_hour = events_by_hour.pivot(index="timestamp_by_hour", columns=column_name, values="count").fillna(0)

    events_by_hour.columns = ["count_" + str(col) + "_in_hour" for col in events_by_hour.columns]
    events_by_hour.index.names = ["ut_ms"]

    return events_by_hour

def important_commands(dmop_data):
    count_of_each_command = dmop_data["command"].value_counts()
    important_commands = count_of_each_command[count_of_each_command > 2000]
    return list(important_commands.index)

def important_events(evtf_data):
    count_of_each_event = evtf_data["description"].value_counts()
    important_event_names = count_of_each_event[count_of_each_event > 1000]
    return list(important_event_names.index)
dmop_raw = read_merged_train_and_test_data("dmop")
evtf_raw = read_merged_train_and_test_data("evtf")
ltdata_raw = read_merged_train_and_test_data("ltdata")
saaf_raw = read_merged_train_and_test_data("saaf")

power_train_raw = convert_timestamp_to_date(pd.read_csv("F:\\kaggleDataSet\\hackathon-krakow\\hackathon-krakow-2017-05-27\\power--2014-01-01_2015-01-01.csv"))
power_train_raw = power_train_raw.resample("1H").mean().dropna()
power_test_raw = convert_timestamp_to_date(pd.read_csv("F:\\kaggleDataSet\\hackathon-krakow\\hackathon-krakow-2017-05-27\\sample_power_zeros--2015-01-01_2015-07-01.csv"))
power_raw = pd.concat([power_train_raw, power_test_raw])
plt.figure(figsize=(20, 3))
power_raw_with_sum = power_train_raw.copy()
power_raw_with_sum["power_sum"] = power_raw_with_sum.sum(axis=1)
power_raw_with_sum["power_sum"].plot()

plt.figure(figsize=(20, 10))
plt.imshow(power_train_raw.values.T, aspect="auto", cmap="viridis")

dmop_devices = parse_subsystems(dmop_raw)

dmop_device_commands_by_hour = generate_count_in_hour_from_raw_data(dmop_devices, "device")
dmop_device_commands_by_hour["dmop_sum"] = dmop_device_commands_by_hour.sum(axis=1)

dmop_commands_by_hour = generate_count_in_hour_from_raw_data(dmop_devices, "command")
important_command_names = important_commands(dmop_devices)
important_command_names = list(map(lambda x: "count_" + x + "_in_hour", important_command_names))
dmop_commands_by_hour = dmop_commands_by_hour[important_command_names]

dmop_data_per_hour = pd.concat([dmop_device_commands_by_hour, dmop_commands_by_hour], axis=1)
dmop_data_per_hour.head()

plt.figure(figsize=(20, 10))
dmop_device_commands_by_hour["dmop_sum"].plot()

dmop_data = dmop_data_per_hour.reindex(power_raw_with_sum.index, method="nearest")
dmop_with_power_data = pd.concat([power_raw_with_sum, dmop_data], axis=1)
dmop_with_power_data.columns

sns.jointplot(x="dmop_sum", y="power_sum", data=dmop_with_power_data)

dmop_with_power_data = dmop_with_power_data.resample("24h").mean()
sns.pairplot(dmop_with_power_data, x_vars=dmop_commands_by_hour.columns[0:6], y_vars="power_sum")

sns.pairplot(dmop_with_power_data, x_vars=dmop_commands_by_hour.columns[0:6], y_vars=power_raw.columns[0:6])

important_event_names = [name for name in important_events(evtf_raw) if "_START" not in name and "_END" not in name]
important_evtf = evtf_raw[evtf_raw["description"].isin(important_event_names)]

important_evtf["description"].value_counts()

important_evtf_with_count = important_evtf.copy()
important_evtf_with_count["count"] = 1
important_evtf_data_per_hour = generate_count_in_hour_from_raw_data(important_evtf_with_count, "description")
important_evtf_data_per_hour.head()

evtf_data = important_evtf_data_per_hour.reindex(power_raw_with_sum.index, method="nearest")
evtf_with_power_data = pd.concat([power_raw_with_sum, evtf_data], axis=1)
evtf_with_power_data.columns

evtf_with_power_data = evtf_with_power_data.resample("24h").mean()
sns.pairplot(evtf_with_power_data, x_vars=important_evtf_data_per_hour.columns[0:6], y_vars="power_sum")

sns.pairplot(evtf_with_power_data, x_vars=important_evtf_data_per_hour.columns[0:6], y_vars=power_raw.columns[0:6])

def is_start_event(description, event_type):
    return int((event_type in description) and ("START" in description))
msl_events = ["MSL_/_RANGE_06000KM_START", "MSL_/_RANGE_06000KM_END"]
mrb_events = ["MRB_/_RANGE_06000KM_START", "MRB_/_RANGE_06000KM_END"]
penumbra_events = ["MAR_PENUMBRA_START", "MAR_PENUMBRA_END"]
umbra_events = ["MAR_UMBRA_START", "MAR_UMBRA_END"]

msl_events_df = evtf_raw[evtf_raw["description"].isin(msl_events)].copy()
msl_events_df["in_msl"] = msl_events_df["description"].map(lambda row: is_start_event(row, "MSL"))
msl_events_df = msl_events_df["in_msl"]

mrb_events_df = evtf_raw[evtf_raw["description"].isin(mrb_events)].copy()
mrb_events_df["in_mrb"] = mrb_events_df["description"].map(lambda row: is_start_event(row, "MRB"))
mrb_events_df = mrb_events_df["in_mrb"]

penumbra_events_df = evtf_raw[evtf_raw["description"].isin(penumbra_events)].copy()
penumbra_events_df["in_penumbra"] = penumbra_events_df["description"].map(lambda row: is_start_event(row, "PENUMBRA"))
penumbra_events_df = penumbra_events_df["in_penumbra"]

umbra_events_df = evtf_raw[evtf_raw["description"].isin(umbra_events)].copy()
umbra_events_df["in_umbra"] = umbra_events_df["description"].map(lambda row: is_start_event(row, "UMBRA"))
umbra_events_df = umbra_events_df["in_umbra"]
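
The four blocks above repeat one pattern: filter by event names, flag the START rows, keep the flag series. A behavior-equivalent sketch that builds the same series in a loop, using only names already defined above:

# Sketch: the same four flag series, built in one loop.
period_specs = [(msl_events, "MSL", "in_msl"),
                (mrb_events, "MRB", "in_mrb"),
                (penumbra_events, "PENUMBRA", "in_penumbra"),
                (umbra_events, "UMBRA", "in_umbra")]
period_flags = {}
for event_names, keyword, column in period_specs:
    frame = evtf_raw[evtf_raw["description"].isin(event_names)].copy()
    # Bind keyword per iteration to avoid the late-binding lambda pitfall.
    frame[column] = frame["description"].map(lambda d, k=keyword: is_start_event(d, k))
    period_flags[column] = frame[column]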
ltdata_raw.columns

ltdata_raw["eclipseduration_min"].plot()

saaf_raw.describe()

dmop_data = dmop_data_per_hour.reindex(power_raw.index, method="nearest")
evtf_events_data = important_evtf_data_per_hour.reindex(power_raw.index, method="nearest")
msl_period_events_data = msl_events_df.reindex(power_raw.index, method="pad").fillna(0)
mrb_period_events_data = mrb_events_df.reindex(power_raw.index, method="pad").fillna(0)
penumbra_period_events_data = penumbra_events_df.reindex(power_raw.index, method="pad").fillna(0)
umbra_period_events_data = umbra_events_df.reindex(power_raw.index, method="pad").fillna(0)
ltdata_data = ltdata_raw.reindex(power_raw.index, method="nearest")
saaf_data = saaf_raw.reindex(power_raw.index, method="nearest")
all_data = pd.concat([power_raw, dmop_data, evtf_events_data, msl_period_events_data, mrb_period_events_data, penumbra_period_events_data, umbra_period_events_data, ltdata_data, saaf_data], axis=1)
print(all_data.columns, all_data.shape)
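
Before splitting into train and test, a quick sanity check that the nearest/pad reindexing left no gaps; a minimal sketch:

# Count any remaining NaN cells after merging all sources onto the power index.
print("NaN cells in all_data:", int(all_data.isna().sum().sum()))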

plt.figure(figsize=(20, 10))
plt.imshow(all_data.values.T, aspect="auto", vmin=0, vmax=5, cmap="viridis")

train_set_start_date, train_set_end_date = power_train_raw.index[0], power_train_raw.index[-1]
train_data = all_data[all_data.index <= train_set_end_date].copy()
test_data = all_data.loc[power_test_raw.index].copy()
plt.figure(figsize=(20, 10))
plt.imshow(train_data.values.T, aspect="auto", vmin=0, vmax=5, cmap="viridis")

plt.figure(figsize=(20, 10))
plt.imshow(test_data.values.T, aspect="auto", vmin=0, vmax=5, cmap="viridis")

X_train = train_data[train_data.columns.difference(power_raw.columns)]
y_train = train_data[power_raw.columns]
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_validation_predicted = reg.predict(X_validation)
mean_squared_error(y_validation, y_validation_predicted)

elastic_net = linear_model.ElasticNet()
elastic_net.fit(X_train, y_train)
y_validation_predicted = elastic_net.predict(X_validation)
mean_squared_error(y_validation, y_validation_predicted)
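
Outside a notebook, the bare mean_squared_error expressions above print nothing; a small comparison sketch makes both baselines visible (actual values depend on the data and the split):

# Report validation error for both baselines side by side.
for name, model in [("LinearRegression", reg), ("ElasticNet", elastic_net)]:
    mse = mean_squared_error(y_validation, model.predict(X_validation))
    print("%s: validation MSE = %.4f (RMSE = %.4f)" % (name, mse, mse ** 0.5))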

Original article: https://www.cnblogs.com/tszr/p/11258449.html
