某公司的数据处理。
比如有数据:
0 1 20100304 51.14 5 62.2 0 0.0 1 6.82 4.08666666667 4 3 0 0 20100307 2.47333333333 3 75.9333333333 0 0.0 114 13.5666666667 2.86666666667 0 0 0 2 20100318 2.49333333333 3 58.6666666667 0 0.0 1 5.22666666667 1.61333333333 0 0 0 2 20100329 9.14666666667 14 70.6428571429 0 0.0 1 31.84 7.51333333333 11 7 0 0 20100401 3.36666666667 5 71.1 1 75.0 1 1.86 1.29333333333 0 0 0 1 20100401 7.44666666667 11 58.1909090909 0 0.0 51 40.8066666667 12.5066666667 2 1 0 0 20100401 1.0 1 138.0 0 0.0 1 1.0 1.0 0 0 1 0 20100428 1.0 1 85.0 0 0.0 1 0.0 0.0 0 0 1 0 20100429 1.0 1 128.0 0 0.0 1 24.54 3.59333333333 0 0 0 0 20100506 1.92 2 67.0 0 0.0 44 6.65333333333 1.91333333333 4 1
要求:
1、除去第1列和第2列。
2、对剩下的日期列提取特征,以月份为特征,形式:201003
3、对剩下的数字列,用最大值和最小值的区间,划分十份,每一份就是一个特征,形式:[1,5)、[5,9)...[21,25],注意最后一个区间闭合了。
4、用上面获取的特征,把原始数据向量化,形式:
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
开写:
先下载安装pyExcelerator:http://sourceforge.net/projects/pyexcelerator/,解压后setup.py install
# -*- coding: cp936 -*- import numpy as np from numpy import * import pprint from pyExcelerator import * #all_col_feature存放特征们 all_col_feature = [] for i in range(2,13): #获取[日期]列特征:201003,201004...201408... if i == 2 : FILENAME = "lost5" date = [e[0:6] for e in loadtxt(FILENAME , delimiter = " " , usecols=(2,) , dtype=str)] date = list(set(date)) #去重 date.sort() all_col_feature.append(date) #获取[数字]列特征:取最大值,最小值,划分10份,每一份就是一个特征,[1,5),[5,9)..[21,25] else: digit_col = filter(lambda e:e!=0,loadtxt(FILENAME , delimiter = " " , usecols=(i,) , dtype=float))#获取数据后去0 digit_col.sort() max_num = digit_col[-1] min_num = digit_col[0] block_size = (max_num - min_num) /10 col_feature = [] for i in range(11): col_feature.append(min_num+i*block_size) all_col_feature.append(col_feature) w = Workbook() #创建一个工作簿 ws = w.add_sheet('Zero_One_Data') #创建一个工作表 row_index = 0 col_index = 0 for i in range(len(all_col_feature)): if i == 0: for j in all_col_feature[i]: ws.write(row_index,col_index,str(j)) col_index += 1 else: for j in range(len(all_col_feature[i])-1): ws.write(row_index,col_index,str(all_col_feature[i][j])+"~"+str(all_col_feature[i][j+1])) col_index += 1 row_index += 1 out = file('out.txt','w') res = [] for row in loadtxt(FILENAME , delimiter = " " , usecols=(2,3,4,5,6,7,8,9,10,11,12) , dtype=str): #数据整理成这个形式:every_row_feature_index = [0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1] #比如:第一个0表示在[日期]列的特征下标为0,-1表示NONE值 #比如:第一个1表示在第二列的特征下标为1,心好累,我这都要注释? every_row_feature_index = [] a = [i for i in range(len(all_col_feature[0])) if row[0][0:6] <= all_col_feature[0][i]] every_row_feature_index.append(a[0]) for col in range(1,11): b = [i for i in range(len(all_col_feature[col])) if float(row[col]) < float(all_col_feature[col][i])] if len(b) == 0:#max_num every_row_feature_index.append(len(all_col_feature)-2) else: every_row_feature_index.append(b[0]-1) #-1表NONE #生成向量 #就是把every_row_feature_index = [0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1]转成下面的形式: #[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] zero_one_every_row_res = [] col_index = 0 for i in range(len(every_row_feature_index)): if i == 0: #处理[日期]列 for j in range(every_row_feature_index[i]): out.write('0'+' ') ws.write(row_index,col_index,'0') col_index += 1 out.write('1'+' ') ws.write(row_index,col_index,'1') col_index += 1 for j in range(every_row_feature_index[i]+1,len(all_col_feature[i])): out.write('0'+' ') ws.write(row_index,col_index,'0') col_index += 1 else:#处理其他列 if every_row_feature_index[i] == -1: #-1表NONE值 向量全为0 for j in range(len(all_col_feature[i])-1): out.write('0'+' ') ws.write(row_index,col_index,'0') col_index += 1 else: for j in range(every_row_feature_index[i]): out.write('0'+' ') ws.write(row_index,col_index,'0') col_index += 1 out.write('1'+' ') ws.write(row_index,col_index,'1') col_index += 1 for j in range(every_row_feature_index[i]+1,len(all_col_feature[i])-1): out.write('0'+' ') ws.write(row_index,col_index,'0') col_index += 1 out.write('\n') row_index += 1 if row_index == 100:#为了更快看到结果,只设置100行,去掉可得到全部 break out.close() w.save('out.xls')
看看结果:
版权声明:本文为博主原创文章,未经博主允许不得转载。
时间: 2024-10-30 08:37:00