KNN预测QSAR生物浓缩类别
数据来源:http://archive.ics.uci.edu/ml/datasets/QSAR+Bioconcentration+classes+dataset
import numpy
import pandas  # used to read the Excel/CSV data
from sklearn.neighbors import KNeighborsClassifier  # classic ML algorithms (no deep learning here)

# Load the QSAR bioconcentration dataset.
# BUG FIX: in the pasted original, `shen.head()` was fused onto the end of the
# print() call, which is a syntax error; statements are now separated.
shen = pandas.read_csv(r"D:\Python\代码\Machine-Learn\1-KNN\data\shenwu.csv")
print("总数据条数:{};列数:{}".format(shen.shape[0], shen.shape[1]))
shen.head()
总数据条数:779;列数:14
CAS | SMILES | Set | nHM | piPC09 | PCD | X2Av | MLOGP | ON1V | N-072 | B02[C-N] | F04[C-O] | Class | logBCF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100-02-7 | O=[N+](c1ccc(cc1)O)[O-] | Train | 0 | 0.0 | 1.49 | 0.14 | 1.35 | 0.72 | 0 | 1 | 5 | 1 | 0.74 |
1 | 100-17-4 | O=[N+](c1ccc(cc1)OC)[O-] | Train | 0 | 0.0 | 1.47 | 0.14 | 1.70 | 0.88 | 0 | 1 | 5 | 1 | 0.93 |
2 | 100-18-5 | c1cc(ccc1C(C)C)C(C)C | Train | 0 | 0.0 | 1.20 | 0.25 | 4.14 | 2.06 | 0 | 0 | 0 | 3 | 3.24 |
3 | 100-25-4 | O=[N+]([O-])c1ccc(cc1)[N+](=O)[O-] | Train | 0 | 0.0 | 1.69 | 0.13 | 1.89 | 0.79 | 0 | 1 | 8 | 3 | -0.40 |
4 | 100-40-3 | C=CC1CCC=CC1 | Train | 0 | 0.0 | 0.52 | 0.25 | 2.65 | 1.31 | 0 | 0 | 0 | 1 | 2.24 |
# Partition the rows by the "Set" column into training and test frames.
shen_train = shen[shen.Set == "Train"]
shen_test = shen[shen.Set == "Test"]
print("训练数据:{}个\n测试数据:{}个".format(shen_train.shape[0], shen_test.shape[0]))
shen_test.head()
训练数据:584个 测试数据:195个
CAS | SMILES | Set | nHM | piPC09 | PCD | X2Av | MLOGP | ON1V | N-072 | B02[C-N] | F04[C-O] | Class | logBCF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | 100-42-5 | C=Cc1ccccc1 | Test | 0 | 0.000 | 1.40 | 0.18 | 2.85 | 0.86 | 0 | 0 | 0 | 3 | 1.13 |
12 | 101-53-1 | Oc1ccc(cc1)Cc1ccccc1 | Test | 0 | 5.768 | 2.21 | 0.18 | 3.40 | 1.47 | 0 | 0 | 1 | 3 | 1.40 |
15 | 101-84-8 | O(c1ccccc1)c1ccccc1 | Test | 0 | 5.614 | 2.21 | 0.16 | 3.40 | 1.31 | 0 | 0 | 2 | 1 | 2.57 |
16 | 102-06-7 | N=C(Nc1ccccc1)Nc1ccccc1 | Test | 0 | 5.030 | 2.07 | 0.16 | 3.09 | 1.54 | 0 | 1 | 0 | 2 | 1.05 |
19 | 10315-98-7 | O1CCN(CC1)CC(C)C | Test | 0 | 0.000 | 0.00 | 0.28 | 1.00 | 1.80 | 0 | 1 | 1 | 1 | 0.23 |
获得训练,测试数据中的训练数据和结果
# Take the last two columns (Class, logBCF) as the prediction targets.
y_train = shen_train.iloc[:, -2:]
y_test = shen_test.iloc[:, -2:]
print("训练数据结果:\n{}\n测试数据结果:\n{}\n".format(y_train.head(), y_test.head()))
训练数据结果:
Class logBCF 0 1 0.74 1 1 0.93 2 3 3.24 3 3 -0.40 4 1 2.24 测试数据结果: Class logBCF 5 3 1.13 12 3 1.40 15 1 2.57 16 2 1.05 19 1 0.23
# Columns 3..11 are the nine molecular-descriptor features.
x_train = shen_train.iloc[:, 3:12]
x_test = shen_test.iloc[:, 3:12]
print("训练数据:\n{}\n测试数据:\n{}\n".format(x_train.head(), x_test.head()))
训练数据: nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O] 0 0 0.0 1.49 0.14 1.35 0.72 0 1 5 1 0 0.0 1.47 0.14 1.70 0.88 0 1 5 2 0 0.0 1.20 0.25 4.14 2.06 0 0 0 3 0 0.0 1.69 0.13 1.89 0.79 0 1 8 4 0 0.0 0.52 0.25 2.65 1.31 0 0 0 测试数据: nHM piPC09 PCD X2Av MLOGP ON1V N-072 B02[C-N] F04[C-O] 5 0 0.000 1.40 0.18 2.85 0.86 0 0 0 12 0 5.768 2.21 0.18 3.40 1.47 0 0 1 15 0 5.614 2.21 0.16 3.40 1.31 0 0 2 16 0 5.030 2.07 0.16 3.09 1.54 0 1 0 19 0 0.000 0.00 0.28 1.00 1.80 0 1 1
# 查看训练数据的数据类型,当数据类型不是int时要将数据映射为数字才能进行训练 y_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 584 entries, 0 to 776 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Class 584 non-null int64 1 logBCF 584 non-null float64 dtypes: float64(1), int64(1) memory usage: 13.7 KB
将非int类型的数据量化为整数
def change_type(values):
    """Integer-encode every column of *values* in place.

    Each column's distinct values are replaced by their index in order of
    first appearance — the same ordering Series.unique() produces, so the
    resulting codes match the original argwhere-based mapping.

    NOTE(review): encoding train and test frames independently assigns
    inconsistent codes to the same value across frames, and encoding the
    continuous logBCF target discards its magnitude — both look
    unintended; verify against the original author's intent.
    """
    for col in values.columns:
        # pandas.factorize assigns codes 0,1,2,... by first appearance in a
        # single O(n) pass, replacing the original per-element
        # numpy.argwhere scan which was O(n * n_unique).
        values[col] = pandas.factorize(values[col])[0]


change_type(x_train)
change_type(x_test)
change_type(y_train)
change_type(y_test)
y_train
584 rows × 2 columns
Class | logBCF | |
---|---|---|
0 | 0 | 0 |
1 | 0 | 1 |
2 | 1 | 2 |
3 | 1 | 3 |
4 | 0 | 4 |
... | ... | ... |
771 | 0 | 333 |
772 | 0 | 334 |
773 | 0 | 41 |
774 | 0 | 142 |
776 | 0 | 335 |
# Baseline: 5-NN with distance-weighted voting, using all CPU cores.
knn = KNeighborsClassifier(n_neighbors=5, weights="distance", n_jobs=-1)
knn.fit(x_train, y_train)
y_ = knn.predict(x_test)
# Element-wise comparison then per-column mean -> accuracy per target column.
acc = (y_ == y_test).mean()
# BUG FIX: acc is a Series labeled 'Class'/'logBCF'; integer [] indexing on a
# labeled Series is deprecated (removed in pandas 2+) — use positional .iloc.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.041025641025641026;预测生物富集等级准确率:0.4153846153846154
提高算法准确率
1,修改算法参数
# Attempt 1: tune hyperparameters — 3 neighbors, Manhattan distance (p=1).
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", p=1, n_jobs=-1)
knn.fit(x_train, y_train)
y_ = knn.predict(x_test)
acc = (y_ == y_test).mean()
# BUG FIX: use positional .iloc — integer [] on a string-labeled Series is
# deprecated and removed in modern pandas.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.06666666666666667;预测生物富集等级准确率:0.441025641025641
2,修改训练数据
# Min-max normalization: rescales each feature to [0, 1], removing scale
# differences between descriptors.
x_train_min = x_train.min()
x_train_max = x_train.max()
x2_train = (x_train - x_train_min) / (x_train_max - x_train_min)
# BUG FIX: the test set must be scaled with the *training* min/max. Computing
# separate statistics from the test set leaks test information and puts the
# two sets on inconsistent scales, which corrupts the distance computation.
x2_test = (x_test - x_train_min) / (x_train_max - x_train_min)
x2_test.head()
nHM | piPC09 | PCD | X2Av | MLOGP | ON1V | N-072 | B02[C-N] | F04[C-O] | |
---|---|---|---|---|---|---|---|---|---|
5 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 |
12 | 0.0 | 0.008929 | 0.009524 | 0.000000 | 0.006849 | 0.007874 | 0.0 | 0.0 | 0.058824 |
15 | 0.0 | 0.017857 | 0.009524 | 0.029412 | 0.006849 | 0.015748 | 0.0 | 0.0 | 0.117647 |
16 | 0.0 | 0.026786 | 0.019048 | 0.029412 | 0.013699 | 0.023622 | 0.0 | 1.0 | 0.000000 |
19 | 0.0 | 0.000000 | 0.028571 | 0.058824 | 0.020548 | 0.031496 | 0.0 | 1.0 | 0.058824 |
# Re-train on the min-max normalized features.
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", p=1, n_jobs=-1)
knn.fit(x2_train, y_train)
y_ = knn.predict(x2_test)
acc = (y_ == y_test).mean()
# BUG FIX: positional .iloc instead of deprecated integer [] on a labeled Series.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.02564102564102564;预测生物富集等级准确率:0.4358974358974359
# Z-score standardization: center each feature to mean 0, unit variance.
x_train_mean = x_train.mean()
x_train_std = x_train.std()
x3_train = (x_train - x_train_mean) / x_train_std
# BUG FIX: standardize the test set with the *training* mean/std. Fitting
# separate statistics on the test set leaks test information and makes the
# train and test feature spaces inconsistent.
x3_test = (x_test - x_train_mean) / x_train_std
x3_test.head()
nHM | piPC09 | PCD | X2Av | MLOGP | ON1V | N-072 | B02[C-N] | F04[C-O] | |
---|---|---|---|---|---|---|---|---|---|
5 | -0.858971 | -0.961536 | -1.433213 | -1.308189 | -1.621471 | -1.571899 | -0.37945 | -0.885971 | -0.901314 |
12 | -0.858971 | -0.934067 | -1.399267 | -1.308189 | -1.597619 | -1.544272 | -0.37945 | -0.885971 | -0.706463 |
15 | -0.858971 | -0.906599 | -1.399267 | -1.184954 | -1.597619 | -1.516645 | -0.37945 | -0.885971 | -0.511611 |
16 | -0.858971 | -0.879131 | -1.365321 | -1.184954 | -1.573767 | -1.489018 | -0.37945 | 1.122917 | -0.901314 |
19 | -0.858971 | -0.961536 | -1.331375 | -1.061719 | -1.549914 | -1.461391 | -0.37945 | 1.122917 | -0.706463 |
# Re-train on the z-score standardized features.
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", p=1, n_jobs=-1)
knn.fit(x3_train, y_train)
y_ = knn.predict(x3_test)
acc = (y_ == y_test).mean()
# BUG FIX: positional .iloc instead of deprecated integer [] on a labeled Series.
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.015384615384615385;预测生物富集等级准确率:0.41025641025641024
# sklearn's built-in scalers: StandardScaler (z-score) and MinMaxScaler.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

s = StandardScaler()
x4_train = s.fit_transform(x_train)  # z-score normalized training features
# BUG FIX: use transform(), not fit_transform(), on the test set so it is
# scaled with the statistics learned from the training data (fit_transform
# would refit on the test set — data leakage).
x4_test = s.transform(x_test)

m = MinMaxScaler()
x5_train = m.fit_transform(x_train)  # min-max normalized training features
x5_test = m.transform(x_test)
保存算法模型
# Persist the trained model to disk.
# BUG FIX: `sklearn.externals.joblib` was removed in scikit-learn 0.23 —
# import joblib directly. The pasted smart quotes (‘./model‘) were invalid
# syntax, and joblib.dump's compression parameter is `compress`, not
# `cache_size`; compress=9 gives the smallest (most compressed) file.
import joblib

joblib.dump(knn, './model', compress=9)
# Reload the persisted model and evaluate it on the normalized test set.
model = joblib.load('./model')  # BUG FIX: smart quotes replaced with valid ' quotes
x = model.predict(x2_test)
# BUG FIX: score the loaded model's predictions (x). The original compared the
# stale `y_` array from an earlier cell, so the loaded model was never evaluated.
acc = (x == y_test).mean()
print("预测生物富集因子准确率:{};预测生物富集等级准确率:{}".format(acc.iloc[1], acc.iloc[0]))
预测生物富集因子准确率:0.020512820512820513;预测生物富集等级准确率:0.41025641025641024
原文地址:https://www.cnblogs.com/lq13035130506/p/12543134.html
时间: 2024-11-05 21:37:03