2017-08-12
Logistic 回归,作为分类器:
分别用了梯度上升,牛顿法来最优化损失函数:
1 # -*- coding: utf-8 -*- 2 3 ‘‘‘ 4 function: 实现Logistic回归,拟合直线,对数据进行分类; 5 利用梯度上升,随机梯度上升,改进的随机梯度上升,牛顿法分别对损失函数优化; 6 这里没有给出最后测试分类的函数; 7 date: 2017.8.12 8 ‘‘‘ 9 10 from numpy import * 11 12 #从文件加载处理数据 13 def loadDataSet(): 14 dataMat = [] 15 labelMat = [] 16 fr = open(‘testSet.txt‘) 17 for line in fr.readlines(): 18 lineArr = line.strip().split() 19 dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) 20 labelMat.append(int(lineArr[2])) 21 return dataMat, labelMat 22 23 #sigmoid function X: w1*x1+w2*x2+...+wn*xn 24 def sigmoid(x): 25 return 1 / (1 + exp(-x)) 26 27 #梯度上升求函数的最大值时取得的权重 28 def gradAscent(dataMatIn, classLabels): 29 dataMatrix = mat(dataMatIn) 30 labelMat = mat(classLabels).transpose() #将m维行向量转制为m维列向量 31 m,n = shape(dataMatrix) 32 alpha = 0.001 #设置梯度上升的步长 33 maxCycles = 500 #最大迭代次数 34 weights = ones((n,1)) #weights就是theta,n维列向量,二维数组 35 for i in range(maxCycles): 36 h = sigmoid(dataMatrix*weights) #计算所有数据的分类概率,h是m维向量,这里实际上进行了300次乘法运算 37 error = labelMat - h #计算(y-h(x)):误差 38 weights = weights + alpha * dataMatrix.transpose()*error #对所有weights同时更新 39 print(‘shape of weights‘, shape(weights)) 40 return weights 41 42 #随机上升上升 43 def stocGradAscent0(dataMatrix, classLabels): 44 m,n = shape(dataMatrix) 45 alpha = 0.01 46 weights = ones(n, float) #n 维行向量 47 for i in range(m): 48 h = sigmoid(sum(dataMatrix[i] * weights)) 49 error = classLabels[i] - h 50 weights = weights + alpha * error * dataMatrix[int(i)] 51 return weights 52 53 #改进的随机上升上升 54 def stocGradAscent1(dataMatrix, classLabels, numIter = 550): 55 m,n = shape(dataMatrix) 56 weights = ones(n) 57 dataIndex = [] 58 for j in range(numIter): 59 for k in range(m): 60 dataIndex.append(k) 61 for i in range(m): 62 alpha = 4 / (i + j + 1.0) + 0.01 #aloha随着迭代次数增多不断减小但是不为0,为了解决局部波动 63 randIndex = int(random.uniform(0, len(dataIndex))) #随机选取一个数据进行更新参数 64 h = sigmoid(dataMatrix[randIndex] * weights) 65 error = classLabels[randIndex] - h 66 weights = weights + alpha * error * dataMatrix[randIndex] 67 del(dataIndex[randIndex]) #这里会不会对本来的dataMatrix有影响,不会,因为没有操作矩阵 68 return weights 69 70 ‘‘‘ 71 functuion: 牛顿法更新theta,计算海森矩阵 72 note: 这里一定要对 XT*X 结果求逆,不然分类效果无法直视 73 ‘‘‘ 74 def computeHessianMatrix(dataMatrix): 75 hessianMatrix = mat(dataMatrix).transpose().dot(mat(dataMatrix)) 76 return hessianMatrix.I #矩阵求逆 77 78 ‘‘‘ 79 function: 牛顿法更新theta 80 ‘‘‘ 81 def newtonMethod(dataMat, labelMat, numIter = 10): 82 m,n = shape(dataMat) 83 dataMatrix = mat(dataMat) 84 labelMat = mat(labelMat).transpose() 85 weights = ones((n,1)) #参数向量 86 hessianMatrix = computeHessianMatrix(dataMatrix) 87 print(‘shape of hessian‘, shape(hessianMatrix)) 88 for k in range(numIter): 89 h = sigmoid(dataMatrix * weights) 90 error = (labelMat - h) 91 weights = weights - (dataMatrix*hessianMatrix).transpose() * error 92 return weights 93 94 #画出决策边界 95 def plotBestFit(weights): 96 import matplotlib.pyplot as plt 97 dataMat, labelMat = loadDataSet() #或取数据和标签 98 dataArr = array(dataMat) #将二维列表转换为数组 99 n = shape(dataArr)[0] #行数,代表数据的个数 100 101 xcord1 = []; ycord1 = [] 102 xcord2 = []; ycord2 = [] 103 for i in range(n): 104 if int(labelMat[i]) == 1: 105 xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 106 else: 107 xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 108 109 fig = plt.figure() 110 ax = fig.add_subplot(111) 111 ax.scatter(xcord1, ycord1, s = 30, c = ‘red‘, marker = ‘s‘) 112 ax.scatter(xcord2, ycord2, s = 30, c = ‘green‘) 113 x = arange(-3.0, 3.0, 0.1) 114 print(len(x)) 115 print(len(weights)) 116 print(shape(weights)) 117 y = (-weights[0] - weights[1]*x) / weights[2] #把x2堪称y,x0=1,所以就是x1,x2之间的函数 118 print(len(y)) 119 ax.plot(x,y) 120 plt.xlabel(‘X1‘); plt.ylabel(‘X2‘) 121 plt.show() #显示图像 122 123 ‘‘‘ 124 function: 对测试数据进行测试 125 input: testDate 是一个向量,或者多个向量 126 ‘‘‘ 127 def logisticTest(weights, testData): 128 m,n = shape(testData) 129 typeLabel = [] 130 for i in range(m): 131 result = sigmoid(sum(testData[i] * weights)) #得到一个册数数据的概率 132 if result > 0.5: 133 typeLabel.append(1) 134 else: 135 typeLabel.append(0) 136 return typeLabel 137 138 ‘‘‘ 139 function: 计算分类的正确率 140 input: calssLables 是测试数据的类别向量 141 ‘‘‘ 142 def getCorrectRate(classLabels, testLabel): 143 correctRate = 0.0 144 numOfRight = 0 145 for i in range(len(testLabel)): 146 if classLabels[i] == testLabel[i]: 147 numOfRight += 1 148 correctRate = numOfRight / float(len(testLabel)) 149 return correctRate 150 151 #测试算法 152 dataMat, labelMat = loadDataSet() 153 print(‘shape of datamet, ‘, shape(dataMat)) 154 weights1 = gradAscent(dataMat, labelMat) 155 weights2 = stocGradAscent0(array(dataMat), labelMat) 156 weights3 = stocGradAscent1(dataMat, labelMat) 157 158 weights4 = newtonMethod(dataMat, labelMat, 2000) #牛顿法 159 160 161 #测试正确率 162 typeLabel = logisticTest(weights4, [[1,1,0],[1,1,1]]) 163 print(typeLabel) 164 correctRate = getCorrectRate([1,0],typeLabel) 165 print(‘正确率: ‘ ,str(correctRate)) 166 167 #画图 168 #plotBestFit(array(weights1)) #这里因为普通梯度上升返回的是一个矩阵,所以要转换为向量 169 #plotBestFit(weights2) 170 #plotBestFit(weights3) 171 plotBestFit(array(weights4))
# -*- coding: utf-8 -*-
‘‘‘function: 实现Logistic回归,拟合直线,对数据进行分类;利用梯度上升,随机梯度上升,改进的随机梯度上升,牛顿法分别对损失函数优化;这里没有给出最后测试分类的函数;date: 2017.8.12‘‘‘
from numpy import *
#从文件加载处理数据def loadDataSet():dataMat = []labelMat = []fr = open(‘testSet.txt‘)for line in fr.readlines():lineArr = line.strip().split()dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])labelMat.append(int(lineArr[2]))return dataMat, labelMat
#sigmoid function X: w1*x1+w2*x2+...+wn*xndef sigmoid(x):return 1 / (1 + exp(-x))
#梯度上升求函数的最大值时取得的权重def gradAscent(dataMatIn, classLabels):dataMatrix = mat(dataMatIn)labelMat = mat(classLabels).transpose() #将m维行向量转制为m维列向量m,n = shape(dataMatrix)alpha = 0.001 #设置梯度上升的步长maxCycles = 500 #最大迭代次数 weights = ones((n,1)) #weights就是theta,n维列向量,二维数组for i in range(maxCycles):h = sigmoid(dataMatrix*weights) #计算所有数据的分类概率,h是m维向量,这里实际上进行了300次乘法运算error = labelMat - h #计算(y-h(x)):误差weights = weights + alpha * dataMatrix.transpose()*error #对所有weights同时更新print(‘shape of weights‘, shape(weights))return weights
#随机上升上升def stocGradAscent0(dataMatrix, classLabels):m,n = shape(dataMatrix)alpha = 0.01weights = ones(n, float) #n 维行向量for i in range(m):h = sigmoid(sum(dataMatrix[i] * weights))error = classLabels[i] - hweights = weights + alpha * error * dataMatrix[int(i)]return weights
#改进的随机上升上升def stocGradAscent1(dataMatrix, classLabels, numIter = 550):m,n = shape(dataMatrix)weights = ones(n)dataIndex = []for j in range(numIter):for k in range(m):dataIndex.append(k)for i in range(m):alpha = 4 / (i + j + 1.0) + 0.01 #aloha随着迭代次数增多不断减小但是不为0,为了解决局部波动randIndex = int(random.uniform(0, len(dataIndex))) #随机选取一个数据进行更新参数h = sigmoid(dataMatrix[randIndex] * weights)error = classLabels[randIndex] - hweights = weights + alpha * error * dataMatrix[randIndex]del(dataIndex[randIndex]) #这里会不会对本来的dataMatrix有影响,不会,因为没有操作矩阵return weights
‘‘‘functuion: 牛顿法更新theta,计算海森矩阵note: 这里一定要对 XT*X 结果求逆,不然分类效果无法直视‘‘‘def computeHessianMatrix(dataMatrix):hessianMatrix = mat(dataMatrix).transpose().dot(mat(dataMatrix))return hessianMatrix.I #矩阵求逆
‘‘‘function: 牛顿法更新theta‘‘‘def newtonMethod(dataMat, labelMat, numIter = 10):m,n = shape(dataMat)dataMatrix = mat(dataMat)labelMat = mat(labelMat).transpose()weights = ones((n,1)) #参数向量hessianMatrix = computeHessianMatrix(dataMatrix)print(‘shape of hessian‘, shape(hessianMatrix))for k in range(numIter):h = sigmoid(dataMatrix * weights)error = (labelMat - h)weights = weights - (dataMatrix*hessianMatrix).transpose() * error return weights
#画出决策边界def plotBestFit(weights):import matplotlib.pyplot as pltdataMat, labelMat = loadDataSet() #或取数据和标签dataArr = array(dataMat) #将二维列表转换为数组n = shape(dataArr)[0] #行数,代表数据的个数
xcord1 = []; ycord1 = []xcord2 = []; ycord2 = []for i in range(n):if int(labelMat[i]) == 1:xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])else:xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
fig = plt.figure()ax = fig.add_subplot(111)ax.scatter(xcord1, ycord1, s = 30, c = ‘red‘, marker = ‘s‘)ax.scatter(xcord2, ycord2, s = 30, c = ‘green‘)x = arange(-3.0, 3.0, 0.1)print(len(x))print(len(weights))print(shape(weights))y = (-weights[0] - weights[1]*x) / weights[2] #把x2堪称y,x0=1,所以就是x1,x2之间的函数print(len(y))ax.plot(x,y) plt.xlabel(‘X1‘); plt.ylabel(‘X2‘)plt.show() #显示图像
‘‘‘function: 对测试数据进行测试input: testDate 是一个向量,或者多个向量‘‘‘def logisticTest(weights, testData):m,n = shape(testData)typeLabel = []for i in range(m): result = sigmoid(sum(testData[i] * weights)) #得到一个册数数据的概率if result > 0.5:typeLabel.append(1)else:typeLabel.append(0)return typeLabel
‘‘‘function: 计算分类的正确率input: calssLables 是测试数据的类别向量‘‘‘def getCorrectRate(classLabels, testLabel):correctRate = 0.0numOfRight = 0for i in range(len(testLabel)):if classLabels[i] == testLabel[i]:numOfRight += 1correctRate = numOfRight / float(len(testLabel))return correctRate
#测试算法dataMat, labelMat = loadDataSet()print(‘shape of datamet, ‘, shape(dataMat))weights1 = gradAscent(dataMat, labelMat)weights2 = stocGradAscent0(array(dataMat), labelMat)weights3 = stocGradAscent1(dataMat, labelMat)
weights4 = newtonMethod(dataMat, labelMat, 2000) #牛顿法
#测试正确率typeLabel = logisticTest(weights4, [[1,1,0],[1,1,1]])print(typeLabel)correctRate = getCorrectRate([1,0],typeLabel)print(‘正确率: ‘ ,str(correctRate))
#画图#plotBestFit(array(weights1)) #这里因为普通梯度上升返回的是一个矩阵,所以要转换为向量#plotBestFit(weights2)#plotBestFit(weights3)plotBestFit(array(weights4))