For the intuition behind logistic regression and the full derivation of its formulas, see the earlier posts:
Derivation of the Logistic Regression Formulas
Logistic Regression
Sigmoid function:
$\sigma(z) = \dfrac{1}{1 + e^{-z}}$
We use gradient ascent to solve for the parameters, i.e., to maximize the log-likelihood. The iteration step is:
$w := w + \alpha \nabla_w f(w)$
Training the algorithm: solving for the parameters
The main steps are:
- convert the data into matrices
- compute the gradient and update the weights
from numpy import *

def loadData(filename):
    '''
    Read the data file: each line holds two feature values and a class label.
    '''
    datamat = []; labelmat = []
    with open(filename) as f:
        for line in f.readlines():
            line_arr = line.strip().split()
            # prepend 1.0 as x0 for the constant (intercept) term
            datamat.append([1.0, float(line_arr[0]), float(line_arr[1])])
            labelmat.append(int(line_arr[2]))
    return datamat, labelmat
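For reference, loadData expects each line of testSet.txt to carry two feature values and a 0/1 label separated by whitespace, along these lines (values illustrative):

-0.017612   14.053064   0
1.347183    13.175500   1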
datamat,labelmat = loadData('testSet.txt')
def sigmoid(inX):
    '''
    Sigmoid function.
    inX: the value of wT·x
    '''
    return 1.0/(1 + exp(-inX))
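One caveat: for a large negative inX, exp(-inX) overflows double precision and numpy emits a RuntimeWarning (the returned value still rounds to 0.0, so classification results are unaffected). A numerically safer variant, as a sketch:

def sigmoid_stable(inX):
    # clipping keeps exp() inside double range (exp overflows near 710);
    # beyond |z| = 500 the sigmoid is already 1.0 or ~1e-217, so clipping
    # is indistinguishable for classification purposes
    return 1.0 / (1 + exp(-clip(inX, -500, 500)))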
def gradAscent(dataMatIn, classLabels):
    '''
    Batch gradient ascent.
    dataMatIn: training samples
    classLabels: labels
    '''
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    m, n = shape(dataMatrix)
    alpha = 0.001        # step size
    maxCycles = 500      # number of iterations
    # weights is the parameter vector
    weights = ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)    # predictions for all samples at once
        error = (labelMat - h)               # y - h
        # update rule, derived below: w := w + alpha * X^T (y - h)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
Derivation: the gradient of the log-likelihood is known to be $\nabla_w \ell(w) = X^\top (y - \sigma(Xw))$, which is exactly `dataMatrix.transpose() * error` in the code above.
weights = gradAscent(datamat,labelmat)
import matplotlib
import matplotlib.pyplot as plt
from numpy import *
def plot_fit(data, labelMat, weights):
    '''
    Plot both classes of samples together with the fitted decision boundary.
    data: training samples
    labelMat: labels
    weights: fitted parameter vector
    '''
    dataArr = array(data)
    n = shape(dataArr)[0]
    x_cord1 = []; y_cord1 = []
    x_cord2 = []; y_cord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            x_cord1.append(dataArr[i, 1]); y_cord1.append(dataArr[i, 2])
        else:
            x_cord2.append(dataArr[i, 1]); y_cord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x_cord1, y_cord1, s=30, c='red', marker='s')
    ax.scatter(x_cord2, y_cord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # the boundary is where w0 + w1*x1 + w2*x2 = 0, i.e. x2 = (-w0 - w1*x1)/w2
    y = ((-weights[0] - weights[1] * x) / weights[2]).transpose()
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
plot_fit(datamat,labelmat,weights)

Gradient ascent classifies the data fairly well: only 4 of the 100 samples are misclassified. But the computation is heavy: every iteration multiplies the full 100×3 data matrix by the weight vector (300 multiplications), repeated for 500 iterations, so the algorithm needs further improvement.
Stochastic gradient ascent
Batch gradient ascent has to sweep the entire data set every time it updates the regression coefficients, so the cost explodes once the data set grows large. The improved method, stochastic gradient ascent, updates the coefficients using a single sample at a time.
def stocGradAscent0(dataMatrix, classLabels):
    '''
    Stochastic gradient ascent: one sequential pass over the data.
    dataMatrix: training samples
    classLabels: labels
    returns: the parameter vector
    '''
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))   # scalar prediction for one sample
        error = classLabels[i] - h
        weights = weights + alpha * error * array(dataMatrix[i])
    return weights
weight2 = stocGradAscent0(datamat,labelmat)
plot_fit(datamat,labelmat,weight2)

The resulting fit is noticeably worse. That is because stocGradAscent0 makes only a single pass over the data (here just 100 updates), so the parameters have not yet stabilized. The code can be improved further:
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    '''
    Improved stochastic gradient ascent: a decaying step size
    plus random sample selection without replacement.
    '''
    m, n = shape(dataMatrix)
    weights = ones(n)
    # number of passes over the data
    for j in range(numIter):
        dataIndex = list(range(m))
        # one update per sample in each pass
        for i in range(m):
            # alpha shrinks as iterations progress (see the note below);
            # the constant 0.01 keeps it from ever reaching 0
            alpha = 4 / (1.0 + j + i) + 0.01
            # uniform(x, y) --> a random float in [x, y)
            randIndex = int(random.uniform(0, len(dataIndex)))
            idx = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[idx] * weights))
            error = classLabels[idx] - h
            weights = weights + alpha * error * array(dataMatrix[idx])
            # drop the chosen index so each sample is used once per pass
            del(dataIndex[randIndex])
    return weights
'''
Notes:
1. Why adjust alpha on every step: early updates take big steps while later ones
   take small steps, which damps the oscillations caused by noisy single-sample
   updates; the constant 0.01 keeps alpha above zero so new samples always matter.
2. Picking samples in random order reduces the periodic fluctuations
   in the coefficients.
'''
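To make the schedule concrete, here is a tiny standalone sketch (loop bounds are illustrative) that prints how alpha shrinks as the pass index j and the update index i grow:

# illustrative only: watch the step size decay across passes and updates
for j in range(2):                  # two passes
    for i in range(0, 100, 25):    # a few update steps per pass
        alpha = 4 / (1.0 + j + i) + 0.01
        print('j=%d i=%3d alpha=%.4f' % (j, i, alpha))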
weights3 = stocGradAscent1(datamat,labelmat,10)
plot_fit(datamat,labelmat,weights3)

Predicting horse mortality from colic data
Goal: use the clinical indicators a hospital records for horse colic to predict whether a sick horse will die.
Data preprocessing: handling missing values. Common options:
- fill with the feature's mean
- fill with a special value, e.g. -1
- discard those samples
- fill with the mean of similar samples
- predict the missing value with another machine-learning algorithm
What we do here (a sketch follows this list):
- replace a missing feature value with a fixed real number, namely 0: a zero feature leaves its weight coordinate untouched in the update weights = weights + alpha*error*x, and sigmoid(0) = 0.5 favors neither class
- if the class label itself is missing, discard the sample
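A minimal sketch of that policy, assuming missing fields are marked with '?' as in the raw UCI data (the horseColic files used below already have the 0s filled in):

import numpy as np

def fill_missing(raw_rows):
    '''Apply the two rules above: 0 for a missing feature value,
    drop the row when the label (last column) is missing.'''
    cleaned = []
    for row in raw_rows:
        if row[-1] == '?':    # missing label: discard the sample
            continue
        # missing feature -> 0.0, which leaves the weight update unchanged
        cleaned.append([0.0 if v == '?' else float(v) for v in row])
    return np.array(cleaned)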
def getDateSet(filepath):
    '''
    Build the data set and the label vector from a tab-separated file.
    '''
    import numpy as np
    with open(filepath) as f:
        file = f.readlines()
    arr = []
    for x in file:
        mid = x.strip().split('\t')
        vector = list(map(float, mid))
        arr.append(vector)
    # split features and labels with numpy, prepending an x0 = 1 column
    # for the constant term
    mid2 = np.array(arr)
    x0 = np.ones(len(file))
    dataSet = np.column_stack((x0, mid2[:, :-1]))
    Labels = mid2[:, -1]
    return dataSet, Labels
trainingdataSet,trainingLabels = getDateSet('./horseColicTraining.txt')
testdataSet,testLabels = getDateSet('./horseColicTest.txt')
stocGradAscent1(trainingdataSet,trainingLabels,500)
array([ 2.88523241e+00, 4.60371241e+01, -3.44351209e+00, 1.81776542e+00,
-1.22312262e+00, -4.51162382e-02, -1.26815180e+01, -5.19715412e+00,
-1.62249176e+01, 1.69815852e+00, -1.42154178e+01, 2.51022948e+01,
-1.01675395e+01, 3.62858014e+01, -5.73351142e+00, -7.52889830e+00,
8.52370700e+00, -9.93327494e+00, -1.10860304e+00, 1.43608141e+00,
-2.06483905e+00, -5.98332914e+00])
# fit the parameters
weights = stocGradAscent1(trainingdataSet, trainingLabels, 500)
# testing: compute w^T x for every test sample
result = mat(testdataSet) * mat(weights).T
# pass each w^T x through the sigmoid to get a probability
sig = list(map(lambda x: sigmoid(x[0]), result.tolist()))
def classifyVector(x):
    '''
    Threshold a sigmoid output at 0.5 to obtain a class label.
    '''
    if x > 0.5:
        return 1.0
    else:
        return 0.0
# logistic regression predictions
lg_result = [classifyVector(x) for x in sig]
# pair each prediction with the true label
n = list(zip(lg_result, testLabels.tolist()))
# classification error rate
errorRate = len([x for x in n if x[0] - x[1] != 0]) / len(n)
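The same error rate in a single vectorized line, for comparison:

errorRate = mean(array(lg_result) != testLabels)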
The code above, merged into a single function:
def errorTest():
    '''
    Train on the training file, then compute and print
    the classification error rate on the test file.
    '''
    # prepare the data sets
    trainingdataSet, trainingLabels = getDateSet('./horseColicTraining.txt')
    testdataSet, testLabels = getDateSet('./horseColicTest.txt')
    # fit the parameters
    weights = stocGradAscent1(trainingdataSet, trainingLabels, 10)
    # testing: compute w^T x for every test sample
    result = mat(testdataSet) * mat(weights).T
    # pass each w^T x through the sigmoid to get a probability
    sig = list(map(lambda x: sigmoid(x[0]), result.tolist()))
    # logistic regression predictions
    lg_result = [classifyVector(x) for x in sig]
    # pair each prediction with the true label
    n = list(zip(lg_result, testLabels.tolist()))
    # classification error rate
    errorRate = len([x for x in n if x[0] - x[1] != 0]) / len(n)
    print('error rate: %f' % errorRate)
    return errorRate
errorTest()
error rate: 0.283582
0.2835820895522388
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += errorTest()
    print('average error rate over 10 runs: %f' % (errorSum / numTests))
multiTest()
error rate: 0.268657
error rate: 0.492537
error rate: 0.328358
error rate: 0.253731
error rate: 0.268657
error rate: 0.417910
error rate: 0.358209
error rate: 0.567164
error rate: 0.283582
error rate: 0.328358
average error rate over 10 runs: 0.356716
Consolidated code
from numpy import *

def sigmoid(inX):
    '''
    Sigmoid function.
    inX: the value of wT·x
    '''
    return 1.0/(1 + exp(-inX))

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    '''
    Improved stochastic gradient ascent: a decaying step size
    plus random sample selection without replacement.
    '''
    m, n = shape(dataMatrix)
    weights = ones(n)
    # number of passes over the data
    for j in range(numIter):
        dataIndex = list(range(m))
        # one update per sample in each pass
        for i in range(m):
            # alpha shrinks as iterations progress but never reaches 0
            alpha = 4 / (1.0 + j + i) + 0.01
            # uniform(x, y) --> a random float in [x, y)
            randIndex = int(random.uniform(0, len(dataIndex)))
            idx = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[idx] * weights))
            error = classLabels[idx] - h
            weights = weights + alpha * error * array(dataMatrix[idx])
            # drop the chosen index so each sample is used once per pass
            del(dataIndex[randIndex])
    return weights

def getDateSet(filepath):
    '''
    Build the data set and the label vector from a tab-separated file.
    '''
    import numpy as np
    with open(filepath) as f:
        file = f.readlines()
    arr = []
    for x in file:
        mid = x.strip().split('\t')
        vector = list(map(float, mid))
        arr.append(vector)
    # split features and labels with numpy, prepending an x0 = 1 column
    mid2 = np.array(arr)
    x0 = np.ones(len(file))
    dataSet = np.column_stack((x0, mid2[:, :-1]))
    Labels = mid2[:, -1]
    return dataSet, Labels

def classifyVector(x):
    '''
    Threshold a sigmoid output at 0.5 to obtain a class label.
    '''
    if x > 0.5:
        return 1.0
    else:
        return 0.0

def errorTest():
    '''
    Train on the training file, then compute and print
    the classification error rate on the test file.
    '''
    trainingdataSet, trainingLabels = getDateSet('./horseColicTraining.txt')
    testdataSet, testLabels = getDateSet('./horseColicTest.txt')
    weights = stocGradAscent1(trainingdataSet, trainingLabels, 10)
    result = mat(testdataSet) * mat(weights).T
    sig = list(map(lambda x: sigmoid(x[0]), result.tolist()))
    lg_result = [classifyVector(x) for x in sig]
    n = list(zip(lg_result, testLabels.tolist()))
    errorRate = len([x for x in n if x[0] - x[1] != 0]) / len(n)
    print('error rate: %f' % errorRate)
    return errorRate

def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += errorTest()
    print('average error rate over 10 runs: %f' % (errorSum / numTests))

if __name__ == "__main__":
    multiTest()
error rate: 0.313433
error rate: 0.283582
error rate: 0.462687
error rate: 0.626866
error rate: 0.328358
error rate: 0.268657
error rate: 0.343284
error rate: 0.328358
error rate: 0.268657
error rate: 0.298507
average error rate over 10 runs: 0.352239
Summary
Working through logistic regression has made me feel ever more strongly that relying on a single book is not enough. Machine Learning in Action has obvious flaws: the code is dated and convoluted, and the derivations are thin; it is an entry-level book in the truest sense. Sometimes the code copied straight from the book simply refuses to run, which is infuriating. With that said, the key points of this chapter:
The essence of the logistic regression model: a linear regression model nested inside the sigmoid function. Exploiting the sigmoid's step-like shape, the model splits samples into two classes, and the core of the problem is solving for the model's parameters. From a probabilistic angle, since the outcome can only be 0 or 1, we assume that the data set we actually observed is the one most likely to be observed, and that likelihood is governed by the parameters; so we choose the parameters that maximize the probability of the observed data. This estimation method is called maximum likelihood estimation, and the resulting optimization can be carried out by gradient ascent (equivalently, gradient descent on the negative log-likelihood).
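In formulas (standard results, stated here for completeness): writing $h_w(x) = \sigma(w^\top x)$, the log-likelihood over $m$ samples and its gradient are

$$\ell(w) = \sum_{i=1}^{m}\left[y_i \log h_w(x_i) + (1 - y_i)\log\big(1 - h_w(x_i)\big)\right], \qquad \nabla_w \ell(w) = \sum_{i=1}^{m}\big(y_i - h_w(x_i)\big)\,x_i = X^\top(y - h),$$

which is exactly the ascent direction `dataMatrix.transpose() * error` used by gradAscent above.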
The deeper I go, the clearer it becomes that machine learning is no small feat: data cleaning, feature engineering, model selection, and algorithm tuning each hold deep learning of their own, and on top of that one needs a solid mathematical foundation and real programming ability. I cannot help but sigh:
"Long, long is the road and far the journey; I will search for it high and low." (Qu Yuan, Li Sao)