Predicting the Mortality of Sick Horses from Colic Symptoms
Note:
Place horseColicTraining.txt and horseColicTest.txt in the current directory.
import numpy as np
import matplotlib.pyplot as plt
Define the sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + np.exp(-inX))
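As a quick sanity check (illustration only), sigmoid maps 0 to 0.5 and saturates toward 0 and 1 for large negative and positive inputs:

print(sigmoid(0))                              # 0.5
print(sigmoid(np.array([-10.0, 0.0, 10.0])))   # roughly [0.0000454, 0.5, 0.9999546]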
Define the standard (batch) gradient ascent algorithm
def gradAscent(dataMatIn, classLabels):
    dataMatrix = np.mat(dataMatIn)
    labelMat = np.mat(classLabels).transpose()
    m, n = np.shape(dataMatrix)
    alpha = 0.001        # learning rate
    maxCycles = 500      # number of iterations
    weights = np.ones((n, 1))
    for k in range(maxCycles):
        # batch gradient ascent: update using the whole data set each cycle
        h = sigmoid(dataMatrix * weights)
        error = (labelMat - h)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
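To see how gradAscent is typically called, here is a minimal sketch on a hypothetical 3-feature toy set (the data and labels below are made up purely for illustration; the first column plays the role of the bias term):

toyData = [[1.0, 2.0, 1.1],
           [1.0, 1.5, 0.9],
           [1.0, -1.8, -2.2],
           [1.0, -2.5, -1.7]]
toyLabels = [1, 1, 0, 0]
w = gradAscent(toyData, toyLabels)   # (n, 1) weight matrix after 500 batch updates
print(w)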
Define the stochastic gradient ascent algorithm
def stocGradAscent0(dataMatrix, classLabels):
    m, n = np.shape(dataMatrix)
    alpha = 0.01
    weights = np.ones(n)
    for i in range(m):
        # stochastic gradient ascent: update using one sample at a time
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
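Unlike gradAscent, stocGradAscent0 does not wrap its input in np.mat, so it should be given a NumPy array: dataMatrix[i]*weights must be an element-wise product that sum() then reduces. A minimal sketch with the same hypothetical toy data:

toyArr = np.array([[1.0, 2.0, 1.1],
                   [1.0, 1.5, 0.9],
                   [1.0, -1.8, -2.2],
                   [1.0, -2.5, -1.7]])
toyLabels = [1, 1, 0, 0]
w0 = stocGradAscent0(toyArr, toyLabels)   # length-n weight array after one pass
print(w0)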
Define the improved stochastic gradient ascent algorithm
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # alpha decreases with each update but never reaches 0
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick a remaining sample at random, then remove it so each pass visits every sample once
            randIndex = int(np.random.uniform(0, len(dataIndex)))
            h = sigmoid(sum(dataMatrix[dataIndex[randIndex]] * weights))
            error = classLabels[dataIndex[randIndex]] - h
            weights = weights + alpha * error * dataMatrix[dataIndex[randIndex]]
            del dataIndex[randIndex]
    return weights
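Two things distinguish the improved version: the learning rate alpha = 4/(1.0+j+i)+0.0001 shrinks as training progresses but never reaches zero, and samples are visited in random order. The snippet below only illustrates the alpha schedule during the first pass (j = 0), assuming a hypothetical m = 5:

for i in range(5):
    print(4 / (1.0 + 0 + i) + 0.0001)   # 4.0001, 2.0001, about 1.3334, 1.0001, 0.8001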
Predict horse mortality from colic symptoms
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
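A minimal usage sketch of classifyVector with made-up weights (the values are hypothetical, for illustration only):

sampleX = np.array([1.0, 0.5, -1.2])
sampleW = np.array([0.4, 1.1, -0.3])
print(classifyVector(sampleX, sampleW))   # 1.0, since sigmoid(1.31) is about 0.79 > 0.5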
def colicTest():
    frTrain = open('./horseColicTraining.txt'); frTest = open('./horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
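colicTest assumes that each line of horseColicTraining.txt and horseColicTest.txt holds 21 tab-separated feature values followed by the class label in column 22. A hypothetical line in that format would be parsed like this (values made up):

sampleLine = "\t".join(["1.0"] * 21 + ["0"])
fields = sampleLine.strip().split('\t')
features = [float(v) for v in fields[:21]]
label = float(fields[21])
print(len(features), label)   # 21 0.0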
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))

multiTest()
the error rate of this test is: 0.313433
the error rate of this test is: 0.358209
the error rate of this test is: 0.417910
the error rate of this test is: 0.417910
the error rate of this test is: 0.298507
the error rate of this test is: 0.253731
the error rate of this test is: 0.373134
the error rate of this test is: 0.343284
the error rate of this test is: 0.283582
the error rate of this test is: 0.328358
after 10 iterations the average error rate is: 0.338806