Predicting the Mortality of Sick Horses from Colic Symptoms
Note:
Place horseColicTraining.txt and horseColicTest.txt in the current directory.
import numpy as np
import matplotlib.pyplot as plt
Define the sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + np.exp(-inX))
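As a quick sanity check (illustration only), sigmoid maps 0 to 0.5 and saturates toward 0 and 1 for large negative and positive inputs:

print(sigmoid(0))                              # 0.5
print(sigmoid(np.array([-10.0, 0.0, 10.0])))   # roughly [0.0000454, 0.5, 0.9999546]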
Define the standard (batch) gradient ascent algorithm
def gradAscent(dataMatIn, classLabels):
    dataMatrix = np.mat(dataMatIn)
    labelMat = np.mat(classLabels).transpose()
    m, n = np.shape(dataMatrix)
    alpha = 0.001        # learning rate
    maxCycles = 500      # number of iterations
    weights = np.ones((n, 1))
    for k in range(maxCycles):
        # batch gradient ascent: update using the whole data set each cycle
        h = sigmoid(dataMatrix * weights)
        error = (labelMat - h)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
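To see how gradAscent is typically called, here is a minimal sketch on a hypothetical 3-feature toy set (the data and labels below are made up purely for illustration; the first column plays the role of the bias term):

toyData = [[1.0, 2.0, 1.1],
           [1.0, 1.5, 0.9],
           [1.0, -1.8, -2.2],
           [1.0, -2.5, -1.7]]
toyLabels = [1, 1, 0, 0]
w = gradAscent(toyData, toyLabels)   # (n, 1) weight matrix after 500 batch updates
print(w)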
Define the stochastic gradient ascent algorithm
def stocGradAscent0(dataMatrix, classLabels):
    m, n = np.shape(dataMatrix)
    alpha = 0.01
    weights = np.ones(n)
    for i in range(m):
        # stochastic gradient ascent: update using one sample at a time
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
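Unlike gradAscent, stocGradAscent0 does not wrap its input in np.mat, so it should be given a NumPy array: dataMatrix[i]*weights must be an element-wise product that sum() then reduces. A minimal sketch with the same hypothetical toy data:

toyArr = np.array([[1.0, 2.0, 1.1],
                   [1.0, 1.5, 0.9],
                   [1.0, -1.8, -2.2],
                   [1.0, -2.5, -1.7]])
toyLabels = [1, 1, 0, 0]
w0 = stocGradAscent0(toyArr, toyLabels)   # length-n weight array after one pass
print(w0)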
Define the improved stochastic gradient ascent algorithm
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # alpha decreases with each update but never reaches 0
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick a remaining sample at random, then remove it so each pass visits every sample once
            randIndex = int(np.random.uniform(0, len(dataIndex)))
            h = sigmoid(sum(dataMatrix[dataIndex[randIndex]] * weights))
            error = classLabels[dataIndex[randIndex]] - h
            weights = weights + alpha * error * dataMatrix[dataIndex[randIndex]]
            del dataIndex[randIndex]
    return weights
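Two things distinguish the improved version: the learning rate alpha = 4/(1.0+j+i)+0.0001 shrinks as training progresses but never reaches zero, and samples are visited in random order. The snippet below only illustrates the alpha schedule during the first pass (j = 0), assuming a hypothetical m = 5:

for i in range(5):
    print(4 / (1.0 + 0 + i) + 0.0001)   # 4.0001, 2.0001, about 1.3334, 1.0001, 0.8001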
Predict horse mortality from colic symptoms
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
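A minimal usage sketch of classifyVector with made-up weights (the values are hypothetical, for illustration only):

sampleX = np.array([1.0, 0.5, -1.2])
sampleW = np.array([0.4, 1.1, -0.3])
print(classifyVector(sampleX, sampleW))   # 1.0, since sigmoid(1.31) is about 0.79 > 0.5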
def colicTest():
    frTrain = open('./horseColicTraining.txt'); frTest = open('./horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
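colicTest assumes that each line of horseColicTraining.txt and horseColicTest.txt holds 21 tab-separated feature values followed by the class label in column 22. A hypothetical line in that format would be parsed like this (values made up):

sampleLine = "\t".join(["1.0"] * 21 + ["0"])
fields = sampleLine.strip().split('\t')
features = [float(v) for v in fields[:21]]
label = float(fields[21])
print(len(features), label)   # 21 0.0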
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))

multiTest()
the error rate of this test is: 0.313433
the error rate of this test is: 0.358209
the error rate of this test is: 0.417910
the error rate of this test is: 0.417910
the error rate of this test is: 0.298507
the error rate of this test is: 0.253731
the error rate of this test is: 0.373134
the error rate of this test is: 0.343284
the error rate of this test is: 0.283582
the error rate of this test is: 0.328358
after 10 iterations the average error rate is: 0.338806