说明:
将数据集文件 ‘datingTestSet2.txt’ 放在当前文件夹
# 导入程序所需要的模块
import numpy as np
import operator

file2matrix函数实现的功能是读取文件数据,函数返回的returnMat和classLabelVector分别是数据集的特征矩阵和输出标签向量。
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is stripped and split on tabs. The first three
    fields are stored as one row of features and the last field is the class
    label. A label made only of digits is converted with ``int``; any other
    label is looked up in the mapping 'largeDoses' -> 3, 'smallDoses' -> 2,
    'didntLike' -> 1.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``. ``returnMat`` is a float
        NumPy array with one row per non-blank line and three columns.
        ``classLabelVector`` is a list with one label per row; a non-numeric
        label that is missing from the mapping is stored as ``None``.

    Raises:
        ValueError: If a feature field cannot be converted to ``float``.
    """
    love_dictionary = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}

    # The context manager closes the file even if parsing fails later.
    # Blank lines (for example a trailing newline at the end of the file)
    # are skipped so they neither crash the float conversion nor leave
    # all-zero rows in the matrix.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # One row per sample, three feature columns.
    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # Fields 0, 1 and 2 are the features of the current sample.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        label = listFromLine[-1]
        if label.isdigit():
            # The label is already numeric: use it directly.
            classLabelVector.append(int(label))
        else:
            # The label is text: translate it through the mapping.
            classLabelVector.append(love_dictionary.get(label))
    return returnMat, classLabelVector


if __name__ == "__main__":
    # Demo: parse the data set and show the features, labels and shape.
    # Kept under a main guard so that importing this module does not require
    # the data file to be present; running it as a script is unchanged.
    # The returned class labels in classLabelVector are 1, 2, 3.
    returnMat, classLabelVector = file2matrix('datingTestSet2.txt')
    print(returnMat)
    print(classLabelVector)
    print(returnMat.shape)

# Output of the demo above (the remaining rows, the label list and the shape
# continue on the following lines):
# [[4.0920000e+04 8.3269760e+00 9.5395200e-01]
#  [1.4488000e+04 7.1534690e+00 1.6739040e+00]
#  [2.6052000e+04 1.4418710e+00 8.0512400e-01]
#  ...
[2.6575000e+04 1.0650102e+01 8.6662700e-01] [4.8111000e+04 9.1345280e+00 7.2804500e-01] [4.3757000e+04 7.8826010e+00 1.3324460e+00]] [3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 
3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3] (1000, 3)这里为什么要对特征进行归一化?
因为在处理这种不同取值范围的特征值时,数值归一化能够将不同特征的取值范围限定在同一区间例如[0,1]之间,让不同特征对距离的计算影响相同。具体可看《机器学习实战》第2.2.3节内容。
# Normalize the data in the feature columns.
# Normalization formula: newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into the range [0, 1].

    Args:
        dataSet: 2-D NumPy array with one row per sample and one column per
            feature.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``. ``normDataSet`` has the
        same shape as ``dataSet`` with each column scaled by its own minimum
        and range. ``ranges`` is the per-column ``max - min`` and ``minVals``
        is the per-column minimum; both are returned unmodified so callers
        can apply the same scaling to new samples.

        A column whose values are all equal has a range of 0; its normalized
        values are 0 instead of NaN.
    """
    # min(0) / max(0) reduce along axis 0, i.e. one value per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # A constant column would give 0 / 0 = NaN. Dividing by 1 instead leaves
    # its (already zero) numerator unchanged.
    safeRanges = np.where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tiling is needed.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


if __name__ == "__main__":
    # Demo: normalize the feature matrix loaded earlier in this script.
    # Kept under a main guard so that importing this module does not depend
    # on module-level state created by the earlier demo.
    normDataSet, ranges, minVals = autoNorm(returnMat)
    print(normDataSet)
    print(ranges)
    print(minVals)

# Output of the demo above:
# [[0.44832535 0.39805139 0.56233353]
#  [0.15873259 0.34195467 0.98724416]
#  [0.28542943 0.06892523 0.47449629]
#  ...
#  [0.29115949 0.50910294 0.51079493]
#  [0.52711097 0.43665451 0.4290048 ]
#  [0.47940793 0.3768091 0.78571804]]
# [9.1273000e+04 2.0919349e+01 1.6943610e+00]
# [0. 0. 0.001156]

# Next step: decide the class online, based on user input.
def classifyPerson():
    """Ask for three traits on standard input and print the predicted class.

    The three answers are read as floats, scaled with the minimum values and
    ranges that ``autoNorm`` computes for the data in 'datingTestSet2.txt',
    and passed to ``classify0`` together with the normalized data set, its
    labels and the value 5. The returned label selects the printed
    description.

    NOTE(review): ``classify0`` is not defined in this part of the file;
    presumably it is the k-nearest-neighbours classifier with k = 5 and
    returns a label in 1..3 - confirm against its definition.
    """
    # Position i holds the description for class label i + 1.
    resultList = ['not at all', 'in small doses', 'in large doses']

    # The prompts are asked in this order; each answer must parse as a float.
    game_share = float(input("percentage of time spent playing video games?"))
    flier_miles = float(input("frequent flier miles earned per year?"))
    ice_cream_liters = float(input("liters of ice cream consumed per year?"))

    # Load the reference data and the scaling parameters derived from it.
    features, labels = file2matrix('datingTestSet2.txt')
    scaled_features, ranges, minVals = autoNorm(features)

    # The query vector puts the miles first, unlike the prompt order;
    # presumably this matches the column order of the data file - confirm.
    query = np.array([flier_miles, game_share, ice_cream_liters])
    scaled_query = (query - minVals) / ranges

    predicted = classify0(scaled_query, scaled_features, labels, 5)
    verdict = resultList[predicted - 1]
    print("You will probably like this person: %s" % verdict)


classifyPerson()

# Sample session:
# percentage of time spent playing video games?10
# frequent flier miles earned per year?100
# liters of ice cream consumed per year?1
# You will probably like this person: in small doses