Python3学习笔记【机器学习】【k-近邻算法】【约会网站配对算法】

xiaoxiao2023-10-03 98

# -*- coding: utf-8 -*-
"""
k-nearest-neighbours study script (dating-site matching example).

Created on Wed May 22 10:43:50 2019

@author: 激光雷达
"""

# NOTE: the star import places NumPy's public names directly in this module's
# namespace.  Depending on the NumPy version this may shadow builtins such as
# sum/min/max, so the code below deliberately avoids calling those builtins.
from numpy import *
import operator


# ------------------------------ Part 1 ------------------------------

def creatDataSet():
    """Return a four-point toy training set together with its labels.

    Returns:
        (group, labels): ``group`` is a 4x2 array of points, ``labels`` the
        list of their classes ('A' or 'B') in the same row order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classfiy0(iinX, dataSet, labels, k):
    """Classify ``iinX`` by majority vote among its k nearest neighbours.

    Args:
        iinX: the point to classify, one value per feature.
        dataSet: 2-D array-like of training points, one row per sample.
        labels: sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: number of nearest neighbours that vote.  A value larger than the
            number of training rows is capped at that number.

    Returns:
        The label that collected the most votes.  On a tie, the label that
        was met first while walking the neighbours nearest-first wins
        (dict insertion order plus a stable sort).

    Raises:
        IndexError: if nobody votes (``k`` < 1 or ``dataSet`` has no rows).
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]

    # Step 1: Euclidean distance from iinX to every training row.
    # Broadcasting subtracts iinX from each row without building a tiled copy.
    diffMat = asarray(iinX) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # argsort returns row indices ordered nearest-first, not the distances.
    sortDistanceIndex = distances.argsort()

    # Step 2: let the k nearest rows vote for their label.
    # Cap k so that asking for more neighbours than rows does not run off the
    # end of the index array (a conditional is used instead of min(), see the
    # note next to the star import).
    voters = k if k < dataSetSize else dataSetSize
    classCount = {}
    for i in range(voters):
        voteIlabel = labels[sortDistanceIndex[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Step 3: order the (label, votes) pairs by votes, highest first, and
    # return the winning label.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# ------------------------------ Part 2 ------------------------------

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric fields followed by
    an integer class label as the last field.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelVector): ``returnMat`` is an N x 3 float array
        built from the first three fields of every line, and
        ``classLabelVector`` is the list of N integer labels taken from the
        last field of every line.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be parsed as a number or a line has
            fewer than three feature fields.
    """
    # The context manager closes the file even when parsing fails later on.
    with open(filename) as fr:
        arrayOLines = [line.strip() for line in fr]
    # Blank lines (for example a trailing empty line) carry no record.
    arrayOLines = [line for line in arrayOLines if line]

    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromline = line.split('\t')
        # Convert explicitly instead of relying on implicit string casting.
        returnMat[index, :] = [float(field) for field in listFromline[0:3]]
        classLabelVector.append(int(listFromline[-1]))
    return returnMat, classLabelVector


# Demo statements run only when the file is executed as a script.  Names
# assigned here are still module-level globals in that case, so top-level
# code further down the file can keep using them.
if __name__ == "__main__":
    group, labels = creatDataSet()
    print(classfiy0([0, 0], group, labels, 3))

    datingDataMat, datingDataLabels = file2matrix('datingTestSet2.txt')
    print(datingDataMat)
    print(datingDataLabels[0:20])

    import matplotlib
    import matplotlib.pyplot as plt

    fig = plt.figure()
    # "111" means "1 x 1 grid, first subplot"; "234" would mean
    # "2 x 3 grid, fourth subplot".
    ax = fig.add_subplot(111)
# Plotting demo, figures 1-3.  Runs only when the file is executed as a
# script; names assigned here are module-level globals in that case.
if __name__ == "__main__":
    # Figure 1: plain scatter of feature column 1 against feature column 2.
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    ax.set_xlabel("Percentage of time spent playing video games")
    ax.set_ylabel("Ice cream kilograms consumed per week")
    plt.show()

    # Figure 2: the same two columns, but the class labels stored in
    # datingDataLabels drive both the marker size (third argument) and the
    # marker colour (fourth argument).
    plt2 = matplotlib.pyplot
    fig2 = plt2.figure()
    ax2 = fig2.add_subplot(111)
    ax2.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
                15.0 * array(datingDataLabels),
                15.0 * array(datingDataLabels))
    ax2.set_xlabel("Percentage of time spent playing video games")
    ax2.set_ylabel("Ice cream kilograms consumed per week")
    plt2.show()

    # Figure 3: feature column 0 against feature column 1, again sized and
    # coloured by the class labels in datingDataLabels.
    plt3 = matplotlib.pyplot
    fig3 = plt3.figure()
    ax3 = fig3.add_subplot(111)
    ax3.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
                15.0 * array(datingDataLabels),
                15.0 * array(datingDataLabels))
# ------------------------------ Part 3 ------------------------------

def _nonZeroRanges(ranges):
    """Return ``ranges`` with every zero replaced by 1 so it is safe to divide by."""
    return where(ranges == 0, 1, ranges)


def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column
    is normalised to 0 instead of producing a division by zero.

    Args:
        dataSet: 2-D NumPy array, one row per sample.

    Returns:
        (normDataSet, ranges, minValues): the normalised array, the
        per-column ``max - min`` (left at 0 for constant columns) and the
        per-column minimum.
    """
    minValues = dataSet.min(0)
    maxValues = dataSet.max(0)
    ranges = maxValues - minValues
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minValues) / _nonZeroRanges(ranges)
    return normDataSet, ranges, minValues


# ------------------------------ Part 4 ------------------------------

def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The first ``hoRatio`` fraction of the rows is classified against the
    remaining rows; every prediction and the final error rate are printed.

    Args:
        hoRatio: fraction of the rows (taken from the top of the file) used
            as test vectors.
        filename: data file passed to ``file2matrix``.
        k: number of neighbours passed to ``classfiy0``.
    """
    datingDataMat, datingDataLabels = file2matrix(filename)
    normMat, ranges, minValues = autoNorm(datingDataMat)
    rowCount = normMat.shape[0]
    numTestVecs = int(rowCount * hoRatio)
    if numTestVecs == 0:
        # Nothing to evaluate; dividing by the test count below would fail.
        print("Not enough rows to hold out any test vectors")
        return

    errorCount = 0.
    for i in range(numTestVecs):
        # Rows [numTestVecs:] are the training set, rows [:numTestVecs] the
        # test set.
        classfierResults = classfiy0(normMat[i, :],
                                     normMat[numTestVecs:rowCount, :],
                                     datingDataLabels[numTestVecs:rowCount],
                                     k)
        print("The classfier came back with: %d, the real is : %d"
              % (classfierResults, datingDataLabels[i]))
        if classfierResults != datingDataLabels[i]:
            errorCount += 1
    print("The total error rate is : %f" % (errorCount / float(numTestVecs)))


# ------------------------------ Part 5 ------------------------------

def classfiyPerson():
    """Ask for one person's three features and print the predicted class.

    The answers are read with ``input()``, normalised with the training
    data's minimum and range, and classified with the 3 nearest neighbours
    from 'datingTestSet2.txt'.
    """
    resultList = ['Not at all', 'Small doses', 'Large doses']
    percentTats = float(input("Percecntage of time spent on video games ?"))
    ffMiles = float(input("Frequent flier miles earned per year ?"))
    iceCream = float(input("Liters icecream consumed per year ?"))
    datingDataMat, datingDataLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minValues = autoNorm(datingDataMat)
    # Feature order must match the columns of the data file, which differs
    # from the order in which the questions are asked.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query exactly as autoNorm scaled the training rows.
    classfiyResult = classfiy0((inArr - minValues) / _nonZeroRanges(ranges),
                               normMat, datingDataLabels, 3)
    # Labels are 1-based, resultList is 0-based.
    print("You will probably like this person: ",
          resultList[classfiyResult - 1])


# Script section.  Runs only when the file is executed directly; names
# assigned here are module-level globals in that case.
if __name__ == "__main__":
    # Finish figure 3.
    ax3.set_xlabel("Frequent Flight Miles Obtained Annually")
    ax3.set_ylabel("Percentage of time spent playing video games")
    # NOTE(review): no artist on this axes is given a label in the visible
    # code, so this legend is probably empty -- confirm.
    ax3.legend()
    plt3.show()

    # Figure 4: one scatter series per class so that a legend can be added.
    plt4 = matplotlib.pyplot
    plt4.rcParams['font.sans-serif'] = ['Simhei']
    plt4.rcParams['axes.unicode_minus'] = False
    datingDataMat4, datingLabels4 = file2matrix('datingTestSet2.txt')
    plt4.figure()
    axes4 = plt4.subplot(111)

    # Split the rows by class label and give each class its own marker size
    # and colour.
    labelArr4 = array(datingLabels4)
    typeHandles = []
    for wantedLabel, markerSize, colour in ((1, 20, 'r'),
                                            (2, 40, 'b'),
                                            (3, 60, 'k')):
        rows = labelArr4 == wantedLabel
        typeHandles.append(axes4.scatter(datingDataMat4[rows, 0],
                                         datingDataMat4[rows, 1],
                                         s=markerSize, c=colour))
    type1, type2, type3 = typeHandles
    plt4.legend((type1, type2, type3),
                ('Dislike', 'Charming general', 'Glamour'))
    plt4.show()

    # Part 3 demo: show the normalised data, the ranges and the minima.
    normDataSet, ranges, minValues = autoNorm(datingDataMat)
    print()
    print(normDataSet)
    print()
    print(ranges)
    print()
    print(minValues)
    print()

    # Part 4 demo.
    datingClassTest()

    # Part 5 demo.
    classfiyPerson()
    # Over

最新回复(0)