# coding: UTF-8
# Definition of columns:
# 1. sepal length in cm
# 2. sepal width in cm
# 3. petal length in cm
# 4. petal width in cm
# 5. class (iris species):
#    - Iris Setosa
#    - Iris Versicolour
#    - Iris Virginica
# Missing attribute values: none
from __future__ import print_function

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris  # Iris flower data set
from sklearn.neighbors import KNeighborsClassifier  # nearest-neighbour classifier
from sklearn.preprocessing import PolynomialFeatures  # pairwise feature combinations

try:
    # Current home of the splitting / cross-validation helpers.
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    # Very old scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import cross_val_score, train_test_split

# Step 1: load the data.
iris = load_iris()
X = iris.data  # flower measurements
y = iris.target  # flower classes

# Step 2: combine the features pairwise to build a richer (degree-2) data
# matrix. It is not used below yet.
poly = PolynomialFeatures(2)
X_Poly = poly.fit_transform(X)

# Step 3: split off a test set.
# train_test_split() randomly partitions the samples into a training set and
# a test set (slicing by hand would work too). Pass X_Poly instead of X to
# work with the degree-2 data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Model training: for each point, look at its 5 nearest neighbours and assign
# the class that is most common among them.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# y_pred holds the predictions for the test set; compare it with y_test, or
# call knn.score(X_test, y_test), to evaluate this single split.
y_pred = knn.predict(X_test)

# Split into training/test sets 5 times so the result does not hinge on the
# peculiarities of one particular split.
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=5, scoring="accuracy")
print(scores)

# Choosing the n_neighbors parameter: record the mean 10-fold accuracy for
# every k from 1 to 30.
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring="accuracy")
    k_scores.append(scores.mean())

# Plot mean accuracy against k.
plt.plot(k_range, k_scores)
plt.show()