Python进行决策树和随机森林

xiaoxiao2025-06-20 33

Python进行决策树和随机森林

目录：一、决策树——第一步，导入库；第二步，导入数据；第三步，数据预处理；第四步，决策树；第五步，决策树评价；第六步，生成决策树图。二、随机森林——第一步，随机森林；第二步，随机森林评价。

一、决策树

第一步，导入库；

# Third-party libraries used throughout the walkthrough.
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Configure matplotlib in one call:
#   - the 'SimHei' font lets the Chinese axis labels used below be displayed;
#   - turning off 'axes.unicode_minus' lets the minus sign be displayed too.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

第二步，导入数据；

# Read the modelling spreadsheet into a DataFrame.
excel_path = 'F:\\Desktop\\江苏省建模\\建模数据.xlsx'
data = pd.read_excel(excel_path)

# Preview the first five rows.
data.head(5)

第三步，数据预处理；

# Split the table into features and target: the first column is the target y,
# every remaining column is a feature in X.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# `sklearn.cross_validation` has been removed from scikit-learn;
# `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Standardize the features.
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
# Fit the scaler on the training data only, then transform the training data.
X_train_std = stdsc.fit_transform(X_train)
# Transform the test data with the statistics learned from the training data.
X_test_std = stdsc.transform(X_test)
# NOTE: the models in the following steps are fitted on the unscaled
# X_train / X_test; the *_std arrays are computed here but not used by them.

第四步，决策树；

from sklearn.tree import DecisionTreeClassifier

# Decision tree that uses entropy as the impurity measure, limited to a
# depth of 3, with a fixed random seed.
tree_params = {'criterion': 'entropy', 'max_depth': 3, 'random_state': 0}
tree = DecisionTreeClassifier(**tree_params)
tree.fit(X_train, y_train)

第五步，决策树评价；

from sklearn.metrics import confusion_matrix

# Accuracy on the training set, then on the test set.
train_acc = tree.score(X_train, y_train)
print('Training accuracy:', train_acc)
test_acc = tree.score(X_test, y_test)
print('Test accuracy:', test_acc)

# Confusion matrix of the test-set predictions.
y_pred = tree.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Visualize the confusion matrix: a shaded grid with the count written in
# the centre of every cell (x is the column index, y is the row index).
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(confmat):
    ax.text(x=col, y=row, s=count, va='center', ha='center')
plt.xlabel('预测类标')
plt.ylabel('真实类标')
plt.show()

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
# NOTE: the original `from scipy import interp` was dropped. Nothing in this
# step uses it, and the import fails on SciPy releases that no longer expose
# that alias.

# Precision, recall and F1 of the test-set predictions (y_pred comes from the
# confusion-matrix step above).
# NOTE(review): these calls use the default averaging, which presumably
# expects a binary target -- confirm against the data.
print('Precision: %.4f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.4f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.4f' % f1_score(y_true=y_test, y_pred=y_pred))

# Figure size for the ROC plot.
fig = plt.figure(figsize=(7, 5))

# Class probabilities on the test set. The tree was already fitted in step 4,
# so it is not fitted again here.
probas = tree.predict_proba(X_test)

# False/true positive rates, scoring with column 1 of the probabilities.
# NOTE(review): assumes the positive class is labelled 1 and is the second
# class of the classifier -- confirm against the data.
fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1], pos_label=1)
# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

# ROC curve of the model.
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)
# Diagonal reference line (random guessing).
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6),
         label='random guessing')
# Reference line of a perfect classifier.
plt.plot([0, 0, 1], [0, 1, 1], lw=2, linestyle=':', color='black',
         label='perfect performance')

# Axis ranges, with a small margin around [0, 1].
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
# Axis labels.
plt.xlabel('假正率')
plt.ylabel('真正率')
# Title (left empty).
plt.title('')
# Legend position.
plt.legend(loc="lower right")
plt.show()

第六步，生成决策树图。

# Method 1: write the tree to a Graphviz DOT file.
from sklearn.tree import export_graphviz

with open('treeone.dot', 'w') as f:
    # With out_file=None the DOT source is returned as a string.
    dot_data = export_graphviz(tree, out_file=None)
    f.write(dot_data)

# Method 2: render the tree to a PDF file.
import pydotplus
# `sklearn.externals.six` has been removed from scikit-learn; the standard
# library's io.StringIO provides the same in-memory text buffer.
from io import StringIO

dot_data = StringIO()
export_graphviz(tree,
                out_file=dot_data,
                # A plain list of column names is accepted across
                # scikit-learn versions.
                feature_names=list(X.columns),
                filled=True,
                rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("treetwo.pdf")

二、随机森林

第一步，随机森林；

from sklearn.ensemble import RandomForestClassifier

# n_estimators=10 -> the forest has 10 decision trees;
# n_jobs=2        -> use two CPU cores.
forest_params = dict(criterion='entropy',
                     n_estimators=10,
                     random_state=1,
                     n_jobs=2)
forest = RandomForestClassifier(**forest_params)
forest.fit(X_train, y_train)

第二步，随机森林评价；

from sklearn.metrics import confusion_matrix

# Accuracy on the training set, then on the test set.
train_acc = forest.score(X_train, y_train)
print('Training accuracy:', train_acc)
test_acc = forest.score(X_test, y_test)
print('Test accuracy:', test_acc)

# Confusion matrix of the test-set predictions.
y_pred = forest.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Visualize the confusion matrix: a shaded grid with the count written in
# the centre of every cell (x is the column index, y is the row index).
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(confmat):
    ax.text(x=col, y=row, s=count, va='center', ha='center')
plt.xlabel('预测类标')
plt.ylabel('真实类标')
plt.show()

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
# NOTE: the original `from scipy import interp` was dropped. Nothing in this
# step uses it, and the import fails on SciPy releases that no longer expose
# that alias.

# Precision, recall and F1 of the test-set predictions (y_pred comes from the
# confusion-matrix step above).
# NOTE(review): these calls use the default averaging, which presumably
# expects a binary target -- confirm against the data.
print('Precision: %.4f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.4f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.4f' % f1_score(y_true=y_test, y_pred=y_pred))

# Figure size for the ROC plot.
fig = plt.figure(figsize=(7, 5))

# Class probabilities on the test set. The forest was already fitted in the
# previous step, so it is not fitted again here.
probas = forest.predict_proba(X_test)

# False/true positive rates, scoring with column 1 of the probabilities.
# NOTE(review): assumes the positive class is labelled 1 and is the second
# class of the classifier -- confirm against the data.
fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1], pos_label=1)
# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

# ROC curve of the model.
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)
# Diagonal reference line (random guessing).
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6),
         label='random guessing')
# Reference line of a perfect classifier.
plt.plot([0, 0, 1], [0, 1, 1], lw=2, linestyle=':', color='black',
         label='perfect performance')

# Axis ranges, with a small margin around [0, 1].
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
# Axis labels.
plt.xlabel('假正率')
plt.ylabel('真正率')
# Title (left empty).
plt.title('')
# Legend position.
plt.legend(loc="lower right")
plt.show()

最新回复(0)