Python进行决策树和随机森林
一、决策树:第一步,导入库;第二步,导入数据;第三步,数据预处理;第四步,决策树;第五步,决策树评价;第六步,生成决策树图。
二、随机森林:第一步,随机森林;第二步,随机森林评价。
一、决策树
第一步,导入库;
# Third-party libraries used throughout the walkthrough.
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Plot configuration: select the SimHei font so the Chinese axis labels used
# below can be rendered, and turn off the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
第二步,导入数据;
# Load the modelling workbook into a DataFrame.
# NOTE(review): the path is an absolute location on the author's machine;
# adjust it before running elsewhere.
data = pd.read_excel('F:\\Desktop\\江苏省建模\\建模数据.xlsx')

# Preview the first five rows (only displayed when run interactively).
data.head(5)
第三步,数据预处理;
# Split the sheet into features and target: column 0 is used as the label,
# every remaining column as a feature.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# train_test_split lives in sklearn.model_selection (scikit-learn >= 0.18).
# The old sklearn.cross_validation module was removed in 0.20, so importing
# from it fails on current releases; fall back to it only on very old ones.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
# NOTE(review): the models fitted further down use the unscaled
# X_train / X_test; X_train_std / X_test_std are not referenced by the
# visible code.
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
第四步,决策树;
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree limited to depth 3; the seed is fixed so the
# fitted tree is reproducible.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
)
tree.fit(X_train, y_train)
第五步,决策树评价;
# Report the tree's score on the training split and then on the held-out
# test split.
for caption, features, labels in (
        ('Training accuracy:', X_train, y_train),
        ('Test accuracy:', X_test, y_test)):
    print(caption, tree.score(features, labels))
from sklearn.metrics import confusion_matrix

# Confusion matrix of the tree's predictions on the test split.
y_pred = tree.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Draw the matrix as a lightly shaded grid and write each count in the
# centre of its cell (x is the column index, y is the row index).
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(confmat):
    ax.text(x=col, y=row, s=count, va='center', ha='center')
ax.set_xlabel('预测类标')
ax.set_ylabel('真实类标')
plt.show()
from sklearn.metrics import precision_score, recall_score, f1_score

# Print precision, recall and F1 for the tree's test-set predictions,
# each formatted to four decimal places.
for template, scorer in (
        ('Precision: %.4f', precision_score),
        ('Recall: %.4f', recall_score),
        ('F1: %.4f', f1_score)):
    print(template % scorer(y_true=y_test, y_pred=y_pred))
from sklearn.metrics import roc_curve, auc
# `scipy.interp` was only an alias of `numpy.interp` and can no longer be
# imported from current SciPy releases; bind the same name from NumPy so the
# script keeps running. It is not used by the ROC code below.
from numpy import interp

fig = plt.figure(figsize=(7, 5))

# The tree is already fitted and its random_state is fixed, so re-fitting it
# here would only rebuild the same model; just ask it for probabilities.
probas = tree.predict_proba(X_test)

# Column 1 of `probas` is used as the score of the positive class.
# NOTE(review): assumes a binary target whose positive label is 1 - confirm
# against the data.
fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)

# ROC curve of the classifier, with its AUC in the legend.
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))

# Diagonal reference line labelled as random guessing.
plt.plot([0, 1], [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='random guessing')

# Top-left corner reference line labelled as perfect performance.
plt.plot([0, 0, 1],
         [0, 1, 1],
         lw=2,
         linestyle=':',
         color='black',
         label='perfect performance')

# Small margins so curves lying on the axes stay visible.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.title('')
plt.legend(loc="lower right")
plt.show()
第六步,生成决策树图。
from sklearn.tree import export_graphviz

# Plain DOT export of the fitted tree. The DOT text is generated first so
# that a failure in export_graphviz does not leave an empty file behind.
dot_data = export_graphviz(tree, out_file=None)
with open('treeone.dot', 'w') as f:
    f.write(dot_data)

import pydotplus
# `sklearn.externals.six` has been removed from scikit-learn, so importing
# StringIO from it fails on current releases; the standard library's
# io.StringIO is a drop-in replacement.
from io import StringIO

# Styled export (named features, filled and rounded nodes) rendered to PDF
# through pydotplus.
dot_data = StringIO()
export_graphviz(
    tree,
    out_file=dot_data,
    # Passed as a plain list rather than a pandas Index.
    # NOTE(review): some scikit-learn releases may validate this argument
    # strictly - confirm against the installed version.
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("treetwo.pdf")
二、随机森林
第一步,随机森林;
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 entropy-based trees, trained with 2 parallel jobs;
# the seed is fixed so the fitted ensemble is reproducible.
forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=1,
    n_jobs=2,
)
forest.fit(X_train, y_train)
第二步,随机森林评价;
# Report the forest's score on the training split and then on the held-out
# test split.
for caption, features, labels in (
        ('Training accuracy:', X_train, y_train),
        ('Test accuracy:', X_test, y_test)):
    print(caption, forest.score(features, labels))
from sklearn.metrics import confusion_matrix

# Confusion matrix of the forest's predictions on the test split.
y_pred = forest.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

# Draw the matrix as a lightly shaded grid and write each count in the
# centre of its cell (x is the column index, y is the row index).
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(confmat):
    ax.text(x=col, y=row, s=count, va='center', ha='center')
ax.set_xlabel('预测类标')
ax.set_ylabel('真实类标')
plt.show()
from sklearn.metrics import precision_score, recall_score, f1_score

# Print precision, recall and F1 for the forest's test-set predictions,
# each formatted to four decimal places.
for template, scorer in (
        ('Precision: %.4f', precision_score),
        ('Recall: %.4f', recall_score),
        ('F1: %.4f', f1_score)):
    print(template % scorer(y_true=y_test, y_pred=y_pred))
from sklearn.metrics import roc_curve, auc
# `scipy.interp` was only an alias of `numpy.interp` and can no longer be
# imported from current SciPy releases; bind the same name from NumPy so the
# script keeps running. It is not used by the ROC code below.
from numpy import interp

fig = plt.figure(figsize=(7, 5))

# The forest is already fitted and its random_state is fixed, so re-fitting
# it here would only rebuild the same model; just ask it for probabilities.
probas = forest.predict_proba(X_test)

# Column 1 of `probas` is used as the score of the positive class.
# NOTE(review): assumes a binary target whose positive label is 1 - confirm
# against the data.
fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)

# ROC curve of the classifier, with its AUC in the legend.
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))

# Diagonal reference line labelled as random guessing.
plt.plot([0, 1], [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='random guessing')

# Top-left corner reference line labelled as perfect performance.
plt.plot([0, 0, 1],
         [0, 1, 1],
         lw=2,
         linestyle=':',
         color='black',
         label='perfect performance')

# Small margins so curves lying on the axes stay visible.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.title('')
plt.legend(loc="lower right")
plt.show()