Decision Tree


    %matplotlib inline
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    def read_dataset(fname):
        # Use the first column (PassengerId) as the row index
        data = pd.read_csv(fname, index_col=0)
        # Drop columns that are not useful for the model
        data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
        # Encode sex as an integer (male -> 1, female -> 0)
        data['Sex'] = (data['Sex'] == 'male').astype('int')
        # Encode the port of embarkation as an integer label
        labels = data['Embarked'].unique().tolist()
        data['Embarked'] = data['Embarked'].apply(lambda n: labels.index(n))
        # Fill missing values with 0
        data = data.fillna(0)
        return data

    train = read_dataset('datasets/titanic/train.csv')
    train.head()

                 Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
    PassengerId
    1                   0       3    1  22.0      1      0   7.2500         0
    2                   1       1    0  38.0      1      0  71.2833         1
    3                   1       3    0  26.0      0      0   7.9250         0
    4                   1       1    0  35.0      1      0  53.1000         0
    5                   0       3    1  35.0      0      0   8.0500         0

    from sklearn.model_selection import train_test_split

    y = train['Survived'].values
    X = train.drop(['Survived'], axis=1).values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    print('train dataset: {0}; test dataset: {1}'.format(
        X_train.shape, X_test.shape))

    train dataset: (712, 7); test dataset: (179, 7)

    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print('train score: {0}; test score: {1}'.format(train_score, test_score))

    train score: 0.9901685393258427; test score: 0.7821229050279329

    from sklearn.tree import export_graphviz

    with open("titanic.dot", 'w') as f:
        f = export_graphviz(clf, out_file=f)

    # Parameter selection: max_depth
    def cv_score(d):
        clf = DecisionTreeClassifier(max_depth=d)
        clf.fit(X_train, y_train)
        tr_score = clf.score(X_train, y_train)
        cv_score = clf.score(X_test, y_test)
        return (tr_score, cv_score)

    depths = range(2, 15)
    scores = [cv_score(d) for d in depths]
    tr_scores = [s[0] for s in scores]
    cv_scores = [s[1] for s in scores]

    best_score_index = np.argmax(cv_scores)
    best_score = cv_scores[best_score_index]
    best_param = depths[best_score_index]
    print('best param: {0}; best score: {1}'.format(best_param, best_score))

    plt.figure(figsize=(10, 6), dpi=144)
    plt.grid()
    plt.xlabel('max depth of decision tree')
    plt.ylabel('score')
    plt.plot(depths, cv_scores, '.g-', label='cross-validation score')
    plt.plot(depths, tr_scores, '.r--', label='training score')
    plt.legend()

    best param: 4; best score: 0.8268156424581006
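The unconstrained tree fits the training set almost perfectly (train score ≈ 0.99) but drops to about 0.78 on the held-out set, which is the overfitting that the max_depth sweep above tries to rein in. Before tuning further, it can help to see which features the fitted tree actually splits on. The following is a minimal sketch, not part of the original notebook; it assumes the fitted `clf` and the preprocessed `train` DataFrame from the cells above and relies on the standard `feature_importances_` attribute of a fitted `DecisionTreeClassifier`.

    # Illustrative sketch (not in the original notebook): rank features by the
    # impurity-based importance of the fitted tree. Assumes `clf` is fitted and
    # `train` is the preprocessed training DataFrame from read_dataset() above;
    # dropping 'Survived' reproduces the column order used to build X.
    feature_names = train.drop(['Survived'], axis=1).columns
    ranked = sorted(zip(feature_names, clf.feature_importances_),
                    key=lambda t: t[1], reverse=True)
    for name, importance in ranked:
        print('{0:10s} {1:.4f}'.format(name, importance))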

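The max_depth sweep relies on a single random train/test split, so the reported best depth can change from run to run. A cross-validated variant of the same sweep, sketched below purely as an illustration (it assumes `X`, `y` and the imports from above), averages the score over five folds and is closer in spirit to the GridSearchCV runs later in the post.

    # Illustrative sketch: evaluate each candidate depth with 5-fold
    # cross-validation instead of one fixed train/test split.
    from sklearn.model_selection import cross_val_score

    for d in range(2, 15):
        fold_scores = cross_val_score(DecisionTreeClassifier(max_depth=d), X, y, cv=5)
        print('max_depth={0:2d}: mean cv score={1:.4f}'.format(d, fold_scores.mean()))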
    # Train a model for a given impurity threshold and compute its scores
    def cv_score(val):
        clf = DecisionTreeClassifier(criterion='gini', min_impurity_split=val)
        clf.fit(X_train, y_train)
        tr_score = clf.score(X_train, y_train)
        cv_score = clf.score(X_test, y_test)
        return (tr_score, cv_score)

    # Sweep a range of threshold values, training one model per value
    values = np.linspace(0, 0.5, 50)
    scores = [cv_score(v) for v in values]
    tr_scores = [s[0] for s in scores]
    cv_scores = [s[1] for s in scores]

    # Pick the parameter value with the highest cross-validation score
    best_score_index = np.argmax(cv_scores)
    best_score = cv_scores[best_score_index]
    best_param = values[best_score_index]
    print('best param: {0}; best score: {1}'.format(best_param, best_score))

    # Plot the relationship between the parameter value and the scores
    plt.figure(figsize=(10, 6), dpi=144)
    plt.grid()
    plt.xlabel('threshold of entropy')
    plt.ylabel('score')
    plt.plot(values, cv_scores, '.g-', label='cross-validation score')
    plt.plot(values, tr_scores, '.r--', label='training score')
    plt.legend()

    best param: 0.17346938775510204; best score: 0.8212290502793296
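A caveat for anyone re-running this code: `min_impurity_split` was deprecated in scikit-learn 0.19 and removed in 1.0, so the cell above only works on older versions. The replacement parameter, `min_impurity_decrease`, has different semantics (the minimum impurity reduction a split must achieve, rather than a stopping threshold on node impurity), so the sweep below is only a rough analogue, and its value range is an illustrative guess rather than a tuned choice.

    # Rough analogue for scikit-learn >= 1.0, where min_impurity_split no longer
    # exists. min_impurity_decrease is not a drop-in replacement, so the range
    # below is an assumption for illustration only.
    def cv_score_decrease(val):
        clf = DecisionTreeClassifier(criterion='gini', min_impurity_decrease=val)
        clf.fit(X_train, y_train)
        return (clf.score(X_train, y_train), clf.score(X_test, y_test))

    decrease_values = np.linspace(0, 0.01, 50)
    decrease_scores = [cv_score_decrease(v) for v in decrease_values]
    best_index = np.argmax([s[1] for s in decrease_scores])
    print('best min_impurity_decrease: {0}; test score: {1}'.format(
        decrease_values[best_index], decrease_scores[best_index][1]))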

    def plot_curve(train_sizes, cv_results, xlabel):
        train_scores_mean = cv_results['mean_train_score']
        train_scores_std = cv_results['std_train_score']
        test_scores_mean = cv_results['mean_test_score']
        test_scores_std = cv_results['std_test_score']
        plt.figure(figsize=(10, 6), dpi=144)
        plt.title('parameters tuning')
        plt.grid()
        plt.xlabel(xlabel)
        plt.ylabel('score')
        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1, color="r")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, '.--', color="r",
                 label="Training score")
        plt.plot(train_sizes, test_scores_mean, '.-', color="g",
                 label="Cross-validation score")
        plt.legend(loc="best")

    from sklearn.model_selection import GridSearchCV

    thresholds = np.linspace(0, 0.5, 50)
    # Set the parameters by cross-validation
    param_grid = {'min_impurity_split': thresholds}

    clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5,
                       return_train_score=True)
    clf.fit(X, y)
    print("best param: {0}\nbest score: {1}".format(clf.best_params_,
                                                    clf.best_score_))
    plot_curve(thresholds, clf.cv_results_, xlabel='gini thresholds')

    best param: {'min_impurity_split': 0.22448979591836732}
    best score: 0.8181818181818182
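GridSearchCV stores per-candidate statistics in `cv_results_`, which is exactly what `plot_curve` consumes. Since several neighbouring thresholds often score almost identically, it can be worth listing the runner-up settings as well as the single best one. A minimal sketch, assuming the fitted `clf` (the GridSearchCV object) and `pd` from the cells above:

    # Illustrative sketch: show the five best-ranked candidates from the search.
    # cv_results_ is a dict of arrays, so it converts directly to a DataFrame.
    results = pd.DataFrame(clf.cv_results_)
    top5 = results.sort_values('rank_test_score').head(5)
    print(top5[['params', 'mean_test_score', 'std_test_score']])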

    from sklearn.model_selection import GridSearchCV

    entropy_thresholds = np.linspace(0, 1, 50)
    gini_thresholds = np.linspace(0, 0.5, 50)

    # Set the parameters by cross-validation
    param_grid = [{'criterion': ['entropy'], 'min_impurity_split': entropy_thresholds},
                  {'criterion': ['gini'], 'min_impurity_split': gini_thresholds},
                  {'max_depth': range(2, 10)},
                  {'min_samples_split': range(2, 30, 2)}]

    clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5,
                       return_train_score=True)
    clf.fit(X, y)
    print("best param: {0}\nbest score: {1}".format(clf.best_params_,
                                                    clf.best_score_))

    best param: {'criterion': 'entropy', 'min_impurity_split': 0.5306122448979591}
    best score: 0.8271604938271605

    print(clf.best_params_)

    {'criterion': 'entropy', 'min_impurity_split': 0.5306122448979591}

    clf = DecisionTreeClassifier(criterion='entropy',
                                 min_impurity_split=0.002857142857142857)
    model = clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print('train score: {0}; test score: {1}'.format(train_score, test_score))

    # Export the titanic.dot file
    with open("titanic.dot", 'w') as f:
        f = export_graphviz(clf, out_file=f)

    # 1. Install graphviz on your machine
    # 2. Run `dot -Tpng titanic.dot -o titanic.png`
    # 3. Open the generated titanic.png in the current directory to view the tree

    train score: 0.9901685393258427; test score: 0.776536312849162

    def read_dataset(fname):
        # Use the first column (PassengerId) as the row index
        data = pd.read_csv(fname, index_col=0)
        # Drop columns that are not useful for the model
        data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
        # Encode sex as an integer (male -> 1, female -> 0)
        data['Sex'] = (data['Sex'] == 'male').astype('int')
        # Encode the port of embarkation as an integer label
        labels = data['Embarked'].unique().tolist()
        data['Embarked'] = data['Embarked'].apply(lambda n: labels.index(n))
        # Fill missing values with 0
        data = data.fillna(0)
        return data

    train = read_dataset('datasets/titanic/test.csv')
    x_test = train.values

    clf = DecisionTreeClassifier(criterion='entropy',
                                 min_impurity_split=0.002857142857142857)
    # clf.fit(x_test)
    ans = model.predict(x_test)
    tab_2 = pd.DataFrame({'Survived': ans}, index=train.index)
    tab_2.to_csv('1.csv')
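Rendering `titanic.dot` requires the external Graphviz binaries. If they are not installed, scikit-learn's own `plot_tree` (added in version 0.21) can draw the fitted tree directly with matplotlib. The sketch below is illustrative only: `model` is the classifier fitted above, the feature-name list is written out by hand in the column order produced by `read_dataset`, and the class names assume the usual label encoding (0 = did not survive, 1 = survived).

    # Illustrative sketch: draw the fitted tree with matplotlib instead of the
    # dot/PNG toolchain. Requires scikit-learn >= 0.21 for plot_tree.
    from sklearn.tree import plot_tree

    feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    plt.figure(figsize=(16, 8), dpi=144)
    plot_tree(model, feature_names=feature_names,
              class_names=['Not survived', 'Survived'],  # assumes 0/1 label order
              filled=True, max_depth=3)                  # only the top levels, for readability
    plt.show()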