%matplotlib inline
import matplotlib
.pyplot
as plt
import numpy
as np
import pandas
as pd
def read_dataset(fname
):
data
= pd
.read_csv
(fname
, index_col
=0)
data
.drop
(['Name', 'Ticket', 'Cabin'], axis
=1, inplace
=True)
data
['Sex'] = (data
['Sex'] == 'male').astype
('int')
labels
= data
['Embarked'].unique
().tolist
()
data
['Embarked'] = data
['Embarked'].apply(lambda n
: labels
.index
(n
))
data
= data
.fillna
(0)
return data
train
= read_dataset
('datasets/titanic/train.csv')
train
.head
()
SurvivedPclassSexAgeSibSpParchFareEmbarkedPassengerId
103122.0107.25000211038.01071.28331313026.0007.92500411035.01053.10000503135.0008.05000
from sklearn
.model_selection
import train_test_split
y
= train
['Survived'].values
X
= train
.drop
(['Survived'], axis
=1).values
X_train
, X_test
, y_train
, y_test
= train_test_split
(X
, y
, test_size
=0.2)
print('train dataset: {0}; test dataset: {1}'.format(
X_train
.shape
, X_test
.shape
))
train dataset: (712, 7); test dataset: (179, 7)
from sklearn
.tree
import DecisionTreeClassifier
clf
= DecisionTreeClassifier
()
clf
.fit
(X_train
, y_train
)
train_score
= clf
.score
(X_train
, y_train
)
test_score
= clf
.score
(X_test
, y_test
)
print('train score: {0}; test score: {1}'.format(train_score
, test_score
))
train score: 0.9901685393258427; test score: 0.7821229050279329
from sklearn
.tree
import export_graphviz
with open("titanic.dot", 'w') as f
:
f
= export_graphviz
(clf
, out_file
=f
)
def cv_score(d
):
clf
= DecisionTreeClassifier
(max_depth
=d
)
clf
.fit
(X_train
, y_train
)
tr_score
= clf
.score
(X_train
, y_train
)
cv_score
= clf
.score
(X_test
, y_test
)
return (tr_score
, cv_score
)
depths
= range(2, 15)
scores
= [cv_score
(d
) for d
in depths
]
tr_scores
= [s
[0] for s
in scores
]
cv_scores
= [s
[1] for s
in scores
]
best_score_index
= np
.argmax
(cv_scores
)
best_score
= cv_scores
[best_score_index
]
best_param
= depths
[best_score_index
]
print('best param: {0}; best score: {1}'.format(best_param
, best_score
))
plt
.figure
(figsize
=(10, 6), dpi
=144)
plt
.grid
()
plt
.xlabel
('max depth of decision tree')
plt
.ylabel
('score')
plt
.plot
(depths
, cv_scores
, '.g-', label
='cross-validation score')
plt
.plot
(depths
, tr_scores
, '.r--', label
='training score')
plt
.legend
()
best param: 4; best score: 0.8268156424581006
<matplotlib.legend.Legend at 0x19bd6cbb4e0>
def cv_score(val
):
clf
= DecisionTreeClassifier
(criterion
='gini',min_impurity_split
=val
)
clf
.fit
(X_train
, y_train
)
tr_score
= clf
.score
(X_train
, y_train
)
cv_score
= clf
.score
(X_test
, y_test
)
return (tr_score
, cv_score
)
values
= np
.linspace
(0, 0.5, 50)
scores
= [cv_score
(v
) for v
in values
]
tr_scores
= [s
[0] for s
in scores
]
cv_scores
= [s
[1] for s
in scores
]
best_score_index
= np
.argmax
(cv_scores
)
best_score
= cv_scores
[best_score_index
]
best_param
= values
[best_score_index
]
print('best param: {0}; best score: {1}'.format(best_param
, best_score
))
plt
.figure
(figsize
=(10, 6), dpi
=144)
plt
.grid
()
plt
.xlabel
('threshold of entropy')
plt
.ylabel
('score')
plt
.plot
(values
, cv_scores
, '.g-', label
='cross-validation score')
plt
.plot
(values
, tr_scores
, '.r--', label
='training score')
plt
.legend
()
best param: 0.17346938775510204; best score: 0.8212290502793296
<matplotlib.legend.Legend at 0x19be2859e80>
def plot_curve(train_sizes
, cv_results
, xlabel
):
train_scores_mean
= cv_results
['mean_train_score']
train_scores_std
= cv_results
['std_train_score']
test_scores_mean
= cv_results
['mean_test_score']
test_scores_std
= cv_results
['std_test_score']
plt
.figure
(figsize
=(10, 6), dpi
=144)
plt
.title
('parameters turning')
plt
.grid
()
plt
.xlabel
(xlabel
)
plt
.ylabel
('score')
plt
.fill_between
(train_sizes
,
train_scores_mean
- train_scores_std
,
train_scores_mean
+ train_scores_std
,
alpha
=0.1, color
="r")
plt
.fill_between
(train_sizes
,
test_scores_mean
- test_scores_std
,
test_scores_mean
+ test_scores_std
,
alpha
=0.1, color
="g")
plt
.plot
(train_sizes
, train_scores_mean
, '.--', color
="r",
label
="Training score")
plt
.plot
(train_sizes
, test_scores_mean
, '.-', color
="g",
label
="Cross-validation score")
plt
.legend
(loc
="best")
from sklearn
.model_selection
import GridSearchCV
thresholds
= np
.linspace
(0, 0.5, 50)
param_grid
= {'min_impurity_split': thresholds
}
clf
= GridSearchCV
(DecisionTreeClassifier
(), param_grid
, cv
=5, return_train_score
=True)
clf
.fit
(X
, y
)
print("best param: {0}\nbest score: {1}".format(clf
.best_params_
,
clf
.best_score_
))
plot_curve
(thresholds
, clf
.cv_results_
, xlabel
='gini thresholds')
best param: {'min_impurity_split': 0.22448979591836732}
best score: 0.8181818181818182
from sklearn
.model_selection
import GridSearchCV
entropy_thresholds
= np
.linspace
(0, 1, 50)
gini_thresholds
= np
.linspace
(0, 0.5, 50)
param_grid
= [{'criterion': ['entropy'],
'min_impurity_split': entropy_thresholds
},
{'criterion': ['gini'],
'min_impurity_split': gini_thresholds
},
{'max_depth': range(2, 10)},
{'min_samples_split': range(2, 30, 2)}]
clf
= GridSearchCV
(DecisionTreeClassifier
(), param_grid
, cv
=5, return_train_score
=True)
clf
.fit
(X
, y
)
print("best param: {0}\nbest score: {1}".format(clf
.best_params_
,
clf
.best_score_
))
best param: {'criterion': 'entropy', 'min_impurity_split': 0.5306122448979591}
best score: 0.8271604938271605
print(clf
.best_params_
)
{'criterion': 'entropy', 'min_impurity_split': 0.5306122448979591}
clf
= DecisionTreeClassifier
(criterion
='entropy', min_impurity_split
=0.002857142857142857)
model
=clf
.fit
(X_train
, y_train
)
train_score
= clf
.score
(X_train
, y_train
)
test_score
= clf
.score
(X_test
, y_test
)
print('train score: {0}; test score: {1}'.format(train_score
, test_score
))
with open("titanic.dot", 'w') as f
:
f
= export_graphviz
(clf
, out_file
=f
)
train score: 0.9901685393258427; test score: 0.776536312849162
def read_dataset(fname
):
data
= pd
.read_csv
(fname
, index_col
=0)
data
.drop
(['Name', 'Ticket', 'Cabin'], axis
=1, inplace
=True)
data
['Sex'] = (data
['Sex'] == 'male').astype
('int')
labels
= data
['Embarked'].unique
().tolist
()
data
['Embarked'] = data
['Embarked'].apply(lambda n
: labels
.index
(n
))
data
= data
.fillna
(0)
return data
train
= read_dataset
('datasets/titanic/test.csv')
x_test
= train
.values
clf
= DecisionTreeClassifier
(criterion
='entropy', min_impurity_split
=0.002857142857142857)
ans
=model
.predict
(x_test
)
tab_2
=pd
.DataFrame
({'Survived':ans
},index
=train
.index
)
tab_2
.to_csv
('1.csv')