# Load the data and preprocess it.
# NOTE(review): this part of the file uses `np` and `pd` without importing
# them; the imports are added here and are harmless if an earlier part of
# the file already performs them.
import numpy as np
import pandas as pd

data = pd.read_csv('housing.csv')
prices = data['MEDV']                 # the column predicted further down
features = data.drop('MEDV', axis=1)  # every column except 'MEDV'

# Inspect the data: summary statistics of the prices.
minimum_price = np.min(prices)    # smallest price
maximum_price = np.max(prices)    # largest price
mean_price = np.mean(prices)      # arithmetic mean
median_price = np.median(prices)  # median
std_price = np.std(prices)        # standard deviation

# Print the computed statistics.
print("Statistics for Boston housing dataset:\n")
print("Minimum price: ${:,.2f}".format(minimum_price))
print("Maximum price: ${:,.2f}".format(maximum_price))
print("Mean price: ${:,.2f}".format(mean_price))
print("Median price ${:,.2f}".format(median_price))
print("Standard deviation of prices: ${:,.2f}".format(std_price))

# Next: use scatter plots to look at the relationship between each feature
# and the label.
import matplotlib.pyplot as plt

# Pull out the label column and the three feature columns to plot.
medv = data['MEDV']
rm = data['RM']
lstat = data['LSTAT']
ptratio = data['PTRATIO']

# One scatter plot per feature (x axis) against MEDV (y axis), each in its
# own colour; every figure is shown before the next one is drawn.
for feature_values, color in ((rm, 'b'), (lstat, 'c'), (ptratio, 'g')):
    plt.scatter(feature_values, medv, c=color)
    plt.show()

# Next: choose the metric used to score predictions; R^2 is used.
from sklearn.metrics import r2_score


def performance_metric(y_true, y_predict):
    """Return the R^2 score of the predicted values against the true values.

    Args:
        y_true: The true target values.
        y_predict: The predicted target values.

    Returns:
        Whatever ``sklearn.metrics.r2_score`` returns for the two inputs,
        called with ``sample_weight=None`` and ``multioutput=None``.
    """
    return r2_score(
        y_true,
        y_predict,
        sample_weight=None,
        multioutput=None,
    )


# Next: build the prediction model, using GridSearchCV to find the best
# decision tree model.
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor


def fit_model(X, y):
    """Find the best decision tree model for the data [X, y] by grid search.

    The search tries ``max_depth`` values 1 through 10 for a
    ``DecisionTreeRegressor``, scores each candidate with
    ``performance_metric`` (wrapped by ``make_scorer``) and evaluates it
    with 10-fold cross-validation without shuffling.

    Args:
        X: Feature data handed to ``GridSearchCV.fit``.
        y: Target values handed to ``GridSearchCV.fit``.

    Returns:
        The ``best_estimator_`` attribute of the fitted grid search.
    """
    cross_validator = KFold(n_splits=10, shuffle=False, random_state=None)
    regressor = DecisionTreeRegressor()
    params = {'max_depth': list(range(1, 11))}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(
        estimator=regressor,
        param_grid=params,
        scoring=scoring_fnc,
        cv=cross_validator,
    )

    # Run the grid search on the input data [X, y]. This call had been fused
    # into the preceding comment, so the search was never actually fitted.
    grid = grid.fit(X, y)

    # Return the best model found by the grid search.
    return grid.best_estimator_
# Split the dataset into a training set and a test set with train_test_split.
from sklearn.model_selection import train_test_split

# test_size=0.2 and random_state=0 are passed straight through to
# train_test_split; the four results are unpacked in the order it returns them.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    prices,
    test_size=0.2,
    random_state=0,
)
print("Train test split success!")