Source: kaggle
Machine Learning Micro-Course Home Page
Here’s the code you’ve written so far. Start by running it again.
# Code you have previously used to load data
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Path of the file to read. We changed the directory structure to simplify submitting to a competition
iowa_file_path = 'train.csv'

home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

# Using best value for max_leaf_nodes
iowa_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)
print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

Validation MAE when not specifying max_leaf_nodes: 29,653
Validation MAE for best value of max_leaf_nodes: 27,283
Validation MAE for Random Forest Model: 22,762
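The value max_leaf_nodes=100 comes from the tuning step earlier in the micro-course, where candidate leaf budgets are compared by validation MAE. As a reminder, here is a minimal sketch of that kind of search, reusing the train_X/val_X split defined above; the get_mae helper and the candidate list are illustrative reconstructions of that earlier exercise, not code from this post:

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Fit a tree with a given leaf budget and return its validation MAE
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds = model.predict(val_X)
    return mean_absolute_error(val_y, preds)

# Try a handful of candidate sizes and keep the one with the lowest validation MAE
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y)
          for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)
print("Best max_leaf_nodes: {}  (MAE {:,.0f})".format(best_tree_size, scores[best_tree_size]))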
Build a Random Forest model and train it on all of X and y.

# To improve accuracy, create a new Random Forest model which you will train on all training data
rf_model_on_full_data = RandomForestRegressor(random_state=1)

# fit rf_model_on_full_data on all data from the training data
rf_model_on_full_data.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)
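Once the forest is refit on all of X and y there is no held-out split left to score it on. If you still want an error estimate for the full-data configuration, cross-validation is one option; this is not part of the original exercise, just a minimal sketch assuming the X and y defined above:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated MAE for the same model configuration
cv_model = RandomForestRegressor(random_state=1)
cv_scores = -cross_val_score(cv_model, X, y, cv=5,
                             scoring='neg_mean_absolute_error')
print("Cross-validated MAE: {:,.0f}".format(cv_scores.mean()))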
Read the file of "test" data and apply your model to make predictions.

# path to file you will use for predictions
test_data_path = 'test.csv'

# read test data file using pandas
test_data = pd.read_csv(test_data_path)

# create test_X which comes from test_data but includes only the columns you used for prediction.
# The list of columns is stored in a variable called features
test_X = test_data[features]

# make predictions which we will submit, using the model trained on all of X and y
test_preds = rf_model_on_full_data.predict(test_X)

# The lines below show how to save predictions in the format used for competition scoring
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)

Kaggle really is a good platform for learning.
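As a final aside, before uploading submission.csv it can be worth a quick sanity check that the file has the two columns this competition scores (Id and SalePrice) and one row per test house. This check is not part of the original notebook; it simply re-reads the file written by the cell above:

import pandas as pd

# Re-read the freshly written submission file and confirm its shape and columns
submission = pd.read_csv('submission.csv')
print(submission.shape)               # expect (number of test rows, 2)
print(submission.columns.tolist())    # expect ['Id', 'SalePrice']
print(submission.head())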