Task 3: Predicting PM2.5


    李宏毅_Machine Learning_2019 Task 3 check-in notes. Complete this assignment according to the requirements in Homework1_Introduction.txt.

    Assignment 1: Predict PM2.5. In this assignment we use gradient descent to predict PM2.5 values (a regression problem).

    Homework1 requirements:
    - Python 3.5+; only numpy, scipy, and pandas may be used.
    - Implement linear regression with gradient descent by hand.
    - Preferably beat the Public Simple Baseline.
    - For those who want to load a model rather than rerun the whole training process: upload your training code named train.py; only the gradient descent code is required.

    Homework_best requirements:
    - Python 3.5+; any library may be used.
    - Achieve a higher score of your choosing on Kaggle.

    Data introduction: this assignment uses observation records from the 豐原 station, split into a train set and a test set. The train set is all data from the first 20 days of each month at the station; the test set is sampled from the station's remaining data.
    - train.csv: hourly weather data for the first 20 days of each month (18 measurements per hour), covering all 12 months.
    - test.csv: from the remaining data, consecutive 10-hour segments are sampled; all observations from the first nine hours serve as the features, and the tenth hour's PM2.5 is the answer. There are 240 non-overlapping test samples; predict PM2.5 for each of them from its features.

    After finishing, consult the following materials:
    Sample_code: https://ntumlta.github.io/2017fall-ml-hw1/code.html
    Supplementary_Slide: https://docs.google.com/presentation/d/1WwIQAVI0RRA6tpcieynPVoYDuMmuVKGvVNF_DSKIiDI/edit#slide=id.g1ef6d808f1_2_0
    Reference answers are in answer.csv.
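    The core of Homework1 is the hand-written update: batch gradient descent on the mean squared error of a linear model, theta <- theta - eta * (2/N) * X_b^T (X_b theta - y), where X_b is the feature matrix with a bias column prepended. A minimal numpy sketch of just this update (function and variable names here are illustrative, not part of the assignment):

    import numpy as np

    def gradient_descent(X, y, eta=0.01, n_iters=10000):
        """Batch gradient descent for linear regression with MSE loss.
        X: (N, d) features, y: (N,) targets; returns theta of shape (d + 1,)
        whose first entry is the bias."""
        X_b = np.hstack([np.ones((len(X), 1)), X])  # prepend a bias column
        theta = np.zeros(X_b.shape[1])
        for _ in range(n_iters):
            # gradient of J(theta) = (1/N) * ||X_b @ theta - y||^2
            grad = 2.0 / len(y) * X_b.T @ (X_b @ theta - y)
            theta -= eta * grad
        return theta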

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler  # note: sklearn is only allowed under the Homework_best rules

    train = pd.read_csv('train.csv', engine='python', encoding='gbk')
    test = pd.read_csv('test.csv', engine='python', encoding='gbk')

    # Keep only the PM2.5 rows. test.csv has no header row, so its first
    # measurement column is read in under the 'AMB_TEMP' label.
    train = train[train['observation'] == 'PM2.5']
    test = test[test['AMB_TEMP'] == 'PM2.5']

    train = train.drop(['Date', 'stations', 'observation'], axis=1)
    test_x = test.iloc[:, 2:]  # the 9 hourly feature columns

    # Slide a 10-hour window over each day's 24 hourly readings:
    # hours i..i+8 are the features, hour i+9 is the label.
    train_x = []
    train_y = []
    for i in range(15):
        x = train.iloc[:, i:i + 9]
        x.columns = np.array(range(9))  # align column names so concat stacks windows row-wise
        y = train.iloc[:, i + 9]
        train_x.append(x)
        train_y.append(y)
    train_x = pd.concat(train_x)
    train_y = pd.concat(train_y)

    train_y = np.array(train_y, float)
    test_x = np.array(test_x, float)

    # Standardize features. Fit the scaler on the training data only and
    # reuse it on the test data, so both share the same transformation.
    ss = StandardScaler()
    ss.fit(train_x)
    train_x = ss.transform(train_x)
    test_x = ss.transform(test_x)

    def r2_score(y_true, y_predict):
        mse = np.sum((y_true - y_predict) ** 2) / len(y_true)
        return 1 - mse / np.var(y_true)

    class LinearRegression:
        def __init__(self):
            self.coef_ = None
            self.intercept_ = None
            self._theta = None

        def fit_normal(self, X_train, y_train):
            """Closed-form fit via the normal equation."""
            assert X_train.shape[0] == y_train.shape[0], \
                "the size of X_train must be equal to the size of y_train"
            X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
            self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
            self.intercept_ = self._theta[0]
            self.coef_ = self._theta[1:]
            return self

        def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
            """Fit by batch gradient descent on the MSE loss."""
            assert X_train.shape[0] == y_train.shape[0], \
                "the size of X_train must be equal to the size of y_train"

            def J(theta, X_b, y):
                # MSE loss; report inf if the computation blows up
                try:
                    return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
                except Exception:
                    return float('inf')

            def dJ(theta, X_b, y):
                # gradient of the MSE loss
                return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

            def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
                # Iterate until the loss change drops below epsilon or
                # the iteration budget runs out.
                theta = initial_theta
                cur_iter = 0
                while cur_iter < n_iters:
                    gradient = dJ(theta, X_b, y)
                    last_theta = theta
                    theta = theta - eta * gradient
                    if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                        break
                    cur_iter += 1
                return theta

            X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
            initial_theta = np.zeros(X_b.shape[1])
            self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
            self.intercept_ = self._theta[0]
            self.coef_ = self._theta[1:]
            return self

        def predict(self, X_predict):
            assert self.intercept_ is not None and self.coef_ is not None, \
                "must fit before predict!"
            assert X_predict.shape[1] == len(self.coef_), \
                "the feature number of X_predict must be equal to X_train"
            X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
            return X_b.dot(self._theta)

        def score(self, X_test, y_test):
            y_predict = self.predict(X_test)
            return r2_score(y_test, y_predict)

        def __repr__(self):
            return "LR()"

    LR = LinearRegression().fit_gd(train_x, train_y)
    print(LR.score(train_x, train_y))  # R^2 on the training set

    result = LR.predict(test_x)
    sampleSubmission = pd.read_csv('sampleSubmission.csv', engine='python', encoding='gbk')
    sampleSubmission['value'] = result
    sampleSubmission.to_csv('result.csv', index=False)
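    Since answer.csv with the reference answers is provided, the 240 predictions can be sanity-checked offline before submitting to Kaggle. A minimal sketch, assuming answer.csv uses the same 'value' column layout as sampleSubmission.csv (that column name is an assumption; adjust it if the file differs):

    # Hypothetical offline check against the reference answers.
    # Assumes answer.csv has a 'value' column like sampleSubmission.csv.
    answer = pd.read_csv('answer.csv', engine='python', encoding='gbk')
    rmse = np.sqrt(np.mean((result - np.array(answer['value'], float)) ** 2))
    print('RMSE vs. answer.csv:', rmse)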