本篇为李宏毅机器学习第三次作业的内容:不使用 sklearn 包,手写线性回归完成对 PM2.5 的预测。先说明本次代码的不足与欠缺思考之处:首先,对数据特征没有做过多处理,例如异常值处理和标准化;其次,使用的是最简单的一次线性模型,可能存在拟合程度不够的问题;最后,采用的梯度下降方法不够优化,没有使用 Adagrad 方法进行梯度下降。这次的作业和内容还有很多值得完善和思考的地方,但亲手写代码和推导,能让自己对机器学习的理解更加深刻。
"""Hand-written linear regression for PM2.5 forecasting (HW3, Hung-yi Lee ML).

Features: 9 consecutive hourly PM2.5 readings; label: the 10th hour.
Trained with plain batch gradient descent (no sklearn); evaluated with R^2.
"""
# Import the required packages
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt

# Change the default working directory (hard-coded local path)
import os
os.chdir(r'D:\datawale\week1')
os.getcwd()

# Read the training file
data = pd.read_csv('train.csv')
# Drop the unused column
data.drop(['stations'], inplace=True, axis=1)
# Inspect the data
data.head()

# Keep only the PM2.5 rows
data1 = data[data['observation'] == 'PM2.5']
# Drop the first two non-feature columns (date, observation name)
data2 = data1.iloc[:, 2:]
data2.head()

# Build sliding windows over the 24 hourly columns:
# hours i..i+8 are the features, hour i+9 is the label (15 windows per row).
x_train = []
y_train = []
for i in range(15):
    x = data2.iloc[:, i:i + 9]
    # Give every window identical column names so they can be concatenated row-wise
    x.columns = np.array(range(9))
    y = data2.iloc[:, i + 9]
    # NOTE: the original also assigned y.columns, but y is a Series — a Series has
    # no columns, so that assignment only created a stray attribute; removed.
    x_train.append(x)
    y_train.append(y)

# Stack all windows into one training matrix / label vector
x_train_ts = pd.concat(x_train, axis=0)
y_train_ts = pd.concat(y_train, axis=0)
# x_train_ts.describe()

# Convert to float arrays for numeric computation
x_train_ts = np.array(x_train_ts, float)
y_train_ts = np.array(y_train_ts, float)

# Prepend a bias (intercept) column of ones
x_train_ts = np.concatenate((np.ones((x_train_ts.shape[0], 1)), x_train_ts), axis=1)

# Prepare the test set the same way
data_test = pd.read_csv('test(1).csv', header=None)
data_test.head()
data_new = data_test[data_test[1] == 'PM2.5']
x_test = data_new.iloc[:, 2:]
x_test = np.array(x_test, float)
x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
y_data = pd.read_csv('answer.csv', header=0)
y_test = np.array(y_data['value'], float)

# Plain batch gradient descent on the linear model y_hat = X @ w.
# Lesson learned: a learning rate that is too large makes this diverge,
# hence the very small lr below.
w = np.zeros(len(x_train_ts[0]))
lr = 0.0000000001
for i in range(100000):
    y_hat = np.dot(x_train_ts, w)
    errors = y_hat - y_train_ts
    # Gradient of the squared-error loss: 2 * X^T (Xw - y)
    delta_w = 2 * np.dot(x_train_ts.transpose(), errors)
    w = w - lr * delta_w  # update the weights
    # loss = 0.5*(errors**2).sum()
    # Training-set R^2 for progress monitoring
    y_mean = np.mean(y_train_ts)
    SSR = ((y_hat - y_train_ts) ** 2).sum()
    SST = ((y_train_ts - y_mean) ** 2).sum()
    R_square = 1 - SSR / SST
    # BUGFIX: the original `if i000 == 0:` was a NameError (garbled modulo);
    # report progress every 1000 iterations instead.
    if i % 1000 == 0:
        print('第%i轮误差和%.4f' % (i + 1, R_square))
        # plt.scatter(i, R_square, 'red')
print(w)

'''
# Adagrad gradient descent (reference implementation, kept disabled)
w = np.zeros(len(x_train_ts[0]))
lr = 10
Iteration = 10000
sum_gra = np.zeros(len(x_train_ts[0]))  # accumulated squared gradients
for i in range(Iteration):
    y_new = np.dot(x_train_ts, w)
    loss = y_new - y_train_ts
    gra = 2 * np.dot(x_train_ts.transpose(), loss)  # note: must transpose train_x
    sum_gra += gra ** 2
    ada = np.sqrt(sum_gra)
    w = w - lr * gra / ada  # per-coordinate adaptive step
    # BUGFIX: use y_new (this loop's prediction), not the stale y_hat from above
    y_mean = np.mean(y_train_ts)
    SSR = ((y_new - y_train_ts) ** 2).sum()
    SST = ((y_train_ts - y_mean) ** 2).sum()
    R_square = 1 - SSR / SST
    # BUGFIX: the original `if i00 == 0:` was a NameError (garbled modulo)
    if i % 100 == 0:
        print('第%i轮误差和%.4f' % (i + 1, R_square))
w
'''


def test(x_test, y_test):
    """Return the R^2 score of the globally trained weights `w` on (x_test, y_test).

    x_test must already include the bias column; y_test is the true PM2.5 values.
    """
    y_hat = np.dot(x_test, w)
    y_mean = np.mean(y_test)
    SSR = ((y_hat - y_test) ** 2).sum()
    SST = ((y_test - y_mean) ** 2).sum()
    R_square = 1 - SSR / SST
    return R_square


test(x_test, y_test)