5 rows × 27 columns
# Inspect the raw training table: column names, non-null counts and dtypes.
train.info()
# Recorded output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4320 entries, 0 to 4319
# Data columns (total 27 columns):
# Date           4320 non-null object
# stations       4320 non-null object
# observation    4320 non-null object
# 0 ... 23       4320 non-null object   (one identical line per column 0-23)
# dtypes: object(27)
# memory usage: 911.3+ KB

# List the distinct measurement types stored in the `observation` column.
train.observation.unique()
# Recorded output:
# array(['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10',
#        'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC',
#        'WIND_SPEED', 'WS_HR'], dtype=object)

# Keep only the PM2.5 rows.
train_PM = train[train.observation == 'PM2.5']
train_PM.head()
# Recorded output (columns 7-13 elided by the display):
#         Date stations observation   0   1   2   3   4   5   6 ...  14  15  16  17  18  19  20  21  22  23
# 9   2014/1/1  station       PM2.5  26  39  36  35  31  28  25 ...  36  45  42  49  45  44  41  30  24  13
# 27  2014/1/2  station       PM2.5  21  23  30  30  22  18  13 ...  53  43  43  45  46  32  16  19  22  26
# 45  2014/1/3  station       PM2.5  19  25  27  20  16  14  15 ...  32  36  34  45  40  41  23  29  23  37
# 63  2014/1/4  station       PM2.5  27  27  14  20  22  24  26 ...  62  55  56  67  78  83  90  75  85  82
# 81  2014/1/5  station       PM2.5  80  80  76  81  75  66  70 ...  64  73  57  57  53  70  70  60  68  66
# 5 rows × 27 columns
# Drop the first three columns and keep the remaining 24 columns, labelled
# 0-23 in the recorded output below.
PM_data = train_PM.iloc[:, 3:]
PM_data.head()
# Recorded output (columns 10-13 elided by the display):
#      0   1   2   3   4   5   6   7   8   9 ...  14  15  16  17  18  19  20  21  22  23
# 9   26  39  36  35  31  28  25  20  19  30 ...  36  45  42  49  45  44  41  30  24  13
# 27  21  23  30  30  22  18  13  13  11  22 ...  53  43  43  45  46  32  16  19  22  26
# 45  19  25  27  20  16  14  15   8   4   9 ...  32  36  34  45  40  41  23  29  23  37
# 63  27  27  14  20  22  24  26  33  48  50 ...  62  55  56  67  78  83  90  75  85  82
# 81  80  80  76  81  75  66  70  65  66  57 ...  64  73  57  57  53  70  70  60  68  66
# 5 rows × 24 columns
# Cast every column to float so the readings can be used in arithmetic.
PM_data = PM_data.astype(float)
PM_data.info()
# Recorded output:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 240 entries, 9 to 4311
# Data columns (total 24 columns):
# 0 ... 23    240 non-null float64   (one identical line per column 0-23)
# dtypes: float64(24)
# memory usage: 46.9 KB

# Build training samples with a sliding window across the columns of each
# row: `window` consecutive columns are the features and the column that
# follows them is the target.
window = 9
train_x = []
train_y = []
# Derive the number of windows from the data (24 - 9 = 15 for the recorded
# run) instead of hard-coding it.
for i in range(PM_data.shape[1] - window):
    x = PM_data.iloc[:, i:i + window].copy()
    # Give every window the same column labels so that pd.concat stacks them
    # row-wise instead of aligning on the original column labels.
    x.columns = np.arange(window)
    y = PM_data.iloc[:, i + window]  # a Series; it has no columns to rename
    train_x.append(x)
    train_y.append(y)
train_x = pd.concat(train_x, axis=0)
train_y = pd.concat(train_y, axis=0)
train_x.info()
# Recorded output:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 3600 entries, 9 to 4311
# Data columns (total 9 columns):
# 0 ... 8    3600 non-null float64   (one identical line per column 0-8)
# dtypes: float64(9)
# memory usage: 281.2 KB
train_x.shape
# (3600, 9)
train_y.shape
# (3600,)

# Switch to plain NumPy arrays and prepend a column of ones so that w[0]
# acts as the bias term.
train_x = np.array(train_x)
train_y = np.array(train_y)
train_x = np.concatenate((np.ones((train_x.shape[0], 1)), train_x), axis=1)
train_x.shape
# (3600, 10)

# Linear regression fitted with Adagrad.
w = np.zeros(train_x.shape[1])
lr = 10
Iteration = 10000
# Small constant added to the denominator so a gradient component that is
# exactly zero does not turn the update into 0/0 (NaN).
eps = 1e-8
# Running sum of squared gradients, one entry per weight.
sum_gra = np.zeros(train_x.shape[1])
for i in range(Iteration):
    y_new = np.dot(train_x, w)
    loss = y_new - train_y  # residuals (prediction minus target)
    # X^T * loss is the gradient in matrix form.
    gra = np.dot(train_x.transpose(), loss)
    # Accumulate the squares of all gradients seen so far.
    sum_gra += gra ** 2
    ada = np.sqrt(sum_gra)
    w = w - lr * gra / (ada + eps)
w
# Recorded output of the original run (the array continues past the end of
# this block):
# array([ 2.15246702,  0.00728964, -0.04603067,  0.19941492, -0.20757788, -0.04384333,
# Remaining entries of a recorded array printout that begins before this block:
#         0.46235285, -0.54329351,  0.01552538,  1.07716609])

# Read the test file without treating its first line as a header. The
# recorded output of the original run (below) shows the first record
# (id_0, AMB_TEMP, 15, 14, ...) being consumed as column names, which drops
# that record and leaves the measurement-type column named 'AMB_TEMP'.
df_test = pd.read_csv('test(1).csv', header=None)
df_test.shape
# Recorded output of the original run (first record used as header):
# (4319, 11)

# With header=None the columns are labelled by position; the original run
# filtered on the column it called 'AMB_TEMP', which is position 1.
test_PM = df_test[df_test[1] == 'PM2.5']
test_PM.head()
# Recorded output of the original run (first record used as header):
#     id_0 AMB_TEMP  15  14 14.1  13 13.1 13.2 13.3 13.4  12
# 8   id_0    PM2.5  27  13   24  29   41   30   29   27  28
# 26  id_1    PM2.5  46  47   57  78   84   76   59   61  61
# 44  id_2    PM2.5  10  10   25  34   40   39   36   25  22
# 62  id_3    PM2.5  71  58   51  41   41   46   43   34  29
# 80  id_4    PM2.5  13  23   18  10    5    5   13    9  12

# Take every column from position 2 onwards as float features.
test_x = np.array(test_PM.iloc[:, 2:], float)
test_x.shape
# (240, 9)
# Prepend a column of ones before multiplying by the weight vector `w`.
test_x = np.concatenate([np.ones((test_x.shape[0], 1)), test_x], axis=1)
test_x.shape
# (240, 10)

# Predict with the weight vector `w`.
y_pre = np.dot(test_x, w)

y_submit = pd.read_csv('sampleSubmission.csv')
y_submit.head()
# Recorded output:
#      id      value
# 0  id_0  27.414421
# 1  id_1  61.555764
# 2  id_2  20.498032
# 3  id_3  29.534434
# 4  id_4  10.797670

# Use item assignment: attribute-style assignment would silently set a plain
# attribute instead of a column if 'value' were ever missing.
y_submit['value'] = y_pre
# NOTE(review): this overwrites the sample submission file in place.
y_submit.to_csv('sampleSubmission.csv', index=False)

y_real = pd.read_csv('answer.csv')
y_real.head()
# Recorded output (the last value may be cut off at the end of the visible
# source):
#      id  value
# 0  id_0     33
# 1  id_1     60
# 2  id_2     16
# 3  id_3     33
# 4  id_4      5