任务 【任务六-模型融合】用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分
Stacking融合(按照自己的理解)第一层: 使用交叉验证的划分方法,将训练集划分成5份, 使用第一个基分类器对划分之后得到的test进行预测,得到的5个predict文件,维数 $(\frac{n}{5},1)$,纵向拼接得到1个Predict文件,维数 $(n,1)$;使用第一个基分类器对整个Test进行预测,得到预测文件5个p_t,维数 $(r,1)$,横向拼接,求平均值得到1个Pt文件,维数 $(r,1)$;使用第二个基分类器,得到5个Predict文件,维数 $(n,1)$,5个Pt文件,维数 $(r,1)$
# Two-layer stacking: out-of-fold predictions of several base models are
# appended to the raw features, and a LightGBM classifier is trained on top.
import pickle
import warnings

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

path = "E:/MyPython/Machine_learning_GoGoGo/"

# Base learners whose predictions become stacked features (column order).
# The other saved models in model/model_file/ (SVM_poly, SVM_sigmoid,
# xgb_sklearn, xgb, lgb) are not used by the stack and are no longer loaded.
BASE_LEARNERS = ("SVM_linear", "lg_120", "DT", "SVM_rbf", "lgb_sklearn")


def _take_rows(data, idx):
    """Select rows by position from a pandas object or an indexable array."""
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def _majority_vote(votes):
    """Return the most frequent label in each row of a 2-D vote array.

    Ties resolve to the smallest label (first maximum along the class axis).
    """
    classes = np.unique(votes)
    counts = np.stack([(votes == c).sum(axis=1) for c in classes], axis=1)
    return classes[counts.argmax(axis=1)]


def get_feature(clf, train, y_train, test, y_test, cv):
    """Build first-layer stacking features for one base classifier.

    For every split produced by ``cv`` the classifier is refit on the
    training part, then predicts the held-out part and the whole test set.

    Args:
        clf: estimator with ``fit(X, y)`` and ``predict(X)``. It is refit in
            place and is left fitted on the last fold.
        train: training features, a DataFrame or an array indexable by an
            integer index array.
        y_train: training labels (any 1-D array-like; rows are taken by
            position, so the index of a Series does not matter).
        test: test features, passed unchanged to ``clf.predict``.
        y_test: unused; kept so existing calls keep working.
        cv: object with ``split(X, y)`` yielding ``(train_idx, vali_idx)``
            pairs of positional indices.

    Returns:
        ``(Test_i, Train_i)``: ``Train_i`` is a Series with one out-of-fold
        prediction per training row, in training-row order. ``Test_i`` is a
        Series with the per-row majority vote of the fold models on ``test``.

    Raises:
        ValueError: if ``cv`` does not place every training row in exactly
            one validation fold (out-of-fold features would be undefined).
    """
    labels = np.asarray(y_train).ravel()
    n_train = len(labels)

    fold_rows = []
    fold_train_preds = []
    fold_test_preds = []
    for train_idx, vali_idx in cv.split(train, y_train):
        clf.fit(_take_rows(train, train_idx), labels[train_idx])
        fold_rows.append(np.asarray(vali_idx))
        fold_train_preds.append(
            np.asarray(clf.predict(_take_rows(train, vali_idx))).ravel())
        fold_test_preds.append(np.asarray(clf.predict(test)).ravel())

    if not fold_rows:
        raise ValueError("cv produced no splits")
    covered = np.concatenate(fold_rows)
    if not np.array_equal(np.sort(covered), np.arange(n_train)):
        raise ValueError(
            "cv must place every training row in exactly one validation fold")

    # Scatter the fold predictions back to their original row positions, so
    # uneven fold sizes or shuffled folds cannot misalign them with y_train.
    stacked = np.concatenate(fold_train_preds)
    aligned = np.empty_like(stacked)
    aligned[covered] = stacked
    Train_i = Series(aligned)

    # The original computed sum/3 and truncated to int later, which is a
    # majority vote only for 5 folds and 0/1 labels; vote explicitly instead.
    Test_i = Series(_majority_vote(np.column_stack(fold_test_preds)))
    return Test_i, Train_i


def main():
    """Run the stacking experiment and print accuracy and AUC scores."""
    # Heavy third-party imports are deferred so that importing this module
    # (e.g. to reuse get_feature) only needs numpy and pandas.
    import joblib  # sklearn.externals.joblib was removed from scikit-learn
    import lightgbm as lgbm
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import (
        GroupKFold,
        GroupShuffleSplit,
        KFold,
        LeaveOneGroupOut,
        LeaveOneOut,
        LeavePGroupsOut,
        LeavePOut,
        ShuffleSplit,
        TimeSeriesSplit,
    )

    warnings.filterwarnings("ignore")

    # 1. Load the features.
    print("0 读取特征")
    # NOTE: pickle can execute arbitrary code while loading; only open
    # feature files that you produced yourself.
    with open(path + 'feature/feature_V3.pkl', 'rb') as f:
        train, test, y_train, y_test = pickle.load(f)

    # 2. Cross-validation strategy. Only splitters that put each row in
    # exactly one validation fold work with get_feature; the others are
    # kept for reference.
    cv_options = {
        # No random_state: without shuffle=True it has no effect, and
        # recent scikit-learn versions raise ValueError for that combination.
        "kf": KFold(n_splits=5),
        "loo": LeaveOneOut(),  # validation part holds a single sample
        "lpo": LeavePOut(p=2000),  # validation part holds p samples
        "ss": ShuffleSplit(n_splits=5, test_size=.25, random_state=0),
        "tss": TimeSeriesSplit(n_splits=5),
        "logo": LeaveOneGroupOut(),
        "lpgo": LeavePGroupsOut(n_groups=3),
        "gss": GroupShuffleSplit(n_splits=4, test_size=.5, random_state=0),
        "gkf": GroupKFold(n_splits=2),
    }
    cv = cv_options["kf"]

    # 3. Load the base models.
    print("1 读取模型")
    base_models = [
        joblib.load(path + "model/model_file/" + name + ".pkl")
        for name in BASE_LEARNERS
    ]

    # The original labels do not start at index 0, so reset the index.
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # 4. First layer: one stacked feature column per base model.
    train_cols = {}
    test_cols = {}
    for name, model in zip(BASE_LEARNERS, base_models):
        test_pred, oof_pred = get_feature(
            model, train, y_train, test, y_test, cv)
        train_cols["stack_" + name] = oof_pred
        test_cols["stack_" + name] = test_pred

    # 5. Second layer: stacked features next to the raw features. Prefixes
    # keep every column name unique; the original produced columns 0..4
    # twice, which LightGBM is expected to reject as duplicate features.
    raw_train = DataFrame(train).reset_index(drop=True).add_prefix("feat_")
    raw_test = DataFrame(test).reset_index(drop=True).add_prefix("feat_")
    Train = pd.concat(objs=[DataFrame(train_cols), raw_train], axis=1)
    Test = pd.concat(objs=[DataFrame(test_cols), raw_test], axis=1)

    # 6. Meta-model: LightGBM through its scikit-learn interface.
    meta_model = lgbm.LGBMClassifier(
        learning_rate=0.1,
        max_bin=150,
        num_leaves=32,
        max_depth=11,
        reg_alpha=0.1,
        reg_lambda=0.2,
        n_estimators=300,
    )
    meta_model.fit(Train, y_train)

    print("lgb_sklearn_Train_Score :{}".format(
        meta_model.score(Train, y_train)))
    print("lgb_sklearn_Test_Score :{}".format(
        meta_model.score(Test, y_test)))
    # AUC needs a ranking score, not hard labels. Assumes a binary target
    # whose positive class is the second column of predict_proba (confirm).
    test_proba = meta_model.predict_proba(Test)[:, 1]
    print("lgb_sklearn_Test_AUC Score :{}".format(
        roc_auc_score(y_test, test_proba)))


if __name__ == "__main__":
    main()

# 参考 (References)
1.模型融合
