
    xiaoxiao2022-07-03  139

    # Logistic regression on the scikit-learn breast cancer dataset.
    #
    # This script was recovered from a Jupyter notebook session. The outputs
    # recorded during the original run are preserved as "Output:" comments.
    # NOTE: train_test_split is called without a fixed random_state, so the
    # scores and probabilities differ from run to run.

    # %matplotlib inline  (IPython magic; only valid inside a notebook cell)
    import time

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures

    # Load the data.
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target
    print('data shape: {0}; no. positive: {1}; no. negative: {2}'.format(
        X.shape, y[y==1].shape[0], y[y==0].shape[0]))
    print(cancer.data[0])
    # Output:
    # data shape: (569, 30); no. positive: 357; no. negative: 212
    # [1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
    #  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
    #  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
    #  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
    #  4.601e-01 1.189e-01]

    # Bare expression: Jupyter displays its value; it is a no-op in a script.
    cancer.feature_names
    # Output:
    # array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
    #        'mean smoothness', 'mean compactness', 'mean concavity',
    #        'mean concave points', 'mean symmetry', 'mean fractal dimension',
    #        'radius error', 'texture error', 'perimeter error', 'area error',
    #        'smoothness error', 'compactness error', 'concavity error',
    #        'concave points error', 'symmetry error',
    #        'fractal dimension error', 'worst radius', 'worst texture',
    #        'worst perimeter', 'worst area', 'worst smoothness',
    #        'worst compactness', 'worst concavity', 'worst concave points',
    #        'worst symmetry', 'worst fractal dimension'], dtype='<U23')

    # Hold out 20% of the samples for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Train the model.
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print('train score: {train_score:.6f}; test score: {test_score:.6f}'.format(
        train_score=train_score, test_score=test_score))
    # Output:
    # train score: 0.956044; test score: 0.956140

    # Predict the held-out samples and count how many labels match.
    y_pred = model.predict(X_test)
    print('matchs: {0}/{1}'.format(np.equal(y_pred, y_test).sum(), y_test.shape[0]))
    # Output:
    # matchs: 109/114

    # Predicted probabilities: list the samples that were predicted with less
    # than 90% confidence, i.e. both class probabilities are above 0.1.
    y_pred_proba = model.predict_proba(X_test)
    print('sample of predict probability: {0}'.format(y_pred_proba[0]))
    y_pred_proba_0 = y_pred_proba[:, 0] > 0.1
    result = y_pred_proba[y_pred_proba_0]
    y_pred_proba_1 = result[:, 1] > 0.1
    print(result[y_pred_proba_1])
    # Output:
    # sample of predict probability: [0.1495694 0.8504306]
    # [[0.1495694  0.8504306 ]
    #  [0.11096724 0.88903276]
    #  [0.19499883 0.80500117]
    #  [0.73727435 0.26272565]
    #  [0.1612553  0.8387447 ]
    #  [0.79987232 0.20012768]
    #  [0.89256972 0.10743028]
    #  [0.88427977 0.11572023]
    #  [0.28478309 0.71521691]
    #  [0.26459028 0.73540972]
    #  [0.18916204 0.81083796]
    #  [0.508977   0.491023  ]
    #  [0.85503032 0.14496968]
    #  [0.69726084 0.30273916]
    #  [0.40779235 0.59220765]
    #  [0.77126139 0.22873861]
    #  [0.71761711 0.28238289]]


    def polynomial_model(degree=1, **kwarg):
        """Build a pipeline of polynomial feature expansion + logistic regression.

        Args:
            degree: Degree passed to ``PolynomialFeatures`` (the bias column
                is disabled via ``include_bias=False``).
            **kwarg: Keyword arguments forwarded unchanged to
                ``LogisticRegression`` (e.g. ``penalty``, ``solver``).

        Returns:
            An unfitted ``Pipeline`` with the steps ``"polynomial_features"``
            and ``"logistic_regression"``.
        """
        polynomial_features = PolynomialFeatures(degree=degree,
                                                 include_bias=False)
        logistic_regression = LogisticRegression(**kwarg)
        pipeline = Pipeline([("polynomial_features", polynomial_features),
                             ("logistic_regression", logistic_regression)])
        return pipeline


    # Degree-2 polynomial features with L1 regularisation.
    model = polynomial_model(degree=2, penalty='l1', solver='liblinear')

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    cv_score = model.score(X_test, y_test)
    print('elaspe: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(
        time.perf_counter()-start, train_score, cv_score))
    # Output (original run, timed with time.clock()):
    # elaspe: 1.709350; train_score: 1.000000; cv_score: 0.991228

    # Same pipeline with L2 regularisation, for comparison.
    model = polynomial_model(degree=2, penalty='l2', solver='liblinear')

    start = time.perf_counter()
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    cv_score = model.score(X_test, y_test)
    print('elaspe: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(
        time.perf_counter()-start, train_score, cv_score))
    # Output (original run, timed with time.clock()):
    # elaspe: 0.590021; train_score: 0.971429; cv_score: 0.964912