Python EDA数据分析例子（二分类问题，源代码）

xiaoxiao2025-07-02 37

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import kernel_approximation
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# Load the data and take a first look.
# ---------------------------------------------------------------------------
DATA_URL = 'https://raw.githubusercontent.com/wzy6642/Machine-Learning-Case-Studies/master/noshowappointments/data/No-show-Issue-Comma-300k.csv'
data = pd.read_csv(DATA_URL)
print(len(data))
# Notebook-style expression: only rendered in an interactive session.
data.head()

for column in list(data.columns):
    # {0:25} pads the column name to a width of 25 characters; nunique()
    # gives the number of distinct values found in that column.
    print("{0:25} {1}".format(column, data[column].nunique()))


def features_plots(discrete_vars):
    """Draw one figure with histograms and bar charts of the features.

    The first two cells of a 7x2 subplot grid hold histograms of ``Age`` and
    ``AwaitingTime``; the following cells hold one bar chart of value counts
    per entry of ``discrete_vars``.  The module-level ``data`` frame is read
    at call time, so the plot reflects whatever cleaning has been applied so
    far.  The figure is not shown here; it appears at the next ``plt.show()``.

    Args:
        discrete_vars: Column names of ``data`` to plot as bar charts.  The
            grid has 14 cells and two are taken by the histograms, so at most
            12 names fit.
    """
    plt.figure(figsize=(15, 24.5))
    for i, cv in enumerate(['Age', 'AwaitingTime']):
        plt.subplot(7, 2, i + 1)
        # One histogram bin per distinct value of the feature.
        plt.hist(data[cv], bins=len(data[cv].unique()))
        plt.title(cv)
        plt.ylabel('Frequency')
    for i, dv in enumerate(discrete_vars):
        # Cells 1 and 2 are used by the histograms above.
        plt.subplot(7, 2, i + 3)
        data[dv].value_counts().plot(kind='bar', title=dv)
        plt.ylabel('Frequency')


discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism',
                 'HiperTension', 'Handcap', 'Smokes', 'Scholarship',
                 'Tuberculosis', 'Sms_Reminder']
features_plots(discrete_vars)

# ---------------------------------------------------------------------------
# Cleaning and encoding.
# ---------------------------------------------------------------------------
# Number of rows with a negative age (notebook-style expression, not printed).
data[data['Age'] < 0]['Age'].value_counts().sum()
# Keep only rows with a non-negative age.  Take an explicit copy because the
# frame is modified below (column deletion and assignment).
data = data[data['Age'] >= 0].copy()
del data['Handcap']
# Replace negative waiting times by their absolute value.
data['AwaitingTime'] = data['AwaitingTime'].apply(lambda x: abs(x))

dow_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
               'Friday': 4, 'Saturday': 5, 'Sunday': 6}
data['DayOfTheWeek'] = data['DayOfTheWeek'].map(dow_mapping)

for field in ['Gender', 'Status']:
    # pd.Categorical(...).codes replaces each category label by its integer
    # code, turning the categorical column into a numeric one.
    data[field] = pd.Categorical(list(data[field])).codes

discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism',
                 'HiperTension', 'Smokes', 'Scholarship', 'Tuberculosis',
                 'Sms_Reminder']
features_plots(discrete_vars)

# ---------------------------------------------------------------------------
# Exploratory plots.
# ---------------------------------------------------------------------------
plt.scatter(data['Age'], data['AwaitingTime'], s=0.5)
plt.title('Scatter plot of Age and Awaiting Time')
plt.xlabel('Age')
plt.ylabel('Awaiting Time')
plt.xlim(0, 120)
plt.ylim(0, 120)

pd.set_option('display.width', 100)
# Use the fully qualified option name: the bare 'precision' key is not
# accepted by current pandas releases.
pd.set_option('display.precision', 3)
correlations = data[['Age', 'AwaitingTime']].corr(method='pearson')
print(correlations)

data_dow_status = data.groupby(['Sms_Reminder', 'Status'])['Sms_Reminder'].count().unstack('Status').fillna(0)
data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
plt.title('Frequency of people showing up and not showing up by number of SMS reminders sent')
plt.xlabel('Number of SMS reminders')
plt.ylabel('Frequency')

data_dow_status = data.groupby(['DayOfTheWeek', 'Status'])['DayOfTheWeek'].count().unstack('Status').fillna(0)
data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
plt.title('Frequency of people showing up and not showing up by Day of the week')
plt.xlabel('Day of the week')
plt.ylabel('Frequency')

data.boxplot(column=['Age'], return_type='axes', by='Status')
plt.show()

plt.figure(figsize=(15, 3.5))
# NOTE(review): the labels below assume Status code 0 = no-show / 1 = show-up
# and Gender code 0 = female / 1 = male; the codes come from the sorted
# category labels, so confirm against the raw values in the CSV.
for i, status in enumerate(['no show ups', 'show ups']):
    data_show = data[data['Status'] == i]
    plt.subplot(1, 2, i + 1)
    for gender in [0, 1]:
        data_gender = data_show[data_show['Gender'] == gender]
        freq_age = data_gender['Age'].value_counts().sort_index()
        freq_age.plot()
    plt.title('Age wise frequency of patient %s for both genders' % status)
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.legend(['Female', 'Male'], loc='upper left')

data.boxplot(column=['AwaitingTime'], return_type='axes', by='Status')
plt.show()

# ---------------------------------------------------------------------------
# Feature engineering: split the timestamp strings into numeric parts.
# ---------------------------------------------------------------------------
# The parsing takes the text before 'T' as a '-'-separated date and the text
# after 'T', minus its last character, as a ':'-separated time.
for col in ['AppointmentRegistration', 'ApointmentData']:
    for index, component in enumerate(['year', 'month', 'day']):
        # index is bound as a default argument so the lambda does not depend
        # on the loop variable's later values.
        data['%s_%s' % (col, component)] = data[col].apply(
            lambda x, index=index: int(x.split('T')[0].split('-')[index]))

for index, component in enumerate(['hour', 'min', 'sec']):
    data['%s_%s' % ('AppointmentRegistration', component)] = data['AppointmentRegistration'].apply(
        lambda x, index=index: int(x.split('T')[1][:-1].split(':')[index]))

data.head()


# ---------------------------------------------------------------------------
# Classification.
# ---------------------------------------------------------------------------
def model_performance(model, model_name, X_train, y_train, y_test, Y_pred):
    """Print evaluation metrics for a fitted classifier and plot its ROC curve.

    Args:
        model: Fitted estimator; only ``model.score(X_train, y_train)`` is
            called on it.
        model_name: Label printed at the top of the report.
        X_train: Training features, in the same representation the model was
            fitted on.
        y_train: Training labels.
        y_test: True labels of the test split.
        Y_pred: Predictions for the test split.  Every call in this script
            passes the output of ``predict`` (hard labels), so the ROC and
            precision-recall figures are computed from labels, not from
            probabilities or decision scores.

    Side effects:
        Prints the metrics and shows the ROC plot via ``plt.show()``.
    """
    print('Model name: %s' % model_name)
    # Accuracy: fraction of test samples that were classified correctly.
    print('Test accuracy (Accuracy Score): %f' % metrics.accuracy_score(y_test, Y_pred))
    # ROC AUC from the true (binary) labels and the predictions.
    print('Test accuracy (ROC AUC Score): %f' % metrics.roc_auc_score(y_test, Y_pred))
    # Mean accuracy on the training split.
    print('Train accuracy: %f' % model.score(X_train, y_train))
    # precision_recall_curve returns (precision, recall, thresholds).
    precision, recall, _ = metrics.precision_recall_curve(y_test, Y_pred)
    # metrics.auc(x, y) needs a monotonic x.  Recall is the x-axis of a
    # precision-recall curve and is returned in monotonic order; precision
    # carries no such guarantee.
    print('Area Under the Precision-Recall Curve: %f' % metrics.auc(recall, precision))
    # ROC curve: true positive rate (y-axis) against false positive rate (x-axis).
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, Y_pred)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


features_of_choice = ['Age', 'Gender', 'DayOfTheWeek', 'Diabetes', 'Alcoolism',
                      'HiperTension', 'Smokes', 'Scholarship', 'Tuberculosis',
                      'Sms_Reminder', 'AwaitingTime',
                      'AppointmentRegistration_year',
                      'AppointmentRegistration_month',
                      'AppointmentRegistration_day',
                      'AppointmentRegistration_hour',
                      'AppointmentRegistration_min',
                      'AppointmentRegistration_sec',
                      'ApointmentData_year', 'ApointmentData_month',
                      'ApointmentData_day']
x = np.array(data[features_of_choice])
y = np.array(data['Status'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Decision tree.
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Decision tree classifier', x_train, y_train, y_test, y_pred)

# SGD classifier trained on an RBF kernel approximation of the features.
rbf_feature = kernel_approximation.RBFSampler(gamma=1, random_state=1)
X_train = rbf_feature.fit_transform(x_train)
clf = SGDClassifier()
clf.fit(X_train, y_train)
# Reuse the mapping fitted on the training split; the test split must only
# be transformed, never used to fit the transformer again.
X_test = rbf_feature.transform(x_test)
Y_pred = clf.predict(X_test)
model_performance(clf, 'Kernel approximation', X_train, y_train, y_test, Y_pred)

# Random forest.
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Random Forest', x_train, y_train, y_test, y_pred)

# Gradient boosting.
clf = GradientBoostingClassifier(random_state=10, learning_rate=0.1,
                                 n_estimators=200, max_depth=5, max_features=10)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Grandient Boosting', x_train, y_train, y_test, y_pred)

# Feature importances of the gradient boosting model, one feature per line.
for feature, score in zip(features_of_choice, clf.feature_importances_):
    print('%s\t\t\t\t\t%f' % (feature, score))

最新回复(0)