# Import the required libraries
2 | import pandas as pd |
3 | from sklearn.decomposition import PCA |
4 | from sklearn.model_selection import train_test_split |
5 | from sklearn.tree import DecisionTreeClassifier |
6 | import numpy as np |
7 | import seaborn as sns |
8 | import matplotlib.pyplot as plt |
9 | from sklearn import metrics |
10 | from sklearn.metrics import roc_curve, auc |
11 | |
def Read_data(file):
    """Load a CSV file and rename its columns to descriptive names.

    The 14 names are assigned positionally, so the file must contain exactly
    14 columns. As a side effect, the global pandas display options are
    changed so that frames print without row/column truncation, and the first
    rows of the loaded frame are printed.

    Args:
        file: Path (or any object accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The loaded ``DataFrame`` with the renamed columns.
    """
    column_names = [
        'age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
        'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
        'max_heart_rate_achieved', 'exercise_induced_angina',
        'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia',
        'target',
    ]
    frame = pd.read_csv(file)
    frame.columns = column_names

    # Print frames in full and align East Asian wide characters correctly.
    display_options = (
        ('display.max_rows', None),
        ('display.max_columns', None),
        ('display.width', None),
        ('display.unicode.ambiguous_as_wide', True),
        ('display.unicode.east_asian_width', True),
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)

    print(frame.head())
    return frame
25 | |
# =================== Data cleaning ======================
def data_clean(data):
    """Remove duplicate/incomplete rows and cap blood-pressure outliers.

    Steps, each preceded by a printed report:
      1. Drop duplicated rows.
      2. Drop rows containing missing values.
      3. Report outliers of ``resting_blood_pressure`` using both the
         mean +/- 2 standard deviations rule and the box-plot (1.5 * IQR) rule.
      4. Replace values beyond the box-plot fences with the nearest value
         that lies inside the fences.

    The input frame is not modified; a new frame with a fresh 0..n-1 index is
    returned, so positional and label-based row access agree afterwards.

    Args:
        data: DataFrame containing a ``resting_blood_pressure`` column.

    Returns:
        The cleaned ``DataFrame``.
    """
    column = 'resting_blood_pressure'

    # Duplicate handling. drop_duplicates() returns a new frame, so the
    # result has to be kept.
    print('存在' if data.duplicated().any() else '不存在', '重復(fù)觀測值')
    cleaned = data.drop_duplicates()

    # Missing-value handling. isnull() must be reduced over its values;
    # iterating the frame itself only yields the column names.
    print('存在' if cleaned.isnull().values.any() else '不存在', '缺失值')
    # Incomplete records are removed outright, so no fill strategy
    # (forward/backward/constant/mean fill) is needed afterwards.
    cleaned = cleaned.dropna().reset_index(drop=True)

    # Outlier handling.
    pressure = cleaned[column]

    # Detection with the standard-deviation rule.
    mean = pressure.mean()
    std = pressure.std()
    print('存在' if (pressure > mean + 2 * std).any() else '不存在', '上限異常值')
    print('存在' if (pressure < mean - 2 * std).any() else '不存在', '下限異常值')

    # Detection with the box-plot rule.
    q1 = pressure.quantile(0.25)
    q3 = pressure.quantile(0.75)
    iqr = q3 - q1
    upper_fence = q3 + 1.5 * iqr
    lower_fence = q1 - 1.5 * iqr
    too_high = pressure > upper_fence
    too_low = pressure < lower_fence
    print('存在' if too_high.any() else '不存在', '上限異常值')
    print('存在' if too_low.any() else '不存在', '下限異常值')

    # Cap the outliers through .loc on the frame itself; assigning through a
    # column slice is chained assignment and may not reach the frame.
    if too_high.any():
        cleaned.loc[too_high, column] = pressure[pressure < upper_fence].max()
    if too_low.any():
        cleaned.loc[too_low, column] = pressure[pressure > lower_fence].min()
    return cleaned
60 | |
61 | |
# ===== Binned statistics for numeric variables, grouped statistics for discrete variables =====
def Segment_statistics(data):
    """Bin the ages into decades and print grouped counts.

    The ``age`` column is cut into 10-year intervals from 20 to 110, and the
    frame is grouped by ``chest_pain_type`` and ``sex``; only the per-group
    non-null counts are printed.

    Args:
        data: DataFrame with ``age``, ``chest_pain_type`` and ``sex`` columns.

    Returns:
        None.
    """
    age_frame = data[['age']]
    decade_edges = list(range(20, 111, 10))
    age_intervals = pd.cut(age_frame.values.flatten(), bins=decade_edges)
    interval_frame = pd.DataFrame(age_intervals, columns=['年齡段'])
    # NOTE(review): the age/interval table is built but never used or
    # returned; kept only to preserve the existing behaviour.
    age_with_interval = pd.concat([age_frame, interval_frame], axis=1)

    grouped = data.groupby(['chest_pain_type', 'sex'])
    print(grouped.count())
    return None
75 | |
76 | |
# ======================== Data encoding ===========================
def data_encoding(data):
    """One-hot encode the discrete features and standardise the continuous ones.

    Discrete columns are expanded with ``pd.get_dummies``; continuous columns
    are rescaled to zero mean and unit (sample) standard deviation. The
    ``target`` column is carried over unchanged.

    Args:
        data: DataFrame holding the 13 feature columns plus ``target``.

    Returns:
        A new ``DataFrame``: continuous columns and ``target`` first, followed
        by the dummy columns.
    """
    discrete_features = [
        'sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg',
        'exercise_induced_angina', 'st_slope', 'thalassemia',
    ]
    continuous_features = [
        'age', 'resting_blood_pressure', 'cholesterol',
        'max_heart_rate_achieved', 'st_depression', 'num_major_vessels',
    ]
    ordered_columns = [
        'age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
        'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
        'max_heart_rate_achieved', 'exercise_induced_angina',
        'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia',
        'target',
    ]

    selected = data[ordered_columns]
    encoded = pd.get_dummies(selected, columns=discrete_features)

    # z-score standardisation of the continuous columns.
    numeric = encoded[continuous_features]
    encoded[continuous_features] = (numeric - numeric.mean()) / numeric.std()

    encoded['target'] = selected['target']
    return encoded
92 | |
def PCA_analysis(data):
    """Project the features onto two principal components and plot them.

    Samples whose ``target`` is 1 are drawn as red circles ("Yes"), samples
    whose ``target`` is 0 as blue crosses ("No"). After the plot is shown, the
    explained-variance ratio of the two components is printed.

    Args:
        data: DataFrame with numeric feature columns and a ``target`` column.

    Returns:
        None.
    """
    # X holds the feature columns, labels holds the target variable.
    X = data.drop('target', axis=1)
    # Use a plain array so rows are matched by position; a label-based lookup
    # such as y[i] fails when the frame's index is not 0..n-1.
    labels = data['target'].to_numpy()

    pca = PCA(n_components=2)
    reduced_x = pca.fit_transform(X)  # data reduced to two dimensions

    positive = labels == 1
    negative = labels == 0

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')
    plt.scatter(reduced_x[positive, 0], reduced_x[positive, 1],
                c='r', marker='o', label='Yes')
    plt.scatter(reduced_x[negative, 0], reduced_x[negative, 1],
                c='b', marker='x', label='No')
    plt.title('PCA analysis')
    plt.legend()
    plt.show()
    print(pca.explained_variance_ratio_)  # contribution of each component
124 | |
125 | |
def data_partition(data):
    """Split the encoded data into training and test sets.

    The class distribution of ``target`` is printed first so that imbalance
    can be spotted. 20% of the rows go to the test set, with a fixed random
    seed of 10.

    Args:
        data: DataFrame with feature columns and a ``target`` column.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test, feature)`` where
        ``feature`` is the list of feature column names.
    """
    # Check whether the classes are balanced.
    class_counts = data['target'].value_counts()
    print(class_counts)

    # Features on one side, target variable on the other.
    features = data.drop(columns='target')
    labels = data['target']

    split = train_test_split(features, labels, test_size=0.2, random_state=10)
    train_features, test_features, train_labels, test_labels = split

    feature_names = features.columns.tolist()
    return train_features, train_labels, test_features, test_labels, feature_names
136 | |
137 | |
def Draw_ROC(list1, list2):
    """Plot the ROC curve and its AUC for a binary classifier.

    Args:
        list1: True labels; label 1 is treated as the positive class.
        list2: Scores passed to ``roc_curve`` as the predicted values for
            the positive class.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fpr_model, tpr_model, _ = roc_curve(list1, list2, pos_label=1)
    roc_auc_model = auc(fpr_model, tpr_model)

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')

    plt.plot(fpr_model, tpr_model, 'blue', label='AUC = %0.2f' % roc_auc_model)
    plt.legend(loc='lower right', fontsize=12)
    plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.show()
    return
155 | |
# ============ Decision tree =====================
def DT(X_train, y_train, X_test, y_test, feature):
    """Fit a depth-5 decision tree, report its quality and plot importances.

    Prints accuracy, classification report and confusion matrix for both the
    test and the training set, then shows a horizontal bar chart of every
    feature with a non-zero importance.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        feature: Feature names, in the same order as the columns of X_train.

    Returns:
        A tuple ``(labels, probabilities)``: the test labels as a list and
        the values of column 1 of ``predict_proba`` on the test set as a list.
    """
    tree1 = DecisionTreeClassifier(max_depth=5, random_state=0)
    tree1.fit(X_train, y_train)
    print('\nFinally results of decision tree fitting:')
    print('Accuracy on training set: {:.3f}'.format(tree1.score(X_train, y_train)))
    print('Accuracy on test set: {:.3f}'.format(tree1.score(X_test, y_test)))

    predict_target = tree1.predict(X_test)
    predict_target_prob_dt = tree1.predict_proba(X_test)[:, 1]

    # Compare by position; y_test may be a Series with a shuffled index.
    correct = int((predict_target == np.asarray(y_test)).sum())
    print('預(yù)測正確的個數(shù):', correct)
    print('DT驗證集報告:')
    print(metrics.classification_report(y_test, predict_target))
    print(metrics.confusion_matrix(y_test, predict_target))

    print('DT訓(xùn)練集報告:')
    predict_train = tree1.predict(X_train)
    print(metrics.classification_report(y_train, predict_train))
    print(metrics.confusion_matrix(y_train, predict_train))

    # Keep only the features the tree actually used, most important first.
    importances = tree1.feature_importances_
    ranked = sorted(
        ((feature[i], weight) for i, weight in enumerate(importances) if weight > 0),
        key=lambda pair: pair[1],
        reverse=True,
    )
    data_x = [name for name, _ in ranked]
    data_hight = [weight for _, weight in ranked]

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')

    plt.figure()
    plt.barh(range(len(data_x)), data_hight, color='#6699CC')
    plt.yticks(range(len(data_x)), data_x, fontsize=12)

    plt.tick_params(labelsize=12)  # tick label font size
    plt.xlabel('Feature importance', fontsize=14)
    plt.title('DT feature importance analysis', fontsize=14)
    plt.show()
    return list(y_test), list(predict_target_prob_dt)
209 | |
210 | |
if __name__ == '__main__':
    # Pipeline: load -> clean -> encode -> PCA plot -> split -> decision tree
    # -> ROC curve.
    # Raw string: the Windows path contains backslashes that would otherwise
    # be read as (invalid) escape sequences. The runtime value is unchanged.
    # NOTE(review): one directory name starts with a space ("\ Heart-...");
    # confirm this matches the real folder on disk.
    data_path = r'F:\數(shù)據(jù)雜壇\0504\heartdisease\ Heart-Disease-Data-Set-main\UCI Heart Disease Dataset.csv'

    data1 = Read_data(data_path)
    data1 = data_clean(data1)
    # Segment_statistics(data1)
    data2 = data_encoding(data1)
    PCA_analysis(data2)
    X_train, y_train, X_test, y_test, feature = data_partition(data2)

    y_test, predict_target_prob_dt = DT(X_train, y_train, X_test, y_test, feature)
    Draw_ROC(y_test, predict_target_prob_dt)