

Decision tree modelling, prediction, and evaluation in Python

新用戶0935snDB · 2022-05-26 · Published in Henan

Implemented functionality:

Using Python: data reading, data cleaning, data encoding, PCA dimensionality-reduction analysis, and train/test splitting (see the previous articles for details), followed by decision tree modelling, prediction, and evaluation.
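The evaluation step at the end of the script relies on scikit-learn's classification_report, confusion_matrix and roc_curve/auc. As a quick preview, here is a minimal, self-contained sketch of those utilities; the labels and probabilities below are made up for illustration only and are not results from this article's dataset.

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

# Hypothetical labels and predicted probabilities, for illustration only
y_true = [0, 0, 1, 1, 1, 0]
y_prob = [0.2, 0.4, 0.8, 0.6, 0.9, 0.7]      # predicted probability of class 1
y_pred = [int(p >= 0.5) for p in y_prob]     # threshold at 0.5

print(classification_report(y_true, y_pred))  # precision / recall / F1 per class
print(confusion_matrix(y_true, y_pred))       # rows: true class, columns: predicted class
fpr, tpr, _ = roc_curve(y_true, y_prob, pos_label=1)
print('AUC =', auc(fpr, tpr))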

Implementation code:

# Import required libraries
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import roc_curve, auc


def Read_data(file):
    dt = pd.read_csv(file)
    dt.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
                  'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
                  'exercise_induced_angina', 'st_depression', 'st_slope',
                  'num_major_vessels', 'thalassemia', 'target']
    data = dt
    # Display options so wide frames print in full and CJK text aligns correctly
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.unicode.ambiguous_as_wide', True)
    pd.set_option('display.unicode.east_asian_width', True)
    print(data.head())
    return data


# =================== Data cleaning ======================
def data_clean(data):
    # Duplicate rows
    print('Duplicate rows found' if any(data.duplicated()) else 'No duplicate rows')
    data = data.drop_duplicates()   # drop_duplicates returns a new frame, so assign it back

    # Missing values
    # print(data.isnull())
    # print(data.isnull().sum())    # number of missing values per column
    # print(data.isnull().T.sum())  # number of missing values per row
    print('Missing values found' if data.isnull().values.any() else 'No missing values')
    # Alternative imputation strategies; each returns a new frame,
    # so assign the result if you actually want to apply one of them.
    data.dropna()                   # drop rows with missing values
    data.fillna(method='ffill')     # forward fill
    data.fillna(method='bfill')     # backward fill
    data.fillna(value=2)            # fill with a constant
    data.fillna(value={'resting_blood_pressure': data['resting_blood_pressure'].mean()})  # fill with a statistic

    # Outliers
    data1 = data['resting_blood_pressure']
    # Standard-deviation rule
    xmean = data1.mean()
    xstd = data1.std()
    print('Upper outliers found' if any(data1 > xmean + 2 * xstd) else 'No upper outliers')
    print('Lower outliers found' if any(data1 < xmean - 2 * xstd) else 'No lower outliers')
    # Box-plot (IQR) rule
    q1 = data1.quantile(0.25)
    q3 = data1.quantile(0.75)
    up = q3 + 1.5 * (q3 - q1)
    dw = q1 - 1.5 * (q3 - q1)
    print('Upper outliers found' if any(data1 > up) else 'No upper outliers')
    print('Lower outliers found' if any(data1 < dw) else 'No lower outliers')
    # Cap outliers at the most extreme non-outlying value
    data.loc[data1 > up, 'resting_blood_pressure'] = data1[data1 < up].max()
    data.loc[data1 < dw, 'resting_blood_pressure'] = data1[data1 > dw].min()
    return data


# ====== Binned statistics for numeric variables, grouped statistics for discrete variables ======
def Segment_statistics(data):
    age = data[['age']]
    bins = [20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
    age2 = pd.cut(age.values.flatten(), bins=bins)
    # print(age2.value_counts())
    age2 = pd.DataFrame(age2, columns=['age_group'])
    age3 = pd.concat([age, age2], axis=1)
    # print(age3)

    tmp3 = data.groupby(['chest_pain_type', 'sex'])
    print(tmp3.count())
    return


# ======================== Data encoding ===========================
def data_encoding(data):
    data = data[['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
                 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
                 'exercise_induced_angina', 'st_depression', 'st_slope',
                 'num_major_vessels', 'thalassemia', 'target']]

    Discretefeature = ['sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg',
                       'exercise_induced_angina', 'st_slope', 'thalassemia']
    Continuousfeature = ['age', 'resting_blood_pressure', 'cholesterol',
                         'max_heart_rate_achieved', 'st_depression', 'num_major_vessels']

    # One-hot encode the discrete features, z-score standardise the continuous ones
    df = pd.get_dummies(data, columns=Discretefeature)
    df[Continuousfeature] = (df[Continuousfeature] - df[Continuousfeature].mean()) / df[Continuousfeature].std()
    df['target'] = data['target']
    return df


def PCA_analysis(data):
    # X: feature matrix; y: target variable
    X = data.drop('target', axis=1)
    y = data['target']
    pca = PCA(n_components=2)
    reduced_x = pca.fit_transform(X)   # data projected onto the first two principal components

    yes_x, yes_y = [], []
    no_x, no_y = [], []
    for i in range(len(reduced_x)):
        if y.iloc[i] == 1:
            yes_x.append(reduced_x[i][0])
            yes_y.append(reduced_x[i][1])
        elif y.iloc[i] == 0:
            no_x.append(reduced_x[i][0])
            no_y.append(reduced_x[i][1])

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')
    plt.scatter(yes_x, yes_y, c='r', marker='o', label='Yes')
    plt.scatter(no_x, no_y, c='b', marker='x', label='No')
    plt.title('PCA analysis')
    plt.legend()
    plt.show()
    print(pca.explained_variance_ratio_)   # explained variance ratio of each component


def data_partition(data):
    # ====================== Train/test split ==========================
    # Check whether the classes are balanced
    print(data['target'].value_counts())
    # X: feature matrix; y: target variable
    X = data.drop('target', axis=1)
    y = data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
    feature = list(X.columns)
    return X_train, y_train, X_test, y_test, feature


def Draw_ROC(list1, list2):
    fpr_model, tpr_model, thresholds = roc_curve(list1, list2, pos_label=1)
    roc_auc_model = auc(fpr_model, tpr_model)

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')

    plt.plot(fpr_model, tpr_model, 'blue', label='AUC = %0.2f' % roc_auc_model)
    plt.legend(loc='lower right', fontsize=12)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.show()
    return


# ============ Decision tree =====================
def DT(X_train, y_train, X_test, y_test, feature):
    tree1 = DecisionTreeClassifier(max_depth=5, random_state=0)
    tree1.fit(X_train, y_train)
    print('\nFinal results of decision tree fitting:')
    print('Accuracy on training set: {:.3f}'.format(tree1.score(X_train, y_train)))
    print('Accuracy on test set: {:.3f}'.format(tree1.score(X_test, y_test)))

    predict_target = tree1.predict(X_test)
    predict_target_prob = tree1.predict_proba(X_test)
    predict_target_prob_dt = predict_target_prob[:, 1]

    df = pd.DataFrame({'prob': predict_target_prob_dt, 'target': predict_target, 'labels': list(y_test)})

    print('Number of correct predictions:', sum(predict_target == y_test))
    print('DT test-set report:')
    print(metrics.classification_report(y_test, predict_target))
    print(metrics.confusion_matrix(y_test, predict_target))

    print('DT training-set report:')
    predict_Target = tree1.predict(X_train)
    print(metrics.classification_report(y_train, predict_Target))
    print(metrics.confusion_matrix(y_train, predict_Target))

    # Feature importance of the fitted tree
    idx = np.argwhere(tree1.feature_importances_ > 0)
    idx = [i for item in idx for i in item]   # flatten the 2-D array into a flat list
    dic = {}
    for i in idx:
        dic.update({feature[i]: tree1.feature_importances_[i]})

    df = pd.DataFrame.from_dict(dic, orient='index', columns=['weight'])
    df = df.reset_index().rename(columns={'index': 'feature'})
    df = df.sort_values(by='weight', ascending=False)

    data_height = df['weight'].values.tolist()
    data_x = df['feature'].values.tolist()

    sns.set(font_scale=1.2)
    plt.rc('font', family='Times New Roman')

    plt.figure()
    plt.barh(range(len(data_x)), data_height, color='#6699CC')
    plt.yticks(range(len(data_x)), data_x, fontsize=12)
    plt.tick_params(labelsize=12)   # tick label size
    plt.xlabel('Feature importance', fontsize=14)
    plt.title('DT feature importance analysis', fontsize=14)
    plt.show()
    return list(y_test), list(predict_target_prob_dt)


if __name__ == '__main__':
    data1 = Read_data('F:\\数据杂坛\\0504\\heartdisease\\Heart-Disease-Data-Set-main\\UCI Heart Disease Dataset.csv')
    data1 = data_clean(data1)
    # Segment_statistics(data1)
    data2 = data_encoding(data1)
    PCA_analysis(data2)
    X_train, y_train, X_test, y_test, feature = data_partition(data2)

    y_test, predict_target_prob_dt = DT(X_train, y_train, X_test, y_test, feature)
    Draw_ROC(y_test, predict_target_prob_dt)
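The script fixes max_depth=5 for the decision tree. If you would rather choose the depth by cross-validation, a minimal sketch (not part of the original post; it assumes the X_train and y_train returned by data_partition are available) could look like this:

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Search a small grid of tree depths with 5-fold cross-validation on the training set
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
search = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5, scoring='accuracy')
search.fit(X_train, y_train)

print('Best max_depth:', search.best_params_['max_depth'])
print('Best cross-validated accuracy: {:.3f}'.format(search.best_score_))
# The refitted best estimator could then replace tree1 inside DT():
# best_tree = search.best_estimator_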


Results:

[Figures 1–3: output screenshots from the original post]

If you found this useful, remember to like, share, and save it.

Follow the 数据杂坛 subscription channel for the complete code and output; it will be updated continuously!
