關聯分析、數值比較:散點圖、曲線圖
分布分析:灰度圖、密度圖
涉及分類的分析:柱狀圖、箱式圖
核密度估計(Kernel density estimation)是一種用于估計概率密度函數的非參數方法,采用平滑的峰值函數(“核”)來擬合觀察到的數據點,從而對真實的概率分布曲線進行模擬。
https://en.wikipedia.org/wiki/Kernel_density_estimation
核密度函數的基本想法是,在知道某一事物概率分布的情況下,如果某一個數在觀察中出現了,可以認為這個數的概率密度很大,和這個數比較近的數的概率密度也會比較大,而那些離這個數遠的數的概率密度會比較小。
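給定獨立同分布的n個樣本點 $x_1, x_2, \dots, x_n$,核密度估計為:
$$\hat{f}_h(x) = \frac{1}{n}\sum_{i=1}^{n} K_h(x - x_i) = \frac{1}{nh}\sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)$$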
K為核函數(非負、積分為1,符合概率密度性質,并且均值為0);有多種核函數,常用的是Gaussian KDE。h>0是一個平滑參數,稱作帶寬(bandwidth),也叫窗口。
針對每個樣本點,用K去擬合上述想象的遠小近大的概率密度。對每一個觀察數擬合出的多個概率密度分布函數取平均。如果某些數是比較重要的,則可以取加權平均。總而言之,核密度估計通過核函數(如高斯)將每個數據點的數據+帶寬當作核函數的參數,得到N個核函數,再線性疊加就形成了核密度的估計函數,歸一化后得核密度概率密度函數。
導入數據,基本處理
import os
import shutil  # file operations
import urllib  # fetch data over the network
import urllib.request  # urlretrieve lives in the request submodule
import zipfile  # zip archive handling

import pandas as pd  # load data into a DataFrame
# Create the working directory for the downloaded dataset.
# os.system() reports failure through its return code and never raises, so a
# try/except around it cannot trigger a fallback; os.makedirs is portable
# and tolerates an already-existing directory.
os.makedirs('bike_data', exist_ok=True)

# Bike Sharing Dataset archive on the UCI Machine Learning Repository.
# NOTE(review): the host part of this URL was missing ('archive.ics./');
# it is restored here -- confirm the archive is still served at this path.
data_source = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
zipname = 'bike_data/Bike-Sharing-Dataset.zip'  # local path of the downloaded archive
urllib.request.urlretrieve(data_source, zipname)  # download the archive

# Extract everything into bike_data; the context manager closes the archive
# even if extraction fails.
with zipfile.ZipFile(zipname, 'r') as zip_ref:
    zip_ref.extractall('bike_data')

daily_path = 'bike_data/day.csv'
daily_data = pd.read_csv(daily_path)  # read the daily csv file
# Convert the date strings to datetime values
daily_data['dteday'] = pd.to_datetime(daily_data['dteday'])
# Columns that are not needed for the plots below
drop_list = ['instant', 'season', 'yr', 'mnth', 'holiday', 'workingday', 'weathersit', 'atemp', 'hum']
daily_data.drop(drop_list, inplace=True, axis=1)  # inplace=True modifies daily_data directly
daily_data.head()  # peek at the first rows
配置參數
from __future__ import division, print_function # 引入3.x版本的除法和打印
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# 在notebook中顯示繪圖結(jié)果
%matplotlib inline
# 設(shè)置一些全局的資源參數(shù),可以進(jìn)行個性化修改
import matplotlib
# Global matplotlib defaults (rc = "resource configuration"); adjust to taste.
_rc_overrides = [
    ('figure', {'figsize': (14, 7)}),                  # figure size 14" x 7"
    ('font', {'size': 14}),                            # font size 14
    ('axes.spines', {'top': False, 'right': False}),   # hide top and right axis lines
    ('axes', {'grid': False}),                         # no grid
    ('axes', {'facecolor': 'white'}),                  # white axes background
]
for _group, _settings in _rc_overrides:
    matplotlib.rc(_group, **_settings)
關聯分析
散點圖,分析變量關系
from matplotlib import font_manager #設(shè)置字體的一個包
# Font properties object: SimHei family at size 14
fontP = font_manager.FontProperties(family='SimHei', size=14)
# Reusable wrapper around a scatter plot
def scatterplot(x_data, y_data, x_label, y_label, title):
    """Draw a scatter plot of y_data against x_data on a new figure.

    x_label and y_label become the axis labels and title the axes title.
    Nothing is returned.
    """
    _, axes = plt.subplots()
    # Marker size, hex RGB color and transparency of the points
    marker_style = {'s': 10, 'color': '#539caf', 'alpha': 0.9}
    axes.scatter(x_data, y_data, **marker_style)
    # Title and axis labels
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
# Scatter plot of daily check-outs against temperature
scatterplot(
    x_data=daily_data['temp'],
    y_data=daily_data['cnt'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Number of Check Outs vs Temperature',
)
曲線圖,擬合變量關系
OLS理論參考http://blog.csdn.net/cymy001/article/details/78364652
# 線性回歸
import statsmodels.api as sm # 最小二乘
from statsmodels.stats.outliers_influence import summary_table # 獲得匯總信息
# Regressor matrix: temperature plus a constant column, i.e. y = k*x + b
x = sm.add_constant(daily_data['temp'])
y = daily_data['cnt']
# Ordinary least squares model and its fit
regr = sm.OLS(y, x)
res = regr.fit()
# Pull the fitted data out of the model. With alpha=0.05, summary_table
# returns: st (summary table), data (per-observation values), ss2 (column names)
st, data, ss2 = summary_table(res, alpha=0.05)
# Column 2 holds the fitted values (equivalent to res.fittedvalues)
fitted_values = data[:, 2]
# Reusable wrapper around a line plot
def lineplot(x_data, y_data, x_label, y_label, title):
    """Draw y_data against x_data as a single line on a new figure.

    x_label and y_label become the axis labels and title the axes title.
    Nothing is returned.
    """
    _, axes = plt.subplots()
    # lw is the line width, alpha the opacity
    line_style = {'lw': 2, 'color': '#539caf', 'alpha': 1}
    axes.plot(x_data, y_data, **line_style)
    # Title and axis labels
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
# Plot the fitted regression line against temperature
lineplot(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)
帶置信區間的曲線圖,評估曲線擬合結果
# Lower and upper confidence bounds taken from columns 4 and 5 of the
# summary_table data (computed with alpha=0.05; plotted below as '95% CI')
predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
# Gather x values and both bounds in one DataFrame
CI_df = pd.DataFrame({
    'x_data': daily_data['temp'],
    'low_CI': predict_mean_ci_low,
    'upper_CI': predict_mean_ci_upp,
})
# Order the rows by x so the band can be filled from left to right
CI_df.sort_values('x_data', inplace=True)
# Line plot with a shaded confidence band
def lineplotCI(x_data, y_data, sorted_x, low_CI, upper_CI, x_label, y_label, title):
    """Draw a fitted line plus a shaded band between low_CI and upper_CI.

    x_data / y_data give the line (legend entry 'Fit'); sorted_x, low_CI
    and upper_CI give the band (legend entry '95% CI'). Nothing is returned.
    """
    _, axes = plt.subplots()
    band_color = '#539caf'
    # Fitted curve
    axes.plot(x_data, y_data, lw=1, color=band_color, alpha=1, label='Fit')
    # Shaded interval, filled along sorted_x
    axes.fill_between(sorted_x, low_CI, upper_CI, color=band_color, alpha=0.4, label='95% CI')
    # Title and axis labels
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    # Legend built from the label arguments; 'best' lets matplotlib choose the spot
    axes.legend(loc='best')
# Fitted line with its confidence band
lineplotCI(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    sorted_x=CI_df['x_data'],
    low_CI=CI_df['low_CI'],
    upper_CI=CI_df['upper_CI'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)
雙坐標曲線圖,曲線擬合不滿足置信閾值時,考慮增加獨立變量;分析不同尺度多變量的關系
# Line plot with two y axes
def lineplot2y(x_data, x_label, y1_data, y1_color, y1_label, y2_data, y2_color, y2_label, title):
    """Plot two series over the same x values, each on its own y axis.

    y1_data is drawn on the left axis and y2_data on a twin right axis;
    each y label is colored like its line. Nothing is returned.
    """
    _, left_axis = plt.subplots()
    left_axis.plot(x_data, y1_data, color=y1_color)
    # Labels and title on the primary axes
    left_axis.set_ylabel(y1_label, color=y1_color)
    left_axis.set_xlabel(x_label)
    left_axis.set_title(title)

    # Second axes object sharing the x axis
    right_axis = left_axis.twinx()
    right_axis.plot(x_data, y2_data, color=y2_color)
    right_axis.set_ylabel(y2_label, color=y2_color)
    # Make the right-hand axis line visible
    right_axis.spines['right'].set_visible(True)
# Check-outs and windspeed over time on separate y axes
lineplot2y(
    x_data=daily_data['dteday'],
    x_label='Day',
    y1_data=daily_data['cnt'],
    y1_color='#539caf',
    y1_label='Check outs',
    y2_data=daily_data['windspeed'],
    y2_color='#7663b0',
    y2_label='Normalized windspeed',
    title='Check Outs and Windspeed Over Time',
)
分布分析
灰度圖,粗略區間計算
# Histogram helper
def histogram(data, x_label, y_label, title):
    """Draw a 20-bin histogram of data on a new figure.

    Returns whatever ax.hist returns; the caller below reads element 0 as
    the per-bin counts and element 1 as the bin boundaries.
    """
    _, axes = plt.subplots()
    hist_result = axes.hist(data, color='#539caf', bins=20)  # 20 bins
    axes.set_ylabel(y_label)
    axes.set_xlabel(x_label)
    axes.set_title(title)
    return hist_result
# Histogram of registered check-outs; keep the result for inspection
res = histogram(
    data=daily_data['registered'],
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
res[0]  # counts in each bin
res[1]  # x coordinates of the bin boundaries
堆疊直方圖,比較兩個分布
# Overlaid histograms of two data sets on shared bins
def overlaid_histogram(data1, data1_name, data1_color, data2, data2_name, data2_color, x_label, y_label, title, max_nbins=10):
    """Draw two histograms on the same axes using identical bin edges.

    Parameters:
        data1, data2: the two samples to compare.
        data1_name, data2_name: legend labels.
        data1_color, data2_color: bar colors.
        x_label, y_label, title: axis labels and axes title.
        max_nbins: number of equal-width bins spanning both samples
            (defaults to 10).

    Raises ValueError if either sample is empty. Nothing is returned.
    """
    # Common data range so that both histograms share the same bins
    lowest = min(np.min(data1), np.min(data2))
    highest = max(np.max(data1), np.max(data2))
    if lowest == highest:
        # All values identical: widen the range so the bins have non-zero width
        lowest, highest = lowest - 0.5, highest + 0.5
    # linspace yields exactly max_nbins + 1 edges ending on the maximum;
    # a float-step arange may produce one edge too many through rounding.
    bins = np.linspace(lowest, highest, max_nbins + 1)

    _, ax = plt.subplots()
    ax.hist(data1, bins=bins, color=data1_color, alpha=1, label=data1_name)
    # The second histogram is semi-transparent so the first stays visible
    ax.hist(data2, bins=bins, color=data2_color, alpha=0.75, label=data2_name)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='best')
# Compare the distributions of registered and casual check-outs
overlaid_histogram(
    data1=daily_data['registered'],
    data1_name='Registered',
    data1_color='#539caf',
    data2=daily_data['casual'],
    data2_name='Casual',
    data2_color='#7663b0',
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Check Outs By Type',
)
密度圖,精細刻畫概率密度
# 計算概率密度
from scipy.stats import gaussian_kde
data = daily_data['registered']
# Gaussian kernel density estimate. A scalar bw_method is used directly as
# the bandwidth factor (larger values give a smoother curve); passing it here
# replaces overriding covariance_factor and calling the private
# _compute_covariance() afterwards.
density_est = gaussian_kde(data, bw_method=0.3)
# data is unordered; evaluate the density on an increasing grid from the
# minimum towards the maximum in steps of 200
x_data = np.arange(min(data), max(data), 200)
# Density curve helper
def densityplot(x_data, density_est, x_label, y_label, title):
    """Plot the values of the callable density_est evaluated at x_data.

    x_label and y_label become the axis labels and title the axes title.
    Nothing is returned.
    """
    _, axes = plt.subplots()
    density_values = density_est(x_data)
    axes.plot(x_data, density_values, color='#539caf', lw=2)  # lw is the line width
    axes.set_ylabel(y_label)
    axes.set_xlabel(x_label)
    axes.set_title(title)
# Density curve of registered check-outs
densityplot(
    x_data=x_data,
    density_est=density_est,
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
# type(density_est) is scipy.stats.kde.gaussian_kde
分類組間分析
組間定量比較,分組粒度,組間聚類
柱狀圖,一級類間均值方差比較
# Mean and standard deviation of total check-outs for each weekday.
# The aggregations are named by string: handing np.mean / np.std to
# groupby.agg is deprecated in recent pandas, which warns that the callables
# will later be applied directly instead of being mapped to the groupby
# methods. Selecting the single 'cnt' column first yields flat
# 'mean' / 'std' columns without a droplevel step.
mean_total_co_day = daily_data.groupby('weekday')['cnt'].agg(['mean', 'std'])
# Bar chart with error bars
def barplot(x_data, y_data, error_data, x_label, y_label, title):
    """Draw a bar chart of y_data over x_data with error bars of error_data.

    x_label and y_label become the axis labels and title the axes title.
    Nothing is returned.
    """
    _, axes = plt.subplots()
    # Bars centered on their x positions
    axes.bar(x_data, y_data, color='#539caf', align='center')
    # Error bars; ls='none' suppresses the line that would connect the bars
    axes.errorbar(x_data, y_data, yerr=error_data, color='#297083', ls='none', lw=5)
    axes.set_ylabel(y_label)
    axes.set_xlabel(x_label)
    axes.set_title(title)
# Mean total check-outs per weekday with standard-deviation error bars
barplot(
    x_data=mean_total_co_day.index.values,
    y_data=mean_total_co_day['mean'],
    error_data=mean_total_co_day['std'],
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)
堆積柱狀圖,多級類間相對占比比較
# Per-weekday mean of registered and casual check-outs
mean_by_reg_co_day = daily_data.groupby('weekday')[['registered', 'casual']].mean()
# Share of each rider type in the per-weekday total
mean_by_reg_co_day['total'] = mean_by_reg_co_day['registered'] + mean_by_reg_co_day['casual']
for _source_col, _prop_col in (('registered', 'reg_prop'), ('casual', 'casual_prop')):
    mean_by_reg_co_day[_prop_col] = mean_by_reg_co_day[_source_col] / mean_by_reg_co_day['total']
# Stacked bar chart
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in y_data_list stacked on top of each other.

    Parameters:
        x_data: x positions of the bars.
        y_data_list: sequence of series, one per category, each with one
            value per x position.
        y_data_names: legend label for each series.
        colors: bar color for each series.
        x_label, y_label, title: axis labels and axes title.

    Nothing is returned.
    """
    _, ax = plt.subplots()
    # Running height of everything drawn so far. Every series after the
    # first starts on the sum of ALL previous series; indexing a single
    # earlier series (y_data_list[1 - i]) is only right for two series.
    stack_base = None
    for i, series in enumerate(y_data_list):
        heights = np.asarray(series, dtype=float)
        if stack_base is None:
            ax.bar(x_data, heights, color=colors[i], align='center', label=y_data_names[i])
            stack_base = heights.copy()
        else:
            ax.bar(x_data, heights, color=colors[i], bottom=stack_base, align='center', label=y_data_names[i])
            stack_base = stack_base + heights
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')  # legend position
# Share of registered vs casual check-outs per weekday
stackedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['reg_prop'], mean_by_reg_co_day['casual_prop']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Proportion of check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)
分組柱狀圖,多級類間絕對數值比較
(按每一類遍歷,先畫第一類的x_data各柱子,再畫第二類的x_data各柱子)
# Grouped (side-by-side) bar chart
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in y_data_list as side-by-side bars at each x position.

    Parameters:
        x_data: numeric x positions of the groups (array or plain list).
        y_data_list: sequence of series, one per category.
        y_data_names: legend label for each series.
        colors: bar color for each series.
        x_label, y_label, title: axis labels and axes title.

    Raises ZeroDivisionError if y_data_list is empty. Nothing is returned.
    """
    _, ax = plt.subplots()
    # Width taken up by one whole group of bars
    total_width = 0.8
    n_series = len(y_data_list)
    # Width of a single bar
    ind_width = total_width / n_series
    # Center offset of each bar relative to its group position. Integer
    # indices scaled by the bar width give exactly n_series offsets; a
    # float-step arange can yield an extra element through rounding.
    alteration = (np.arange(n_series) - (n_series - 1) / 2.0) * ind_width
    # Array form so that plain lists can be shifted by an offset as well
    x_positions = np.asarray(x_data)
    # Draw one series at a time, spread horizontally within each group
    for i, series in enumerate(y_data_list):
        ax.bar(x_positions + alteration[i], series, color=colors[i], label=y_data_names[i], width=ind_width)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')
# Registered vs casual check-outs per weekday, side by side
groupedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['registered'], mean_by_reg_co_day['casual']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)
箱式圖,多級類間數據分析比較,柱狀圖+堆疊灰度圖
# Box plots only need the values of each category.
# np.unique returns the sorted distinct weekday values.
days = np.unique(daily_data['weekday'])
# One array of 'cnt' values per weekday, in the same order as days
bp_data = [
    daily_data.loc[daily_data['weekday'] == day, 'cnt'].values
    for day in days
]
# Box plot helper
def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one box per entry of y_data and label the boxes with x_data.

    base_color is used for the median line, the box outline and the caps;
    median_color is used for the box fill and the whiskers.
    NOTE(review): the parameter names suggest the two color roles may be
    swapped -- confirm the intended mapping before changing it.
    Nothing is returned.
    """
    _, axes = plt.subplots()
    median_style = {'color': base_color}                           # median line
    box_style = {'color': base_color, 'facecolor': median_color}   # outline / fill
    whisker_style = {'color': median_color}                        # whiskers
    cap_style = {'color': base_color}                              # whisker caps
    axes.boxplot(
        y_data,
        patch_artist=True,  # boxes are filled patches
        medianprops=median_style,
        boxprops=box_style,
        whiskerprops=whisker_style,
        capprops=cap_style,
    )
    # Tick labels follow x_data, one per box
    axes.set_xticklabels(x_data)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)
# Distribution of total check-outs for each weekday
boxplot(
    x_data=days,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)
|