Python可視化庫matplotlib庫各種圖demo

hdzgx 2020-01-05

展開全文

關聯分析、數值比較：散點圖、曲線圖
分布分析：灰度圖、密度圖
涉及分類的分析：柱狀圖、箱式圖

核密度估計（Kernel density estimation），是一種用于估計概率密度函數的非參數方法，采用平滑的峰值函數(“核”)來擬合觀察到的數據點，從而對真實的概率分布曲線進行模擬。
https://en.wikipedia.org/wiki/Kernel_density_estimation

核密度函數的基本想法是，在知道某一事物概率分布的情況下，如果某一個數在觀察中出現了，可以認為這個數的概率密度很大，和這個數比較近的數的概率密度也會比較大，而那些離這個數遠的數的概率密度會比較小。

給定獨立同分布的 n 個樣本點 $x_1, \dots, x_n$，核密度估計為：
$$\hat{f}_h(x) = \frac{1}{n}\sum_{i=1}^{n} K_h(x - x_i) = \frac{1}{nh}\sum_{i=1}^{n} K\!\left(\frac{x - x_i}{h}\right)$$
$K$ 為核函數(非負、積分為1，符合概率密度性質，并且均值為0)；有多種核函數，常用的是Gaussian KDE。 $h > 0$ 是一個平滑參數，稱作帶寬(bandwidth)，也叫窗口。
針對每個樣本點，用 $K$ 去擬合上述想象的遠小近大的概率密度。對每一個觀察數擬合出的多個概率密度分布函數取平均。如果某些數是比較重要的，則可以取加權平均。總而言之，<strong>核密度估計通過核函數（如高斯）將每個數據點的數據+帶寬當作核函數的參數，得到N個核函數，再線性疊加就形成了核密度的估計函數，歸一化后得核密度概率密度函數</strong>。

導入數據，基本處理

import pandas as pd # 讀取數(shù)據(jù)到DataFrame
import urllib # 獲取網(wǎng)絡(luò)數(shù)據(jù)
import shutil # 文件操作
import zipfile # 壓縮解壓
import os

# Download the bike-sharing archive, unpack it and load the daily records.
import urllib.request  # `import urllib` alone does not load the `request` submodule

# Create the working directory. os.makedirs(exist_ok=True) succeeds whether
# or not the directory already exists; os.system() signals failure through
# its return code and never raises, so wrapping it in try/except had no effect.
os.makedirs('bike_data', exist_ok=True)

data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'  # remote archive URL
zipname = 'bike_data/Bike-Sharing-Dataset.zip'  # local path for the downloaded archive
urllib.request.urlretrieve(data_source, zipname)  # fetch the archive

# Unpack into bike_data; the context manager closes the archive even if
# extraction raises.
with zipfile.ZipFile(zipname, 'r') as zip_ref:
    zip_ref.extractall('bike_data')

daily_path = 'bike_data/day.csv'
daily_data = pd.read_csv(daily_path)  # load the csv into a DataFrame
daily_data['dteday'] = pd.to_datetime(daily_data['dteday'])  # parse date strings into datetimes
drop_list = ['instant', 'season', 'yr', 'mnth', 'holiday', 'workingday', 'weathersit', 'atemp', 'hum']  # columns not used below
daily_data.drop(drop_list, inplace=True, axis=1)  # inplace=True modifies daily_data directly
daily_data.head()  # preview the first rows

這里寫圖片描述

配置參數

from __future__ import division, print_function # 引入3.x版本的除法和打印
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# 在notebook中顯示繪圖結(jié)果
%matplotlib inline

# Global resource-configuration ("rc") defaults for the figures in this
# file; edit the table below to customise them.
import matplotlib

_RC_DEFAULTS = (
    ('figure', {'figsize': (14, 7)}),                 # figure size 14" x 7"
    ('font', {'size': 14}),                           # font size 14
    ('axes.spines', {'top': False, 'right': False}),  # hide top and right spines
    ('axes', {'grid': False}),                        # no grid
    ('axes', {'facecolor': 'white'}),                 # white axes background
)
for _group, _settings in _RC_DEFAULTS:
    matplotlib.rc(_group, **_settings)

關聯分析

散點圖，分析變量關系

from matplotlib import font_manager   # matplotlib's font handling module
# Font properties object: 'SimHei' family at size 14.
# NOTE(review): fontP is not referenced by any code visible in this section
# of the file — confirm whether it is still needed.
fontP = font_manager.FontProperties()
fontP.set_family('SimHei')
fontP.set_size(14)

# Reusable scatter-plot helper.
def scatterplot(x_data, y_data, x_label, y_label, title):
    """Draw y_data against x_data as a scatter plot on a new figure.

    x_label, y_label and title are applied to the axes. Returns None.
    """
    _fig, axes = plt.subplots()

    # Marker size, colour and opacity shared by every point.
    marker_style = {'s': 10, 'color': '#539caf', 'alpha': 0.9}
    axes.scatter(x_data, y_data, **marker_style)

    # Title and axis captions.
    axes.set(title=title, xlabel=x_label, ylabel=y_label)

# Scatter plot of check-outs against temperature.
scatterplot(
    x_data=daily_data['temp'],
    y_data=daily_data['cnt'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Number of Check Outs vs Temperature',
)

這里寫圖片描述

曲線圖，擬合變量關系
OLS理論參考http://blog.csdn.net/cymy001/article/details/78364652

# Linear regression of check-outs on temperature.
import statsmodels.api as sm  # least-squares models
from statsmodels.stats.outliers_influence import summary_table  # per-observation summary

y = daily_data['cnt']
x = sm.add_constant(daily_data['temp'])  # add the constant term (y = kx + b)

regr = sm.OLS(y, x)  # ordinary least squares model
res = regr.fit()

# summary_table returns three items, unpacked here as the summary (st),
# the per-observation array (data) and the column names (ss2).
st, data, ss2 = summary_table(res, alpha=0.05)

# Column 2 of the per-observation array holds the fitted values.
fitted_values = data[:, 2]

# Reusable line-plot helper.
def lineplot(x_data, y_data, x_label, y_label, title):
    """Draw y_data against x_data as a single line on a new figure.

    x_label, y_label and title are applied to the axes. Returns None.
    """
    _fig, axes = plt.subplots()

    # lw is the line width, alpha the opacity.
    line_style = {'lw': 2, 'color': '#539caf', 'alpha': 1}
    axes.plot(x_data, y_data, **line_style)

    # Title and axis captions.
    axes.set(title=title, xlabel=x_label, ylabel=y_label)

# Plot the fitted line over temperature.
lineplot(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)

這里寫圖片描述

帶置信區間的曲線圖，評估曲線擬合結果

# Lower and upper bounds of the confidence interval for the mean, read from
# columns 4 and 5 of the summary_table array (alpha=0.05 was passed to
# summary_table, i.e. a 95% interval).
predict_mean_ci_low, predict_mean_ci_upp = data[:, 4], data[:, 5]

# Gather x values and interval bounds in one frame, then order the rows by x.
CI_df = pd.DataFrame({
    'x_data': daily_data['temp'],
    'low_CI': predict_mean_ci_low,
    'upper_CI': predict_mean_ci_upp,
})
CI_df.sort_values('x_data', inplace=True)

# Fitted line with a shaded confidence band.
def lineplotCI(x_data, y_data, sorted_x, low_CI, upper_CI, x_label, y_label, title):
    """Draw a fitted line plus a shaded band between low_CI and upper_CI.

    x_data / y_data give the line; sorted_x, low_CI and upper_CI give the
    band and are passed to fill_between in the order received.
    x_label, y_label and title are applied to the axes. Returns None.
    """
    _fig, axes = plt.subplots()
    shade = '#539caf'

    # Prediction line.
    axes.plot(x_data, y_data, lw=1, color=shade, alpha=1, label='Fit')
    # Confidence band, filled between the two bounds.
    axes.fill_between(sorted_x, low_CI, upper_CI, color=shade, alpha=0.4, label='95% CI')

    # Title and axis captions.
    axes.set(title=title, xlabel=x_label, ylabel=y_label)

    # Legend built from the label= arguments above; 'best' lets matplotlib
    # choose the location.
    axes.legend(loc='best')

# Plot the fitted line together with its confidence band.
lineplotCI(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    sorted_x=CI_df['x_data'],
    low_CI=CI_df['low_CI'],
    upper_CI=CI_df['upper_CI'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)

這里寫圖片描述

雙坐標曲線圖，曲線擬合不滿足置信閾值時，考慮增加獨立變量；分析不同尺度多變量的關系

# Line plot with two y-axes sharing one x-axis.
def lineplot2y(x_data, x_label, y1_data, y1_color, y1_label, y2_data, y2_color, y2_label, title):
    """Plot two series against the same x values, each with its own y-axis.

    y1_data is drawn on the left axis and y2_data on a twin right axis;
    each y-axis label is coloured like its series. Returns None.
    """
    _fig, left_ax = plt.subplots()
    left_ax.plot(x_data, y1_data, color=y1_color)
    # Captions for the left axis, the shared x-axis and the plot title.
    left_ax.set_ylabel(y1_label, color=y1_color)
    left_ax.set(xlabel=x_label, title=title)

    # Second axes object sharing the x-axis with the first.
    right_ax = left_ax.twinx()
    right_ax.plot(x_data, y2_data, color=y2_color)
    right_ax.set_ylabel(y2_label, color=y2_color)
    # Make the right spine visible for the second y-axis.
    right_ax.spines['right'].set_visible(True)

# Check-outs and wind speed over time on separate y-axes.
lineplot2y(
    x_data=daily_data['dteday'],
    x_label='Day',
    y1_data=daily_data['cnt'],
    y1_color='#539caf',
    y1_label='Check outs',
    y2_data=daily_data['windspeed'],
    y2_color='#7663b0',
    y2_label='Normalized windspeed',
    title='Check Outs and Windspeed Over Time',
)

這里寫圖片描述

分布分析

灰度圖，粗略區間計算

# Histogram helper.
def histogram(data, x_label, y_label, title):
    """Draw a 20-bin histogram of data on a new figure.

    x_label, y_label and title are applied to the axes.
    Returns whatever ax.hist returns, so callers can inspect the bins.
    """
    _fig, axes = plt.subplots()
    hist_result = axes.hist(data, color='#539caf', bins=20)  # bins sets the number of bins
    axes.set(ylabel=y_label, xlabel=x_label, title=title)
    return hist_result

# Histogram of registered check-outs; keep the returned bin information.
res = histogram(
    data=daily_data['registered'],
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
res[0]  # count in each bin
res[1]  # bin boundaries along the x-axis

這里寫圖片描述

堆疊直方圖，比較兩個分布

# Overlaid histograms of two samples drawn on a shared set of bins.
def overlaid_histogram(data1, data1_name, data1_color, data2, data2_name, data2_color, x_label, y_label, title):
    """Draw two histograms on one axes using identical bin edges.

    Args:
        data1, data2: numeric samples to compare.
        data1_name, data2_name: legend labels for the two samples.
        data1_color, data2_color: bar colours for the two samples.
        x_label, y_label, title: axis captions and plot title.

    data2 is drawn after data1 at 75% opacity so both remain visible.
    Returns None.
    """
    max_nbins = 10

    # Common value range covering both samples.
    lower = min(np.min(data1), np.min(data2))
    upper = max(np.max(data1), np.max(data2))
    if lower == upper:
        # Both samples hold one identical value: widen the range so the
        # bins have non-zero width.
        lower, upper = lower - 0.5, upper + 0.5

    # linspace gives exactly max_nbins + 1 edges ending at `upper`; building
    # the edges with a float-step arange can add an extra edge through
    # rounding and fails when the step is zero.
    bins = np.linspace(lower, upper, max_nbins + 1)

    _, ax = plt.subplots()
    ax.hist(data1, bins=bins, color=data1_color, alpha=1, label=data1_name)
    ax.hist(data2, bins=bins, color=data2_color, alpha=0.75, label=data2_name)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='best')

# Compare the distributions of registered and casual check-outs.
overlaid_histogram(
    data1=daily_data['registered'],
    data1_name='Registered',
    data1_color='#539caf',
    data2=daily_data['casual'],
    data2_name='Casual',
    data2_color='#7663b0',
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Check Outs By Type',
)

這里寫圖片描述

密度圖，精細刻畫概率密度

# Gaussian kernel density estimate of registered check-outs.
from scipy.stats import gaussian_kde
# NOTE: this rebinds the module-level name `data`, which earlier in the file
# holds the summary_table array.
data = daily_data['registered']
# bw_method=0.3 sets the bandwidth factor through the public constructor
# argument (larger values give a smoother curve), instead of overwriting
# covariance_factor and calling the private _compute_covariance().
density_est = gaussian_kde(data, bw_method=0.3)
# `data` is unordered; evaluate the density on an increasing grid of x values.
x_data = np.arange(min(data), max(data), 200)

# Density-curve helper.
def densityplot(x_data, density_est, x_label, y_label, title):
    """Plot density_est evaluated at x_data as a line on a new figure.

    density_est is called with x_data to obtain the y values.
    x_label, y_label and title are applied to the axes. Returns None.
    """
    _fig, axes = plt.subplots()
    density_values = density_est(x_data)
    axes.plot(x_data, density_values, color='#539caf', lw=2)  # lw is the line width
    axes.set(ylabel=y_label, xlabel=x_label, title=title)

# Density curve of registered check-outs.
densityplot(
    x_data=x_data,
    density_est=density_est,
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
#type(density_est)  #scipy.stats.kde.gaussian_kde

這里寫圖片描述

分類組間分析

組間定量比較，分組粒度，組間聚類

柱狀圖，一級類間均值方差比較

# Per-weekday mean and standard deviation of total check-outs.
# Aggregations are named by string: passing the NumPy callables np.mean /
# np.std to agg() is deprecated in recent pandas, and the string names keep
# the resulting columns called 'mean' and 'std'.
mean_total_co_day = daily_data[['weekday', 'cnt']].groupby('weekday').agg(['mean', 'std'])
# Drop the outer column level so the columns are one-dimensional.
mean_total_co_day.columns = mean_total_co_day.columns.droplevel()

# Bar chart with error bars.
def barplot(x_data, y_data, error_data, x_label, y_label, title):
    """Draw bars of y_data at x_data with vertical error bars of error_data.

    x_label, y_label and title are applied to the axes. Returns None.
    """
    _fig, axes = plt.subplots()

    # Bars, centred on their x positions.
    axes.bar(x_data, y_data, color='#539caf', align='center')

    # Error bars; ls='none' suppresses the line joining neighbouring points.
    axes.errorbar(x_data, y_data, yerr=error_data, color='#297083', ls='none', lw=5)

    axes.set(ylabel=y_label, xlabel=x_label, title=title)

# Mean total check-outs per weekday, with the standard deviation as error bars.
barplot(
    x_data=mean_total_co_day.index.values,
    y_data=mean_total_co_day['mean'],
    error_data=mean_total_co_day['std'],
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)

這里寫圖片描述

堆積柱狀圖，多級類間相對占比比較

# Mean registered and casual check-outs for each weekday.
mean_by_reg_co_day = daily_data[['weekday', 'registered', 'casual']].groupby('weekday').mean()

# Share of each user type in the per-weekday total.
mean_by_reg_co_day['total'] = mean_by_reg_co_day['registered'] + mean_by_reg_co_day['casual']
for _source_col, _share_col in (('registered', 'reg_prop'), ('casual', 'casual_prop')):
    mean_by_reg_co_day[_share_col] = mean_by_reg_co_day[_source_col] / mean_by_reg_co_day['total']


# Stacked bar chart.
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in y_data_list as bars stacked on top of each other.

    Args:
        x_data: x positions of the bars.
        y_data_list: sequence of series, one per category, each aligned
            with x_data.
        y_data_names: legend label for each series.
        colors: bar colour for each series.
        x_label, y_label, title: axis captions and plot title.

    Heights are stacked as given; pass proportions if the stacks should sum
    to 1. Returns None.
    """
    _, ax = plt.subplots()

    # Running top of the stack: each series starts where the sum of all
    # previous series ends. Using only one earlier series as the base is
    # correct for two series but not for three or more.
    baseline = 0
    for i, heights in enumerate(y_data_list):
        ax.bar(x_data, heights, color=colors[i], bottom=baseline, align='center', label=y_data_names[i])
        baseline = baseline + np.asarray(heights)

    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')  # legend position

# Registered vs casual share of check-outs for each weekday.
stackedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['reg_prop'], mean_by_reg_co_day['casual_prop']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Proportion of check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)

這里寫圖片描述

分組柱狀圖，多級類間絕對數值比較
(按每一類遍歷，先畫第一類的x_data各柱子，再畫第二類的x_data各柱子)

# Grouped (side-by-side) bar chart.
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in y_data_list as side-by-side bars at each x position.

    Args:
        x_data: numeric x positions of the groups (any array-like).
        y_data_list: sequence of series, one per category, each aligned
            with x_data.
        y_data_names: legend label for each series.
        colors: bar colour for each series.
        x_label, y_label, title: axis captions and plot title.

    Returns None.
    """
    _, ax = plt.subplots()
    n_series = len(y_data_list)
    # Width occupied by one whole group of bars.
    total_width = 0.8
    # Width of a single bar inside a group.
    ind_width = total_width / n_series
    # Centre offset of each bar relative to its group's x position. Derived
    # from integer indices so there is exactly one offset per series; a
    # float-step arange can produce an extra element through rounding.
    alteration = (np.arange(n_series) - (n_series - 1) / 2) * ind_width
    # Convert once so plain lists can be shifted by an offset as well.
    positions = np.asarray(x_data)

    # Draw one series at a time, shifted sideways within each group.
    for i, heights in enumerate(y_data_list):
        ax.bar(positions + alteration[i], heights, color=colors[i], label=y_data_names[i], width=ind_width)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')

# Registered vs casual mean check-outs for each weekday, side by side.
groupedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['registered'], mean_by_reg_co_day['casual']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)

這里寫圖片描述

箱式圖，多級類間數據分析比較，柱狀圖+堆疊灰度圖

# Group the total check-outs by weekday; the box plot only needs one array
# per category.
days = np.unique(daily_data['weekday'])   # sorted distinct weekday values
bp_data = []
for day in days:
    on_this_day = daily_data['weekday'] == day
    # One array of counts is appended per weekday.
    bp_data.append(daily_data.loc[on_this_day, 'cnt'].values)

# Box-plot helper.
def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one box per entry of y_data, labelled with x_data.

    base_color is used for the median line, box edge and whisker caps;
    median_color is used for the box fill and the whiskers.
    x_label, y_label and title are applied to the axes. Returns None.
    """
    _fig, axes = plt.subplots()

    box_style = {
        # Fill the boxes with colour.
        'patch_artist': True,
        # Median line colour.
        'medianprops': {'color': base_color},
        # Box colours: color is the edge, facecolor the fill.
        'boxprops': {'color': base_color, 'facecolor': median_color},
        # Whisker colour.
        'whiskerprops': {'color': median_color},
        # Whisker cap colour.
        'capprops': {'color': base_color},
    }
    axes.boxplot(y_data, **box_style)

    # Label the boxes with the values in x_data.
    axes.set_xticklabels(x_data)
    axes.set(xlabel=x_label, ylabel=y_label, title=title)

# Distribution of total check-outs for each weekday.
boxplot(
    x_data=days,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)

這里寫圖片描述

本站是提供個人知識管理的網絡存儲空間，所有內容均由用戶發布，不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息，謹防詐騙。如發現有害或侵權內容，請點擊一鍵舉報。

轉藏 分享

QQ空間 QQ好友新浪微博微信

獻花（0） +1

來自： hdzgx > 《電腦》

舉報/認領