GSVA簡介基因集變異分析(Gene Set Variation Analysis,, GSVA)是一種以非監(jiān)督方式對一個簡單群體評估通路活性變異的GSE方法,。GSE基因集富集(gene set enrichment)方法通常被認(rèn)為是一個生物信息學(xué)分析的終點(diǎn),但是GSVA構(gòu)成了一個構(gòu)建以通路為中心的生物學(xué)模型的起點(diǎn),。 GSVA的目的將一個基因表達(dá)矩陣轉(zhuǎn)換成基因集表達(dá)矩陣 GSEA與GSVA的區(qū)別相同點(diǎn)無需尋找差異基因 異同點(diǎn):GSEA:計算的基本原理是掃描排序序列,當(dāng)出現(xiàn)一個功能基因集中的基因時,就增加 ES 值,, 反之,, 就減少 ES 值, 所以在整個掃描過程中,, ES是一個動態(tài)的值,。 GSVA及相關(guān)R包的安裝if(length(getOption("CRAN"))==0) options(CRAN="https://mirrors.tuna./CRAN/") if(!require("GSVA")) BiocManager::install("GSVA") if(!require("GSEABase")) BiocManager::install("GSEABase") if(!require("GSVAdata")) BiocManager::install("GSVAdata") if(!require("pheatmap")) BiocManager::install("pheatmap") if(!require("ggplot2")) BiocManager::install("ggplot2") if(!require("tidyr")) BiocManager::install("tidyr") if(!require("dplyr")) BiocManager::install("dplyr") if(!require("limma")) BiocManager::install("limma") GSVA的運(yùn)行注意事項(xiàng)
GSVA參數(shù)gsva(expr, gset.idx.list, annotation, method=c("gsva", "ssgsea", "zscore", "plage"), kcdf=c("Gaussian", "Poisson", "none"), abs.ranking=FALSE, min.sz=1, max.sz=Inf, parallel.sz=0, parallel.type="SOCK", mx.diff=TRUE, tau=switch(method, gsva=1, ssgsea=0.25, NA), ssgsea.norm=TRUE, verbose=TRUE)
模擬數(shù)據(jù)的使用#構(gòu)造一個 30個樣本,2萬個基因的表達(dá)矩陣,, 加上 100 個假定的基因集 p <- 20000 ## number of genes n <- 30 ## number of samples nGS <- 100 ## number of gene sets min.sz <- 10 ## minimum gene set size max.sz <- 100 ## maximum gene set size X <- matrix(rnorm(p*n), nrow=p, dimnames=list(1:p, 1:n)) dim(X) gs <- as.list(sample(min.sz:max.sz, size=nGS, replace=TRUE)) ## sample gene set sizes gs <- lapply(gs, function(n, p) sample(1:p, size=n, replace=FALSE), p) ## sample gene sets es.max <- gsva(X, gs, mx.diff=FALSE, verbose=FALSE, parallel.sz=1) es.dif <- gsva(X, gs, mx.diff=TRUE, verbose=FALSE, parallel.sz=1) pheatmap::pheatmap(es.max) pheatmap::pheatmap(es.dif) mx.diff=TRUE時,,ES值近似正態(tài)分布,mx.diff=FALSE時,,ES值是一個雙峰的分布,。 par(mfrow=c(1,2), mar=c(4, 4, 4, 1))
#繪制密度分布圖 plot(density(as.vector(es.max)), main="Maximum deviation from zero",xlab="GSVA score", lwd=2, las=1, xaxt="n", xlim=c(-0.75, 0.75), cex.axis=0.8) axis(1, at=seq(-0.75, 0.75, by=0.25), labels=seq(-0.75, 0.75, by=0.25), cex.axis=0.8) 雙峰分布 正態(tài)分布 plot(density(as.vector(es.dif)), main="Difference between largest\npositive and negative deviations",xlab="GSVA score", lwd=2, las=1, xaxt="n", xlim=c(-0.75, 0.75), cex.axis=0.8) axis(1, at=seq(-0.75, 0.75, by=0.25), labels=seq(-0.75, 0.75, by=0.25), cex.axis=0.8) dev.off() 一般會默認(rèn)選擇正態(tài)分布 芯片數(shù)據(jù)實(shí)操導(dǎo)入數(shù)據(jù)(文末有數(shù)據(jù)獲取方式)GSE<-read.table('GSE.txt',header = T,sep = '\t') GSE[1:5,1:5] dim(GSE) 芯片數(shù)據(jù)預(yù)處理GSE=as.matrix(GSE) rownames(GSE)=GSE[,1] exp=GSE[,2:ncol(GSE)] dimnames=list(rownames(exp),colnames(exp)) mat=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames) mat=avereps(mat)#Condense a microarray data object so that values for within-array replicate probes are replaced with their average. mat[1:5,1:5] dim(mat) 芯片數(shù)據(jù)標(biāo)準(zhǔn)化mat=normalizeBetweenArrays(mat) gmt格式的基因集獲取方式登錄GSEA網(wǎng)址,https://www./gsea/index.jsp 點(diǎn)擊箭頭所示 這里需要郵箱注冊,,文件下載后這樣的,,共有2萬多行(文末有獲取方式) 加載gmt格式的基因集geneSets <- getGmt("msigdb.v7.0.symbols.gmt") GSVA分析Result=gsva(mat, geneSets, min.sz=10, max.sz=500, verbose=TRUE, parallel.sz=1 查看,保存結(jié)果Result[1:5,1:5] write.table(Result,file="gsvaresult.txt",sep="\t",quote=F,col.names=F) 用LIMMA包進(jìn)行差異分析logFCcutoff=0.3 adjPvalueCutoff=0.05 type=c( rep("con",25),rep("treat",25) ) design=model.matrix(~ type) colnames(design)=c("con", "treat") fit=lmFit(Result, design) fit=eBayes(fit) dif=topTable(fit, coef="con", number=Inf,adjust.method="holm") 查看,,保存結(jié)果dif[1:5,1:5] write.table(dif,file="dif.txt",sep="\t",quote=F) 火山圖繪制colnames(dif) dif$type[(dif$adj.P.Val > 0.05|dif$adj.P.Val=="NA")|(dif$logFC < 0.3)& dif$logFC > -0.3] <- "none significant" dif$type[dif$adj.P.Val <= 0.05 & dif$logFC >= 0.3] <- "up-regulated" dif$type[dif$adj.P.Val <= 0.05 & dif$logFC <= -0.3] <- "down-regulated"
library(ggplot2) p = ggplot(dif,aes(logFC,-1*log10(adj.P.Val),color=type)) p + geom_point() 火山圖美化(or 丑化) x_lim <- max(dif$logFC,-dif$logFC) gg=p + geom_point( size=1) + xlim(-x_lim,x_lim) +labs(x="log2(FC)",y="-log10(adjP)")+ scale_color_manual(values =c("#A52A2A","grey","#f8766d"))+ geom_hline(aes(yintercept=-1*log10(0.05)),colour="black", linetype="dashed") + geom_vline(xintercept=c(-0.3,0.3),colour="black", linetype="dashed") print(gg) 熱圖繪制篩選差異基因集矩陣 dif2 <- topTable(fit, coef="con", number=Inf, p.value=adjPvalueCutoff, adjust="holm", lfc=logFCcutoff) pheatmap_data <-Result[rownames(dif2),] 作圖 annotation=c(rep("con",25),rep("treat",25)) names(annotation)=colnames(Result) annotation=as.data.frame(annotation)
pheatmap(pheatmap_data, annotation=annotation, cluster_cols =F,show_rownames = F) RNA-seq數(shù)據(jù)實(shí)操導(dǎo)入數(shù)據(jù)rm(list = ls()) mRNAdata<-read.table('LIHC FPKM.txt',header = T, sep = '\t') 處理數(shù)據(jù)row.names(mRNAdata)<-mRNAdata[,1] mRNAdata<-mRNAdata[,-1] #將數(shù)據(jù)框轉(zhuǎn)換成矩陣 mRNAdata = as.matrix(mRNAdata) mRNAdata[1:4,1:4] 加載GMT文件geneSets <- getGmt("msigdb.v7.0.symbols.gmt") GSVA算分(由于數(shù)據(jù)量大,,這一步比較耗時)Result=gsva(mRNAdata, geneSets, min.sz=10, max.sz=500, verbose=T, parallel.sz=1)
Result[1:5,1:5] 根據(jù)TCGA樣本名設(shè)計分組(具體方法可見TCGA差異分析及ggplot作圖驗(yàn)證) group_list=ifelse(as.numeric(substr(colnames(mRNAdata),14,15)) < 10,'tumor','normal') design <- model.matrix(~0+factor(group_list)) colnames(design)=levels(factor(group_list)) rownames(design)=colnames(mRNAdata) design 用LIMMA包進(jìn)行差異分析fit <- lmFit(Result, design) cont.matrix=makeContrasts(contrasts=c('tumor-normal'),levels = design) fit2=contrasts.fit(fit,cont.matrix) fit2=eBayes(fit2) tempOutput = topTable(fit2, coef='tumor-normal', n=Inf) dif = na.omit(tempOutput) 查看,保存結(jié)果head(dif) write.table(dif,file="dif_RNAseq",sep="\t",quote=F) 繪制火山圖#火山圖繪制 colnames(dif) dif$type[(dif$adj.P.Val > 0.05|dif$adj.P.Val=="NA")|(dif$logFC < 0.5)& dif$logFC > -0.5] <- "none significant" dif$type[dif$adj.P.Val <= 0.05 & dif$logFC >= 0.5] <- "up-regulated" dif$type[dif$adj.P.Val <= 0.05 & dif$logFC <= -0.5] <- "down-regulated"
library(ggplot2) p = ggplot(dif,aes(logFC,-1*log10(adj.P.Val),color=type)) p + geom_point() 美化(or 丑化)火山圖x_lim <- max(dif$logFC,-dif$logFC) gg=p + geom_point( size=1) + xlim(-x_lim,x_lim) +labs(x="log2(FC)",y="-log10(adjP)")+ scale_color_manual(values =c("#A52A2A","grey","#f8766d"))+ geom_hline(aes(yintercept=-1*log10(0.05)),colour="black", linetype="dashed") + geom_vline(xintercept=c(-0.5,0.5),colour="black", linetype="dashed") print(gg) 繪制熱圖數(shù)據(jù)準(zhǔn)備 logFCcutoff=0.5 adjPvalueCutoff=0.05 dif2<-as.data.frame(dif) dif2$name<-row.names(dif2) dif2 <- filter(dif2, logFC>0.5|logFC< -0.5, adj.P.Val <= 0.05) row.names(dif2)<-dif2[,'name']
pheatmap_data <-Result[rownames(dif2),] 作圖 names(group_list)=colnames(Result) group_list=as.data.frame(group_list)
pheatmap(pheatmap_data, annotation=group_list, cluster_cols =F,show_rownames=F) 下面是這篇文章的示例文件和代碼 |
|