在TCGA數(shù)據(jù)挖掘如此流行的現(xiàn)在,,小編也來插一腳,,我就先拿最簡單的差異分析做例子吧。 比如我下載乳腺癌的所有芯片得到的表達(dá)矩陣,,然后根據(jù)樣本的分組,,比如正常組織的表達(dá)矩陣和疾病組織的表達(dá)矩陣就可以做差異分析,。其實和我們之前推出的GEO數(shù)據(jù)挖掘系列相差無幾,只不過芯片表達(dá)矩陣的下載地址換成了TCGA的數(shù)據(jù)庫,。 假如你成功地學(xué)習(xí)了我們之前的GEO系列教程,,那么本教程對你而言,最重要的知識點其實就是數(shù)據(jù)如何下載,。 那么我們就開始吧?。?/span> 尤其需要注意的是,,TCGA數(shù)據(jù)庫里對病人來說,,量化他們的基因的表達(dá)值,經(jīng)歷了兩個階段,。起初都是用表達(dá)芯片的手段,,對乳腺癌來說有接近600個表達(dá)芯片的數(shù)據(jù),后來隨著NGS技術(shù)的大行其道,,乳腺癌全部的1000多個病人都進(jìn)行了RNA-seq測序,,考慮到有些病人既測量了他們的疾病組織也測量了他們的正常組織,所以我們其實是可以下載到1300個樣本的數(shù)據(jù),。 芯片數(shù)據(jù) 1 通過firehose下載...... 數(shù)據(jù)存放網(wǎng)址: http://gdac./runs/stddata__2016_01_28/data/BRCA/20160128/ 下載命令: wget http://gdac./runs/stddata__2016_01_28/data/BRCA/20160128/gdac._BRCA.mRNA_Preprocess_Median.Level_3.2016012800.0.0.tar.gz 2 通過UCSC Xena下載...... 數(shù)據(jù)存放網(wǎng)址: https:///datapages/?dataset=TCGA.BRCA.sampleMap%2FAgilentG4502A_07_3&host=https%3A%2F%2Ftcga.&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443 下載命令: wget https://tcga./download/TCGA.BRCA.sampleMap/AgilentG4502A_07_3.gz UCSC Xena的測序數(shù)據(jù)位于: https:///datapages/?cohort=TCGA%20Breast%20Cancer%20(BRCA)&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443 所以我們在做芯片差異分析表達(dá)數(shù)據(jù)的時候,,一定要下載正確的數(shù)據(jù)哦~ 數(shù)據(jù)準(zhǔn)備 rm(list = ls()) options(stringsAsFactors = F) if(F){ array_brca=read.table('BRCA.medianexp.txt.gz',header = T,sep=' ',fill=T,quote = '') array_brca[1:4,1:4] array_brca=array_brca[-1,] rownames(array_brca)=array_brca[,1] array_brca=array_brca[,-1]
exprSet=array_brca exprSet[1:4,1:4] group_list=ifelse(as.numeric(substr(colnames(array_brca),14,15)) <> 根據(jù)TCGA樣本的命名可以區(qū)分正常組織和腫瘤樣本的測序結(jié)果,詳情閱讀最后的原文,。 exprSet=as.data.frame(lapply(exprSet,as.numeric)) rownames(exprSet)=rownames(array_brca) exprSet=na.omit(exprSet) exprSet[1:4,1:4] dim(exprSet) save(exprSet,group_list,file = 'tcga_brca_array_input.Rdata') } load(file = 'tcga_brca_array_input.Rdata') 差異分析 library( 'limma' ) { design <- model.matrix(="" ~0="" +="" factor(="" group_list="" )=""> colnames( design ) = levels( factor( group_list ) ) rownames( design ) = colnames( exprSet ) } design contrast.matrix <- makecontrasts(="" 'tumor-normal',="" levels="design"> contrast.matrix { fit <- lmfit(="" exprset,="" design=""> fit2 <- contrasts.fit(="" fit,="" contrast.matrix=""> fit2 <- ebayes(="" fit2=""> nrDEG = topTable( fit2, coef = 1, n = Inf ) write.table( nrDEG, file = 'nrDEG_BRCA_medianexp.out') } head(nrDEG) 繪制熱圖 library( 'pheatmap' ) { tmp = nrDEG[nrDEG$P.Value <> 差異結(jié)果需要先根據(jù)p值挑選 nrDEG_Z = tmp[ order( tmp$logFC ), ] nrDEG_F = tmp[ order( -tmp$logFC ), ] choose_gene = c( rownames( nrDEG_Z )[1:100], rownames( nrDEG_F )[1:100] ) choose_matrix = exprSet[ choose_gene, ] choose_matrix = t( scale( t( choose_matrix ) ) )
choose_matrix[choose_matrix > 2] = 2 choose_matrix[choose_matrix < -2]="">
annotation_col = data.frame( CellType = factor( group_list ) ) rownames( annotation_col ) = colnames( exprSet ) pheatmap( fontsize = 2, choose_matrix, annotation_col = annotation_col, show_rownames = F, annotation_legend = F, filename = 'heatmap_BRCA_medianexp.png') } ?????瞄一眼????? ■ ■ ■ 繪制火山圖 library( 'ggplot2' ) logFC_cutoff <- with(="" nrdeg,="" mean(="" abs(="" logfc="" )="" )="" +="" 2="" *="" sd(="" abs(="" logfc="" )="" )=""> logFC_cutoff logFC_cutoff = 1.2 { nrDEG$change = as.factor( ifelse( nrDEG$P.Value < 0.05="" &="" abs(nrdeg$logfc)=""> logFC_cutoff, ifelse( nrDEG$logFC > logFC_cutoff , 'UP', 'DOWN' ), 'NOT' ) )
save( nrDEG, file = 'nrDEG_array_medianexp.Rdata' )
this_tile <- paste0(="" 'cutoff="" for="" logfc="" is="" ',="" round(="" logfc_cutoff,="" 3=""> 'The number of up gene is ', nrow(nrDEG[ nrDEG$change =='UP', ] ), 'The number of down gene is ', nrow(nrDEG[ nrDEG$change =='DOWN', ] ) )
volcano = ggplot(data = nrDEG, aes( x = logFC, y = -log10(P.Value), color = change)) + geom_point( alpha = 0.4, size = 1.75) + theme_set( theme_set( theme_bw( base_size = 15 ) ) ) + xlab( 'log2 fold change' ) + ylab( '-log10 p-value' ) + ggtitle( this_tile ) + theme( plot.title = element_text( size = 15, hjust = 0.5)) + scale_colour_manual( values = c('blue','black','red') ) print( volcano ) ggsave( volcano, filename = 'volcano_BRCA_medianexp.png' ) dev.off() } ?????瞄一眼????? ■ ■ ■ KEGG注釋 library( 'clusterProfiler' ) library( 'org.Hs.eg.db' ) df <- bitr(="" rownames(="" nrdeg="" ),="" fromtype='SYMBOL' ,="" totype="c(" 'entrezid'="" ),="" orgdb="org.Hs.eg.db"> head( df ) { nrDEG$SYMBOL = rownames( nrDEG ) nrDEG = merge( nrDEG, df, by='SYMBOL' ) } head( nrDEG ) { gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ] gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ] gene_diff = c( gene_up, gene_down ) gene_all = as.character(nrDEG[ ,'ENTREZID'] ) } { geneList = nrDEG$logFC names( geneList ) = nrDEG$ENTREZID geneList = sort( geneList, decreasing = T ) } library( 'ggplot2' ) # kegg enrich {
{ ## KEGG pathway analysis kk.up <- enrichkegg( ="" gene ="" ="" ="" ="" =" " gene_up ="" =""> organism = 'hsa' , universe = gene_all , pvalueCutoff = 0.99 , qvalueCutoff = 0.99 ) kk.down <- enrichkegg(="" gene ="" ="" ="" ="" =" " gene_down =""> organism = 'hsa' , universe = gene_all , pvalueCutoff = 0.99 , qvalueCutoff = 0.99 ) }
head( kk.up )[ ,1:6 ] head( kk.down )[ ,1:6 ] kegg_down_dt <- as.data.frame(="" kk.down=""> kegg_up_dt <- as.data.frame(="" kk.up=""> down_kegg <- kegg_down_dt[="" kegg_down_dt$pvalue="">< 0.05,=""> down_kegg$group = -1 up_kegg <- kegg_up_dt[="" kegg_up_dt$pvalue="">< 0.05,=""> up_kegg$group = 1
dat = rbind( up_kegg, down_kegg ) dat$pvalue = -log10( dat$pvalue ) dat$pvalue = dat$pvalue * dat$group
dat = dat[ order( dat$pvalue, decreasing = F ), ]
g_kegg <- ggplot(=""> aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) + geom_bar( stat = 'identity' ) + scale_fill_gradient( low = 'blue', high = 'red', guide = FALSE ) + scale_x_discrete( name = 'Pathway names' ) + scale_y_continuous( name = 'log10P-value' ) + coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) + ggtitle( 'Pathway Enrichment' ) print( g_kegg ) ggsave( g_kegg, filename = 'kegg_up_down.png' ) } ?????瞄一眼????? ■ ■ ■ GSEA注釋 { ### GSEA kk_gse <- gsekegg(genelist ="" ="" =""> organism = 'hsa', nPerm = 1000, minGSSize = 30, pvalueCutoff = 0.9, verbose = FALSE) head(kk_gse)[,1:6] gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
down_kegg<><0.01 &="" kk_gse$enrichmentscore="">< 0,];down_kegg$group=""> up_kegg<><0.01 &="" kk_gse$enrichmentscore=""> 0,];up_kegg$group=1
dat = rbind( up_kegg, down_kegg ) dat$pvalue = -log10( dat$pvalue ) dat$pvalue = dat$pvalue * dat$group
dat = dat[ order( dat$pvalue, decreasing = F ), ]
g_kegg <- ggplot(=""> aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) + geom_bar( stat = 'identity' ) + scale_fill_gradient( low = 'blue', high = 'red', guide = FALSE ) + scale_x_discrete( name = 'Pathway names' ) + scale_y_continuous( name = 'log10P-value' ) + coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) + ggtitle( 'Pathway Enrichment' ) print( g_kegg ) ggsave(g_kegg,filename = 'kegg_up_down_gsea.png') } ?????瞄一眼????? ■ ■ ■ GO注釋 g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff) go_enrich_results <- lapply(="" g_list,="" function(="" gene="" )=""> lapply( c( 'BP', 'MF', 'CC' ) , function( ont ) { cat( paste( 'Now process', ont ) ) ego <- enrichgo(="" gene ="" ="" ="" ="" =" "> universe = gene_all, OrgDb = org.Hs.eg.db, ont = ont , pAdjustMethod = 'BH', pvalueCutoff = 0.99, qvalueCutoff = 0.99, readable = TRUE) print( head( ego ) ) return( ego ) }) }) save( go_enrich_results, file = 'go_enrich_results.Rdata' ) n1 = c( 'gene_up', 'gene_down', 'gene_diff' ) n2 = c( 'BP', 'MF', 'CC' ) for ( i in 1:3 ){ for ( j in 1:3 ){ fn = paste0( 'dotplot_', n1[i], '_', n2[j], '.png' ) cat( paste0( fn, '' ) ) png( fn, res = 150, width = 1080 ) print( dotplot( go_enrich_results[[i]][[j]] ) ) dev.off() } } 圖略~~~~~~ |
|