8.1 基因集富集分析 GSEA

基因集表示基因集中基因的一般共享生物属性的先验知识。

8.1.1 over-representation analysis (ORA)

基因本体论 (Gene Ontology,GO) 基因集

京都基因与基因组百科全书 (Kyoto Encyclopedia of Genes and Genomes,KEGG)基因集

Code
# BiocManager::install('clusterProfiler')
# BiocManager::install("org.Hs.eg.db")

DEG <- read_csv("data/DEGs_Analysis/resOrdered.csv")
x <-  na.omit(DEG) 

对于有参考基因组物种的分析,可以在相关软件包中直接加载该物种的背景基因集。对于常见的模式物种,例如BiocManager::install("org.Hs.eg.db") ,安装人类hg19的注释包 BiocManager::install("org.Mm.eg.db") ,安装小鼠的注释包

8.2 基因注释

Code
deg_df <- x[,c(1,3,7)]
head(deg_df)
gene log2FoldChange padj
MGAT3 -4.320278 0
RTKN2 -4.640307 0
GPM6A -5.145635 0
SERTM1 -6.886260 0
LIMS2 -2.660279 0
EMP2 -2.671839 0
Code

deg_df$expression <- factor( 
    ifelse( deg_df$padj< 0.05 & abs(deg_df$log2FoldChange) >= 1,
            if_else( deg_df$log2FoldChange >= 1 , 'up', 'down' ),
            'NS'),
    levels = c("up","down","NS"))
table(deg_df$expression)
#> 
#>    up  down    NS 
#>  2073  1641 15358

gtf_v22_transcripts <- read_delim("data/DEGs_Analysis/gencode.v22.annotation.gene.probeMap")


# 之前差异分析可以保留ENSEMBL标识符  
deg_df <- deg_df |> 
 inner_join(gtf_v22_transcripts, by = c( "gene"= "gene"))

# 移除ENSEMBL版本号
deg_df$ENSEMBL <- str_sub(deg_df$id, start = 1, end = 15)

R 语言中 AnnotationDbi 包提供的一个函数名bitr() 代表 “Biological Identifier Translation”,即生物学标识符转换。这个函数用于在不同的基因标识符之间进行转换,例如从基因的 Ensembl ID 转换为 Entrez ID(特定基因产物))、基因符号(symbol)或基因名称(gene name)。

Entrez是National Center for Biotechnology Information (NCBI) 提供的一个综合性生物信息数据库查询系统。它允许用户搜索和检索包括基因、蛋白质、核酸序列、3D蛋白质结构、基因表达数据、基因组数据、遗传多态性、生物项目、生物文献等各种生物医学信息。Entrez Gene数据库中的基因ID是唯一的数字标识符,用于在NCBI的Entrez系统内标识特定的基因。

clusterProfiler

Code
library(clusterProfiler)
conflicts_prefer(GenomicRanges::setdiff)
library(org.Hs.eg.db)
AnnotationDbi::keytypes(org.Hs.eg.db)
#>  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
#>  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
#> [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MAP"         
#> [16] "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"        
#> [21] "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
#> [26] "UNIPROT"
head(keys(org.Hs.eg.db, "ENSEMBL"))
#> [1] "ENSG00000121410" "ENSG00000175899" "ENSG00000291190" "ENSG00000171428"
#> [5] "ENSG00000156006" "ENSG00000196136"
head(keys(org.Hs.eg.db, "ENTREZID"))
#> [1] "1"  "2"  "3"  "9"  "10" "11"
head(keys(org.Hs.eg.db, "SYMBOL")) #基因名称缩写
#> [1] "A1BG"  "A2M"   "A2MP1" "NAT1"  "NAT2"  "NATP"
head(keys(org.Hs.eg.db, "GENENAME")) # 基因名称全称
#> [1] "alpha-1-B glycoprotein"             "alpha-2-macroglobulin"             
#> [3] "alpha-2-macroglobulin pseudogene 1" "N-acetyltransferase 1"             
#> [5] "N-acetyltransferase 2"              "N-acetyltransferase pseudogene"

valid_ensembl <- deg_df$ENSEMBL %in% keys(org.Hs.eg.db, "ENSEMBL")  
table(valid_ensembl)
#> valid_ensembl
#> FALSE  TRUE 
#>  2269 16888

valid_symbol <- deg_df$gene %in% keys(org.Hs.eg.db, "SYMBOL")  
table(valid_symbol)
#> valid_symbol
#> FALSE  TRUE 
#>  3249 15908


deg_df_filtered <- deg_df[valid_ensembl, ]  

# Biological Identifier Translation
df <- bitr(
 geneID= deg_df_filtered$ENSEMBL,
 fromType = "ENSEMBL", 
 toType = c( "ENTREZID","SYMBOL","GENENAME" ), 
 OrgDb = org.Hs.eg.db )

head(df)
ENSEMBL ENTREZID SYMBOL GENENAME
ENSG00000128268 4248 MGAT3 beta-1,4-mannosyl-glycoprotein 4-beta-N-acetylglucosaminyltransferase
ENSG00000182010 219790 RTKN2 rhotekin 2
ENSG00000150625 2823 GPM6A glycoprotein M6A
ENSG00000180440 400120 SERTM1 serine rich and transmembrane domain containing 1
ENSG00000072163 55679 LIMS2 LIM zinc finger domain containing 2
ENSG00000213853 2013 EMP2 epithelial membrane protein 2
Code

deg_df <- inner_join(deg_df_filtered, df, by='ENSEMBL' )

进行上调和下调基因的分离是为了更精确地识别与特定生物学过程或疾病状态相关的途径。例如,某些途径可能在疾病发生时被激活,导致相关基因上调,而其他途径可能被抑制,导致相关基因下调。

Code
gene_up <- deg_df[ deg_df$expression == 'up', "ENTREZID" ] 
gene_down <- deg_df[ deg_df$expression == 'down', "ENTREZID"]


gene_all <- deg_df[ ,"ENTREZID"]

library(biomaRt)

# 使用biomaRt映射基因ID
gene_info <- getBM(attributes = c("hgnc_symbol", "entrezgene_id"), 
                     filters = "entrezgene_id", 
                     values = gene_up$ENTREZID, 
                     mart = useMart("ensembl", dataset = "hsapiens_gene_ensembl"))

8.3 KEGG 通路富集

Code
kegg_up <- clusterProfiler::enrichKEGG(gene = gene_up$ENTREZID ,# entrez gene id
                    organism=  'hsa',
                    keyType = "kegg",
                    pvalueCutoff  =  0.05,
                    pAdjustMethod = "BH",  # FDR多重假设检验校正 qvalue
                    universe=  gene_all$ENTREZID,
                    minGSSize = 10,
                    maxGSSize = 500,
                    qvalueCutoff  =  0.05 )
kegg_down <- clusterProfiler::enrichKEGG(gene = gene_down$ENTREZID ,
                    organism=  'hsa',
                    keyType = "kegg",
                    pvalueCutoff  =  0.05,
                    pAdjustMethod = "BH", 
                    universe=  gene_all$ENTREZID,
                    qvalueCutoff  =  0.05 )
Code
df_up<- as.data.frame(kegg_up)
df_down<- as.data.frame(kegg_down)
df_up
category subcategory ID Description GeneRatio BgRatio pvalue p.adjust qvalue geneID Count
hsa04110 Cellular Processes Cell growth and death hsa04110 Cell cycle 56/763 154/6926 0.0000000 0.0000000 0.0000000 5347/898/990/701/113130/991/151648/9232/9212/9133/891/995/9088/23594/7272/890/4998/9700/57082/1111/10403/157570/8318/4171/1029/80174/699/4173/90381/4175/4085/81620/1870/983/9134/11200/27085/993/3065/4176/1871/5111/5001/10926/9319/5933/7157/26271/10459/1875/5591/1019/1869/8317/1663/4193 56
hsa01230 Metabolism Global and overview maps hsa01230 Biosynthesis of amino acids 19/763 64/6926 0.0000356 0.0045716 0.0042807 5831/84706/5832/6472/29968/2597/5214/2023/3418/5091/1373/10993/7167/6888/226/3417/440/4143/384 19
hsa01232 Metabolism Global and overview maps hsa01232 Nucleotide metabolism 22/763 81/6926 0.0000409 0.0045716 0.0042807 205/6241/978/7083/377841/7298/4830/7498/100/4907/124583/4833/122622/4831/272/3251/56953/1841/9615/7371/5167/954 22
hsa05206 Human Diseases Cancer: overview hsa05206 MicroRNAs in cancer 34/763 155/6926 0.0000549 0.0046001 0.0043074 898/113130/1591/995/1945/2146/5268/1944/1029/9493/8091/1870/5328/9134/7473/1789/1946/672/4318/993/6659/3065/1871/6464/7157/90427/6768/3845/3925/1869/54541/4193/6624/2065 34
hsa00330 Metabolism Amino acid metabolism hsa00330 Arginine and proline metabolism 14/763 45/6926 0.0002183 0.0123365 0.0115516 5831/79814/1159/548596/5832/26/283208/6723/54498/6611/2628/219/384/2593 14
hsa03030 Genetic Information Processing Replication and repair hsa03030 DNA replication 12/763 35/6926 0.0002210 0.0123365 0.0115516 1763/4171/4173/2237/5427/4175/5984/5558/10535/4176/5111/6119 12
hsa00480 Metabolism Metabolism of other amino acids hsa00480 Glutathione metabolism 15/763 51/6926 0.0002628 0.0125753 0.0117751 2877/6241/2729/2936/79017/3418/6723/2730/6611/2539/3417/4257/493869/5226/494143 15
hsa01200 Metabolism Global and overview maps hsa01200 Carbon metabolism 23/763 101/6926 0.0004909 0.0193605 0.0181286 84706/6472/29968/2597/5214/2023/2731/3418/5091/1373/10993/2821/7167/6888/2539/80201/226/3417/55753/2653/4199/5226/6389 23
hsa00240 Metabolism Nucleotide metabolism hsa00240 Pyrimidine metabolism 15/763 54/6926 0.0005201 0.0193605 0.0181286 6241/978/7083/377841/7298/4830/4907/124583/4833/4831/790/56953/1841/7371/5167 15
hsa03460 Genetic Information Processing Replication and repair hsa03460 Fanconi anemia pathway 14/763 50/6926 0.0007264 0.0223526 0.0209303 29089/83990/116028/55215/641/146956/5888/2189/2177/2175/2187/91442/672/6119 14
hsa05203 Human Diseases Cancer: overview hsa05203 Viral carcinogenesis 33/763 171/6926 0.0008538 0.0223526 0.0209303 2648/898/991/890/3017/1111/1029/3718/983/1237/9134/7188/440689/3665/2961/3065/5366/7185/8339/5933/7157/1232/148327/8294/85236/5902/3845/7186/1019/8365/4193/8347/7419 33
hsa04115 Cellular Processes Cell growth and death hsa04115 p53 signaling pathway 18/763 74/6926 0.0008674 0.0223526 0.0209303 898/9133/891/51512/6241/5268/1111/1029/983/9134/11200/64065/5366/84883/7157/1019/3486/4193 18
hsa04974 Organismal Systems Digestive system hsa04974 Protein digestion and absorption 18/763 74/6926 0.0008674 0.0223526 0.0209303 3783/1301/1300/6564/1281/1294/1308/1290/169044/1289/1277/255631/5646/478/1278/1293/1298/1299 18
hsa01240 Metabolism Global and overview maps hsa01240 Biosynthesis of cofactors 27/763 135/6926 0.0014208 0.0339975 0.0318343 205/6999/1728/6472/29968/4830/7358/80308/2729/29926/10797/53630/4833/23475/2730/122622/4831/3145/790/23057/1719/219/8942/8836/4143/54578/93100 27
Code
df_down
category subcategory ID Description GeneRatio BgRatio pvalue p.adjust qvalue geneID Count
hsa04080 Environmental Information Processing Signaling molecules and interaction hsa04080 Neuroactive ligand-receptor interaction 44/635 181/6926 0.0000000 0.0000003 0.0000002 1910/154/7433/10316/1901/10203/1129/2890/148/2900/2358/5028/2922/59350/1268/554/8698/5745/1906/22953/122042/6751/53637/9002/117/185/5734/153/728/146/3953/2894/1908/1132/1128/2357/5179/3355/187/2690/2740/796/8862/147 44
hsa04270 Organismal Systems Circulatory system hsa04270 Vascular smooth muscle contraction 31/635 116/6926 0.0000000 0.0000042 0.0000036 10266/10268/255189/10398/2977/10203/196883/50487/148/5581/779/5319/5592/4629/54776/91807/5583/1906/72/4881/108/10242/5588/114/185/146/115/1908/4628/796/147 31
hsa04820 NA NA hsa04820 Cytoskeleton in muscle cells 43/635 201/6926 0.0000001 0.0000081 0.0000070 64236/27063/7134/6445/10398/1288/2273/23500/477/171024/88/4629/4620/1285/6442/25802/161176/72/1286/6711/10529/84467/7431/1674/2192/9172/6383/91977/25777/2006/131873/825/7273/8516/58529/56203/8736/4628/7111/29767/7140/4634/23345 43
hsa04610 Organismal Systems Immune system hsa04610 Complement and coagulation cascades 21/635 71/6926 0.0000008 0.0000611 0.0000528 5648/7056/7450/11326/732/9002/1361/2157/1604/2159/2160/728/5627/3687/10747/722/717/712/2/1675/713 21
hsa05144 Human Diseases Infectious disease: parasitic hsa05144 Malaria 16/635 46/6926 0.0000015 0.0000934 0.0000808 5175/2995/6403/1440/948/3569/2532/3043/7042/3040/6383/7099/3039/3082/3683/3553 16
hsa04024 Environmental Information Processing Signal transduction hsa04024 cAMP signaling pathway 36/635 176/6926 0.0000028 0.0001455 0.0001257 154/10398/196883/1129/2890/477/779/338442/11069/64399/627/1906/5140/4881/108/6751/114/10411/8843/117/2353/153/115/51196/1262/64411/1908/6262/5348/4772/1128/3725/3355/5295/5350/2740 36
hsa04924 Organismal Systems Endocrine system hsa04924 Renin secretion 17/635 60/6926 0.0000166 0.0007425 0.0006417 154/2977/358/779/5153/1636/5972/1906/5140/4881/117/185/5593/5734/153/1908/5137 17
hsa04015 Environmental Information Processing Signal transduction hsa04015 Rap1 signaling pathway 34/635 185/6926 0.0000562 0.0021692 0.0018748 7010/51378/196883/2255/2277/9732/5028/11069/10235/8817/1268/5587/5155/260425/2247/108/114/10411/9002/2324/5590/3791/9771/115/9863/51196/64411/284/9223/3082/3683/2357/5295/2264 34
hsa04022 Environmental Information Processing Signal transduction hsa04022 cGMP-PKG signaling pathway 28/635 141/6926 0.0000624 0.0021692 0.0018748 1910/154/10398/2977/196883/148/5581/477/779/5592/91807/5140/4881/5138/108/10242/114/185/5593/153/8654/146/115/3764/23533/4772/5350/147 28
hsa04020 Environmental Information Processing Signal transduction hsa04020 Calcium signaling pathway 36/635 203/6926 0.0000745 0.0023306 0.0020143 1910/7134/154/845/196883/1129/2255/2277/148/779/5153/8817/91807/22953/5155/2247/108/114/2324/185/153/3791/146/115/51196/6262/3082/4772/1128/2264/4915/5350/5137/2066/4843/147 36
hsa04380 Organismal Systems Development and regeneration hsa04380 Osteoclast differentiation 26/635 131/6926 0.0001144 0.0032557 0.0028139 7048/126014/10326/9846/54/6688/4286/3552/7042/4688/2353/11025/102725035/107987425/107987462/11027/2354/695/4772/3553/3725/3727/5295/3726/5468/79168 26
hsa04923 Organismal Systems Endocrine system hsa04923 Regulation of lipolysis in adipocytes 13/635 46/6926 0.0001684 0.0043932 0.0037970 2167/154/196883/5592/5140/4881/108/114/5593/153/115/11343/5295 13
hsa04970 Organismal Systems Digestive system hsa04970 Salivary secretion 16/635 69/6926 0.0003885 0.0093543 0.0080849 154/2977/196883/148/477/5592/108/114/683/5593/153/820/146/115/1473/147 16
hsa04261 Organismal Systems Circulatory system hsa04261 Adrenergic signaling in cardiomyocytes 25/635 135/6926 0.0004720 0.0100425 0.0086797 7134/154/196883/148/477/779/9254/11069/6330/108/114/10411/185/153/146/115/23533/6262/3753/785/6332/5350/4634/55799/147 25
hsa05414 Human Diseases Cardiovascular disease hsa05414 Dilated cardiomyopathy 19/635 91/6926 0.0004813 0.0100425 0.0086797 7134/6445/196883/779/9254/6442/108/7042/1674/114/153/115/7273/8516/6262/785/5350/4634/55799 19
hsa04933 Human Diseases Endocrine and metabolic disease hsa04933 AGE-RAGE signaling pathway in diabetic complications 20/635 99/6926 0.0005420 0.0106032 0.0091643 177/7048/1288/2277/5581/7056/3569/1285/1906/1286/3552/7042/5590/185/1958/51196/4772/3553/3725/5295 20
hsa04740 Organismal Systems Sensory system hsa04740 Olfactory transduction 10/635 35/6926 0.0008629 0.0150651 0.0130208 408/123041/57101/5153/5592/5138/83988/5593/1262/5137 10
hsa04060 Environmental Information Processing Signaling molecules and interaction hsa04060 Cytokine-cytokine receptor interaction 35/635 221/6926 0.0008664 0.0150651 0.0130208 7048/53342/94/6369/1440/55504/3563/3590/3569/2920/51554/6368/3977/6358/653/3552/7042/659/90865/2921/5473/3953/8741/654/9173/1271/3553/650/2662/85480/3575/2690/53832/3577/3579 35
hsa04514 Environmental Information Processing Signaling molecules and interaction hsa04514 Cell adhesion molecules 25/635 143/6926 0.0011339 0.0186796 0.0161448 23705/5175/1003/58494/6403/51208/64115/7122/6693/947/90952/5797/94030/3133/23114/83700/6383/6404/84631/8516/3683/22865/3384/4099/257194 25
hsa04926 Organismal Systems Endocrine system hsa04926 Relaxin signaling pathway 21/635 116/6926 0.0017605 0.0275521 0.0238133 1910/7048/408/53358/196883/1288/2277/59350/1285/1906/1286/122042/108/114/5590/2353/115/3725/5295/2791/4843 21
hsa03320 Organismal Systems Endocrine system hsa03320 PPAR signaling pathway 13/635 60/6926 0.0025795 0.0384467 0.0332295 2167/33/948/4973/4023/10580/6258/1593/2180/2171/79966/5468/28965 13
hsa04927 Organismal Systems Endocrine system hsa04927 Cortisol synthesis and secretion 12/635 54/6926 0.0029828 0.0409681 0.0354088 196883/779/3949/3739/108/114/185/1586/115/3777/8622/3164 12
hsa05150 Human Diseases Infectious disease: bacterial hsa05150 Staphylococcus aureus infection 13/635 61/6926 0.0030104 0.0409681 0.0354088 5648/6403/2358/6404/820/728/3683/10747/2357/717/712/1675/713 13

对于各列内容:

ID和Description,富集到的通路和功能描述;

GeneRatio和BgRatio,分别为富集到该通路中的基因数目/给定基因的总数目,以及该通路中背景基因总数目/该物种所有已知的KEGG功能基因数目;

pvalue、p.adjust和qvalue,p值、校正后p值和q值信息;

  1. p值(P-value)

    • p值是在一个假设检验中,当零假设(null hypothesis)为真时,观察到的统计量或更极端情况出现的概率。

    • 一个低的p值(如小于0.05)通常表明统计学上的显著性,意味着结果不太可能仅仅是由随机因素引起的。

    • 不太关心假阳性,而是更关注于发现尽可能多的显著性通路,原始的p值可能足够

  2. 校正后的p值(Adjusted P-value) - Benjamini-Hochberg (BH)方法

    • 在进行多重检验时,为了控制第一类错误(错误地拒绝零假设)的总体率,需要对p值进行校正。BH方法是一种流行的校正方法,它通过控制假发现率(false discovery rate, FDR)来进行校正。

    • BH校正后的p值(padj)是基于原始p值按照一定规则调整后的值,用于评估在多重比较情况下的统计显著性。

    • 平衡假阳性和假阴性的发现,可以使用BH方法的p.adjust值

  3. q值(Q-value)

    • q值同样是用于评估多重检验中的统计显著性,它直接控制了假发现率。

    • q值定义为在最坏的情况下,所接受的假设中错误发现的比例。例如,q=0.05意味着在所有被认为是显著的发现中,最多有5%实际上是假阳性。

    • 更保守地挑选显著性通路,以减少假阳性的可能性,可能会选择使用q值

    • R中使用”BY”表示q值校正

geneID和Count,富集到该通路中的基因名称和数目。

Code
df_up <- df_up[df_up$pvalue < 0.05, ]
df_down <- df_down[df_down$pvalue < 0.05, ]
lubridate::intersect(df_up$ID,df_down$ID)
#> character(0)
Code
df_up$group <- 1
df_down$group <- -1
Code
up_down <- rbind(df_up,df_down)
head(up_down)
category subcategory ID Description GeneRatio BgRatio pvalue p.adjust qvalue geneID Count group
hsa04110 Cellular Processes Cell growth and death hsa04110 Cell cycle 56/763 154/6926 0.0000000 0.0000000 0.0000000 5347/898/990/701/113130/991/151648/9232/9212/9133/891/995/9088/23594/7272/890/4998/9700/57082/1111/10403/157570/8318/4171/1029/80174/699/4173/90381/4175/4085/81620/1870/983/9134/11200/27085/993/3065/4176/1871/5111/5001/10926/9319/5933/7157/26271/10459/1875/5591/1019/1869/8317/1663/4193 56 1
hsa01230 Metabolism Global and overview maps hsa01230 Biosynthesis of amino acids 19/763 64/6926 0.0000356 0.0045716 0.0042807 5831/84706/5832/6472/29968/2597/5214/2023/3418/5091/1373/10993/7167/6888/226/3417/440/4143/384 19 1
hsa01232 Metabolism Global and overview maps hsa01232 Nucleotide metabolism 22/763 81/6926 0.0000409 0.0045716 0.0042807 205/6241/978/7083/377841/7298/4830/7498/100/4907/124583/4833/122622/4831/272/3251/56953/1841/9615/7371/5167/954 22 1
hsa05206 Human Diseases Cancer: overview hsa05206 MicroRNAs in cancer 34/763 155/6926 0.0000549 0.0046001 0.0043074 898/113130/1591/995/1945/2146/5268/1944/1029/9493/8091/1870/5328/9134/7473/1789/1946/672/4318/993/6659/3065/1871/6464/7157/90427/6768/3845/3925/1869/54541/4193/6624/2065 34 1
hsa00330 Metabolism Amino acid metabolism hsa00330 Arginine and proline metabolism 14/763 45/6926 0.0002183 0.0123365 0.0115516 5831/79814/1159/548596/5832/26/283208/6723/54498/6611/2628/219/384/2593 14 1
hsa03030 Genetic Information Processing Replication and repair hsa03030 DNA replication 12/763 35/6926 0.0002210 0.0123365 0.0115516 1763/4171/4173/2237/5427/4175/5984/5558/10535/4176/5111/6119 12 1

8.3.1 水平条图

Code
barplot(kegg_up, showCategory=20, xlab="-log10(pvalue)", ylab="Pathways") +
  theme_minimal()

Code
dotplot(kegg_up)

Code
up_down$Negative_log10pvalue <- -log10(up_down$pvalue)

up_down$Grouped_Negativelog10pvalue <- up_down$Negative_log10pvalue * up_down$group

up_down <- up_down[order( up_down$Grouped_Negativelog10pvalue), ]

ggplot(up_down, aes(
  x =  reorder(Description, order(Grouped_Negativelog10pvalue, decreasing = F)),
  y = Grouped_Negativelog10pvalue,
  fill = group
)) +
  geom_bar(stat = "identity") +
  coord_flip()+
  scale_fill_gradient(low = "blue", high = "red") +
  scale_x_discrete(name = "Pathway names") +
  scale_y_continuous(name = "-log10 pvalue")+
  theme(plot.title = element_text(hjust = 0.5) ) +
  ggtitle("KEGG Pathway Enrichment") 

8.3.2 点图

Code
# GeneRatio 转化成数值型
GR <- up_down$GeneRatio |> 
    str_split(pattern = "/",simplify = TRUE) |> as_tibble(.name_repair="unique")

up_down$GeneRatio <- parse_number(GR$...1)/parse_number(GR$...2)

up_down <- up_down[order(up_down$GeneRatio),]

ggplot(data = up_down,aes(x=GeneRatio,
                      y=reorder(Description,order(GeneRatio)),
                      color = Grouped_Negativelog10pvalue,
                      size=Count
                      )
       )+
    geom_point()+
    scale_x_continuous( name = "GeneRatio" ,breaks = seq(0,1.0,0.1))+
    scale_color_gradient(low = "blue", high = "red" )+
    scale_y_discrete( name = "Pathway names" ) +
    theme( plot.title = element_text( hjust = 0.5 )) +
    ggtitle( "KEGG Pathway Enrichment " )+
    labs(color="-log10 pvalue")

8.4 GO富集分析

Gene Ontology 基因集

Package Organism
org.Hs.eg.db Human
org.Mm.eg.db Mouse

对于输出的GO富集结果表格中的各列内容:

ONTOLOGY,GO的BP(生物学过程)、CC(细胞组分)或MF(分子功能);

ID和Description,富集到的GO term及其描述;

GeneRatio和BgRatio,分别为富集到该GO term中的基因数目/给定基因的总数目,以及该GO term中背景基因总数目/该物种所有已知GO功能基因数目;

pvalue、p.adjust和qvalue,p值、校正后p值和q值;

geneID和Count,富集到该GO term中的基因名称和数目。

Code
GO_EA<- clusterProfiler::enrichGO(gene = gene_all$ENTREZID,
                                  OrgDb = "org.Hs.eg.db",
                        
                                  keyType = "ENTREZID",
                                  ont = "ALL",
                                  pvalueCutoff  =  0.05 ,
                                  qvalueCutoff  =  0.05,
                                  readable = F)
GO_EA_tbl <-  as_tibble(GO_EA)

8.4.1 可视化

Code
#clusterProfiler 包里的一些默认作图方法,例如
barplot(GO_EA)  #富集柱形图

Code
dotplot(GO_EA)  #富集气泡图

Code
dat2 <- GO_EA_tbl[GO_EA_tbl$pvalue< 0.05, ]
dat2$Negative_log10pvalue <-  -log10(dat2$pvalue)
dat2 = dat2[order( dat2$Negative_log10pvalue, decreasing = F ), ]

dat2 <- dat2 |> 
    group_by(ONTOLOGY) |> 
    slice_max(order_by = Negative_log10pvalue,n=10) |> 
    ungroup()


ggplot(dat2,aes(x = Negative_log10pvalue, 
                y = reorder(Description,
                            order(ONTOLOGY,Negative_log10pvalue,decreasing=F)),
                fill = ONTOLOGY ))+ 
    geom_bar( stat = "identity" ) +
    scale_x_continuous( name = "-log10 pvalue" ) +
    scale_y_discrete( name = "Pathway names" ) +
    theme_bw() + 
    theme( plot.title = element_text( hjust = 0.5 ) ) +
    ggtitle( "GO Pathway Enrichment" )