3  注释数据库

https://bioconductor.org/packages/release/workflows/html/annotation.html

3.1 BioMart

即Ensembl ID

查找智人(Homo sapiens)基因组

https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#introduction

Code
# One-time installation from Bioconductor (intentionally left commented out):
# BiocManager::install("biomaRt")
library(biomaRt)

# List the BioMart databases offered by Ensembl together with their versions.
listEnsembl()
biomart version
genes Ensembl Genes 113
mouse_strains Mouse strains 113
snps Ensembl Variation 113
regulation Ensembl Regulation 113
Code

# Connect to the Ensembl "genes" BioMart; no dataset is selected at this point.
ensembl <- useEnsembl(biomart = "genes")
ensembl
#> Object of class 'Mart':
#>   Using the ENSEMBL_MART_ENSEMBL BioMart database
#>   No dataset selected.

# Show every dataset available in this mart as a tibble.
as_tibble(listDatasets(ensembl))
dataset description version
abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1) ASM259213v1
acalliptera_gene_ensembl Eastern happy genes (fAstCal1.3) fAstCal1.3
acarolinensis_gene_ensembl Green anole genes (AnoCar2.0v2) AnoCar2.0v2
acchrysaetos_gene_ensembl Golden eagle genes (bAquChr1.2) bAquChr1.2
acitrinellus_gene_ensembl Midas cichlid genes (Midas_v5) Midas_v5
amelanoleuca_gene_ensembl Giant panda genes (ASM200744v2) ASM200744v2
amexicanus_gene_ensembl Mexican tetra genes (Astyanax_mexicanus-2.0) Astyanax_mexicanus-2.0
anancymaae_gene_ensembl Ma’s night monkey genes (Anan_2.0) Anan_2.0
aocellaris_gene_ensembl Clown anemonefish genes (ASM2253959v1) ASM2253959v1
apercula_gene_ensembl Orange clownfish genes (Nemo_v1) Nemo_v1
aplatyrhynchos_gene_ensembl Mallard genes (ASM874695v1) ASM874695v1
apolyacanthus_gene_ensembl Spiny chromis genes (ASM210954v1) ASM210954v1
applatyrhynchos_gene_ensembl Duck genes (CAU_duck1.0) CAU_duck1.0
atestudineus_gene_ensembl Climbing perch genes (fAnaTes1.3) fAnaTes1.3
bbbison_gene_ensembl American bison genes (Bison_UMD1.0) Bison_UMD1.0
bgrunniens_gene_ensembl Domestic yak genes (LU_Bosgru_v3.0) LU_Bosgru_v3.0
bihybrid_gene_ensembl Hybrid - Bos Indicus genes (UOA_Brahman_1) UOA_Brahman_1
bmusculus_gene_ensembl Blue whale genes (mBalMus1.v2) mBalMus1.v2
bmutus_gene_ensembl Wild yak genes (BosGru_v2.0) BosGru_v2.0
bsplendens_gene_ensembl Siamese fighting fish genes (fBetSpl5.2) fBetSpl5.2
btaurus_gene_ensembl Cow genes (ARS-UCD1.3) ARS-UCD1.3
cabingdonii_gene_ensembl Abingdon island giant tortoise genes (ASM359739v1) ASM359739v1
catys_gene_ensembl Sooty mangabey genes (Caty_1.0) Caty_1.0
cauratus_gene_ensembl Goldfish genes (ASM336829v1) ASM336829v1
cccarpio_gene_ensembl Common carp genes (Cypcar_WagV4.0) Cypcar_WagV4.0
cdromedarius_gene_ensembl Arabian camel genes (CamDro2) CamDro2
celegans_gene_ensembl Caenorhabditis elegans (Nematode, N2) genes (WBcel235) WBcel235
cgchok1gshd_gene_ensembl Chinese hamster CHOK1GS genes (CHOK1GS_HDv1) CHOK1GS_HDv1
cgobio_gene_ensembl Channel bull blenny genes (fCotGob3.1) fCotGob3.1
charengus_gene_ensembl Atlantic herring genes (Ch_v2.0.2v2) Ch_v2.0.2v2
chircus_gene_ensembl Goat genes (ARS1) ARS1
choffmanni_gene_ensembl Sloth genes (choHof1) choHof1
chyarkandensis_gene_ensembl Yarkand deer genes (CEY_v1) CEY_v1
cimitator_gene_ensembl Panamanian white-faced capuchin genes (Cebus_imitator-1.0) Cebus_imitator-1.0
cintestinalis_gene_ensembl C.intestinalis genes (KH) KH
cjacchus_gene_ensembl White-tufted-ear marmoset genes (mCalJac1.pat.X) mCalJac1.pat.X
cjaponica_gene_ensembl Japanese quail genes (Coturnix_japonica_2.0) Coturnix_japonica_2.0
clanigera_gene_ensembl Long-tailed chinchilla genes (ChiLan1.0) ChiLan1.0
cldingo_gene_ensembl Dingo genes (ASM325472v1) ASM325472v1
clfamiliaris_gene_ensembl Dog genes (ROS_Cfam_1.0) ROS_Cfam_1.0
clumpus_gene_ensembl Lumpfish genes (fCycLum1.pri) fCycLum1.pri
cmilii_gene_ensembl Elephant shark genes (Callorhinchus_milii-6.1.3) Callorhinchus_milii-6.1.3
cpbellii_gene_ensembl Painted turtle genes (Chrysemys_picta_bellii-3.0.3) Chrysemys_picta_bellii-3.0.3
cporcellus_gene_ensembl Guinea Pig genes (Cavpor3.0) Cavpor3.0
cporosus_gene_ensembl Australian saltwater crocodile genes (CroPor_comp1) CroPor_comp1
csabaeus_gene_ensembl Vervet-AGM genes (ChlSab1.1) ChlSab1.1
csavignyi_gene_ensembl C.savignyi genes (CSAV 2.0) CSAV 2.0
csemilaevis_gene_ensembl Tongue sole genes (Cse_v1.0) Cse_v1.0
csyrichta_gene_ensembl Tarsier genes (Tarsius_syrichta-2.0.1) Tarsius_syrichta-2.0.1
cvariegatus_gene_ensembl Sheepshead minnow genes (C_variegatus-1.0) C_variegatus-1.0
cwagneri_gene_ensembl Chacoan peccary genes (CatWag_v2_BIUU_UCD) CatWag_v2_BIUU_UCD
dclupeoides_gene_ensembl Denticle herring genes (fDenClu1.2) fDenClu1.2
dlabrax_gene_ensembl European seabass genes (dlabrax2021) dlabrax2021
dleucas_gene_ensembl Beluga whale genes (ASM228892v3) ASM228892v3
dmelanogaster_gene_ensembl Drosophila melanogaster (Fruit fly) genes (BDGP6.46) BDGP6.46
dnovemcinctus_gene_ensembl Armadillo genes (Dasnov3.0) Dasnov3.0
dordii_gene_ensembl Kangaroo rat genes (Dord_2.0) Dord_2.0
drerio_gene_ensembl Zebrafish genes (GRCz11) GRCz11
easinus_gene_ensembl Donkey genes (ASM1607732v2) ASM1607732v2
eburgeri_gene_ensembl Hagfish genes (Eburgeri_3.2) Eburgeri_3.2
ecaballus_gene_ensembl Horse genes (EquCab3.0) EquCab3.0
ecalabaricus_gene_ensembl Reedfish genes (fErpCal1.1) fErpCal1.1
eelectricus_gene_ensembl Electric eel genes (fEleEle1.pri) fEleEle1.pri
eeuropaeus_gene_ensembl Hedgehog genes (eriEur1) eriEur1
elucius_gene_ensembl Northern pike genes (fEsoLuc1.pri) fEsoLuc1.pri
etelfairi_gene_ensembl Lesser hedgehog tenrec genes (TENREC) TENREC
falbicollis_gene_ensembl Collared flycatcher genes (FicAlb1.5) FicAlb1.5
fcatus_gene_ensembl Cat genes (Felis_catus_9.0) Felis_catus_9.0
fheteroclitus_gene_ensembl Mummichog genes (Fundulus_heteroclitus-3.0.2) Fundulus_heteroclitus-3.0.2
gaculeatus_gene_ensembl Stickleback genes (GAculeatus_UGA_version5) GAculeatus_UGA_version5
gevgoodei_gene_ensembl Goodes thornscrub tortoise genes (rGopEvg1_v1.p) rGopEvg1_v1.p
gfortis_gene_ensembl Medium ground-finch genes (GeoFor_1.0) GeoFor_1.0
ggallus_gene_ensembl Chicken genes (bGalGal1.mat.broiler.GRCg7b) bGalGal1.mat.broiler.GRCg7b
ggorilla_gene_ensembl Gorilla genes (gorGor4) gorGor4
gmorhua_gene_ensembl Atlantic cod genes (gadMor3.0) gadMor3.0
hburtoni_gene_ensembl Burton’s mouthbrooder genes (AstBur1.0) AstBur1.0
hcomes_gene_ensembl Tiger tail seahorse genes (H_comes_QL1_v1) H_comes_QL1_v1
hgfemale_gene_ensembl Naked mole-rat female genes (Naked_mole-rat_maternal) Naked_mole-rat_maternal
hhucho_gene_ensembl Huchen genes (ASM331708v1) ASM331708v1
hsapiens_gene_ensembl Human genes (GRCh38.p14) GRCh38.p14
ipunctatus_gene_ensembl Channel catfish genes (ASM400665v3) ASM400665v3
itridecemlineatus_gene_ensembl Squirrel genes (SpeTri2.0) SpeTri2.0
jjaculus_gene_ensembl Lesser Egyptian jerboa genes (JacJac1.0) JacJac1.0
kmarmoratus_gene_ensembl Mangrove rivulus genes (ASM164957v1) ASM164957v1
lafricana_gene_ensembl Elephant genes (Loxafr3.0) Loxafr3.0
lbergylta_gene_ensembl Ballan wrasse genes (BallGen_V1) BallGen_V1
lcalcarifer_gene_ensembl Barramundi perch genes (ASB_HGAPassembly_v1) ASB_HGAPassembly_v1
lchalumnae_gene_ensembl Coelacanth genes (LatCha1) LatCha1
lcrocea_gene_ensembl Large yellow croaker genes (L_crocea_2.0) L_crocea_2.0
llaticaudata_gene_ensembl Blue-ringed sea krait genes (latLat_1.0) latLat_1.0
lleishanense_gene_ensembl Leishan spiny toad genes (ASM966780v1) ASM966780v1
loculatus_gene_ensembl Spotted gar genes (LepOcu1) LepOcu1
marmatus_gene_ensembl Zig-zag eel genes (fMasArm1.2) fMasArm1.2
mauratus_gene_ensembl Golden Hamster genes (MesAur1.0) MesAur1.0
mcaroli_gene_ensembl Ryukyu mouse genes (CAROLI_EIJ_v1.1) CAROLI_EIJ_v1.1
mdomestica_gene_ensembl Opossum genes (ASM229v1) ASM229v1
mfascicularis_gene_ensembl Crab-eating macaque genes (Macaca_fascicularis_6.0) Macaca_fascicularis_6.0
mgallopavo_gene_ensembl Turkey genes (Turkey_5.1) Turkey_5.1
mleucophaeus_gene_ensembl Drill genes (Mleu.le_1.0) Mleu.le_1.0
mlucifugus_gene_ensembl Microbat genes (Myoluc2.0) Myoluc2.0
mmmarmota_gene_ensembl Alpine marmot genes (marMar2.1) marMar2.1
mmonoceros_gene_ensembl Narwhal genes (NGI_Narwhal_1) NGI_Narwhal_1
mmoschiferus_gene_ensembl Siberian musk deer genes (MosMos_v2_BIUU_UCD) MosMos_v2_BIUU_UCD
mmulatta_gene_ensembl Macaque genes (Mmul_10) Mmul_10
mmurdjan_gene_ensembl Pinecone soldierfish genes (fMyrMur1.1) fMyrMur1.1
mmurinus_gene_ensembl Mouse Lemur genes (Mmur_3.0) Mmur_3.0
mmusculus_gene_ensembl Mouse genes (GRCm39) GRCm39
mnemestrina_gene_ensembl Pig-tailed macaque genes (Mnem_1.0) Mnem_1.0
mochrogaster_gene_ensembl Prairie vole genes (MicOch1.0) MicOch1.0
mpahari_gene_ensembl Shrew mouse genes (PAHARI_EIJ_v1.1) PAHARI_EIJ_v1.1
mpfuro_gene_ensembl Ferret genes (MusPutFur1.0) MusPutFur1.0
mspicilegus_gene_ensembl Steppe mouse genes (MUSP714) MUSP714
mspretus_gene_ensembl Algerian mouse genes (SPRET_EiJ_v1) SPRET_EiJ_v1
mzebra_gene_ensembl Zebra mbuna genes (M_zebra_UMD2a) M_zebra_UMD2a
nbrichardi_gene_ensembl Lyretail cichlid genes (NeoBri1.0) NeoBri1.0
neugenii_gene_ensembl Wallaby genes (Meug_1.0) Meug_1.0
nfurzeri_gene_ensembl Turquoise killifish genes (Nfu_20140520) Nfu_20140520
ngalili_gene_ensembl Upper Galilee mountains blind mole rat genes (S.galili_v1.0) S.galili_v1.0
nleucogenys_gene_ensembl Gibbon genes (Nleu_3.0) Nleu_3.0
nnaja_gene_ensembl Indian cobra genes (Nana_v5) Nana_v5
nscutatus_gene_ensembl Mainland tiger snake genes (TS10Xv2-PRI) TS10Xv2-PRI
nvison_gene_ensembl American mink genes (NNQGG.v01) NNQGG.v01
odegus_gene_ensembl Degu genes (OctDeg1.0) OctDeg1.0
okisutch_gene_ensembl Coho salmon genes (Okis_V2) Okis_V2
oprinceps_gene_ensembl Pika genes (OchPri2.0-Ens) OchPri2.0-Ens
Code

# Find the human (Homo sapiens) dataset by matching "hsapiens"
searchDatasets(mart = ensembl, pattern = "hsapiens")
dataset description version
80 hsapiens_gene_ensembl Human genes (GRCh38.p14) GRCh38.p14
Code

# Select the human gene dataset on the existing mart connection
dataset <- useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
dataset
#> Object of class 'Mart':
#>   Using the ENSEMBL_MART_ENSEMBL BioMart database
#>   Using the hsapiens_gene_ensembl dataset

GRCh38(Genome Reference Consortium Human Build 38)

Code
# Human gene BioMart: connect and select the dataset in a single call.
# "genes" is the mart name reported by listEnsembl() and used by the other
# useEnsembl() calls in this file; the Asia mirror is requested explicitly.
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl",
  mirror = "asia"
)

3.1.1 映射

Code
listMarts()
biomart version
ENSEMBL_MART_ENSEMBL Ensembl Genes 113
ENSEMBL_MART_MOUSE Mouse strains 113
ENSEMBL_MART_SNP Ensembl Variation 113
ENSEMBL_MART_FUNCGEN Ensembl Regulation 113
Code
# Connect to the Ensembl gene mart with the human dataset selected, then
# browse every attribute it exposes in an interactive table.
mart <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl"
)
DT::datatable(listAttributes(mart))
Code




# Entrez gene IDs to look up (kept as character strings).
gene_ids <- c("1017", "1018", "1019")

# For each Entrez ID, retrieve the Ensembl gene ID and the external gene name.
getBM(
  attributes = c("entrezgene_id", "ensembl_gene_id", "external_gene_name"),
  filters = "entrezgene_id",
  values = gene_ids,
  mart = mart
)
entrezgene_id ensembl_gene_id external_gene_name
1017 ENSG00000123374 CDK2
1018 ENSG00000250506 CDK3
1019 ENSG00000135446 CDK4

3.1.2 使用 Ensembl 的存档版本

Code
# Archived Ensembl releases that can still be queried.
listEnsemblArchives()

# Marts available in release 112.
listEnsembl(version = 112)

# Connect to the human gene dataset of archived release 112.
# No `mirror` is passed: biomaRt cannot combine `mirror` with `version`; it
# ignores the mirror with a warning and connects to the main Ensembl site.
ensembl_112 <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl",
  version = 112
)

3.1.3 使用 Ensembl 基因组

Code
listEnsemblGenomes()
biomart version
protists_mart Ensembl Protists Genes 60
protists_variations Ensembl Protists Variations 60
fungi_mart Ensembl Fungi Genes 60
fungi_variations Ensembl Fungi Variations 60
metazoa_mart Ensembl Metazoa Genes 60
metazoa_variations Ensembl Metazoa Variations 60
plants_mart Ensembl Plants Genes 60
plants_variations Ensembl Plants Variations 60
Code
# Connect to the Ensembl Plants gene mart (no dataset selected yet).
ensembl_plants <- useEnsemblGenomes(biomart = "plants_mart")

# Look for Arabidopsis datasets in that mart.
searchDatasets(mart = ensembl_plants, pattern = "Arabidopsis")
dataset description version
4 ahalleri_eg_gene Arabidopsis halleri genes (Ahal2.2) Ahal2.2
6 alyrata_eg_gene Arabidopsis lyrata genes (v.1.0) v.1.0
11 athaliana_eg_gene Arabidopsis thaliana genes (TAIR10) TAIR10
Code
# Connect directly to the Arabidopsis thaliana dataset of Ensembl Plants.
ensembl_arabidopsis <- useEnsemblGenomes(
  biomart = "plants_mart",
  dataset = "athaliana_eg_gene"
)

3.2 AnnotationHub

基于 AnnotationHub 构建所需的 OrgDb,用于 clusterProfiler 富集分析

Code
# Attach AnnotationHub without printing its start-up messages.
suppressMessages(library(AnnotationHub))
# Resolve the setdiff() name clash in favour of BiocGenerics.
# NOTE(review): conflicts_prefer() presumably comes from the conflicted
# package, which is not attached in the visible code -- confirm.
conflicts_prefer(BiocGenerics::setdiff)
# Optional proxy configuration (disabled):
# proxy <- httr::use_proxy(Sys.getenv('http_proxy'))
# httr::set_config(proxy)
# AnnotationHub::setAnnotationHubOption("PROXY", proxy)

# Create the hub object with an explicit cache directory.
# NOTE(review): "D:/AnnotationHub" is a hard-coded Windows drive path; it
# needs adjusting on machines without a D: drive.
ah <- AnnotationHub(cache = "D:/AnnotationHub")
ah
#> AnnotationHub with 72098 records
#> # snapshotDate(): 2024-10-28
#> # $dataprovider: Ensembl, BroadInstitute, UCSC, ftp://ftp.ncbi.nlm.nih.gov/g...
#> # $species: Homo sapiens, Mus musculus, Drosophila melanogaster, Rattus norv...
#> # $rdataclass: GRanges, TwoBitFile, BigWigFile, EnsDb, Rle, OrgDb, SQLiteFil...
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["AH5012"]]' 
#> 
#>              title                                             
#>   AH5012   | Chromosome Band                                   
#>   AH5013   | STS Markers                                       
#>   AH5014   | FISH Clones                                       
#>   AH5015   | Recomb Rate                                       
#>   AH5016   | ENCODE Pilot                                      
#>   ...        ...                                               
#>   AH119504 | Ensembl 113 EnsDb for Xiphophorus maculatus       
#>   AH119505 | Ensembl 113 EnsDb for Xenopus tropicalis          
#>   AH119506 | Ensembl 113 EnsDb for Zonotrichia albicollis      
#>   AH119507 | Ensembl 113 EnsDb for Zalophus californianus      
#>   AH119508 | Ensembl 113 EnsDb for Zosterops lateralis melanops
Code
# Snapshot date of the hub data currently in use
snapshotDate(ah)
#> [1] "2024-10-28"

# Cache directory
hubCache(ah)
#> [1] "D:/AnnotationHub"

# Hub server URL (https://annotationhub.bioconductor.org)
hubUrl(ah)
#> [1] "https://annotationhub.bioconductor.org"
# Whether this hub object is flagged as a local hub
isLocalHub(ah)
#> [1] FALSE
# Connection to the hub's SQLite metadata file inside the cache directory
dbconn(ah)
#> <SQLiteConnection>
#>   Path: D:\AnnotationHub\39283aec6c3b_annotationhub.sqlite3
#>   Extensions: TRUE
Code
# Total number of records in the hub
length(ah)
#> [1] 72098
# Distinct data providers represented in the hub
unique(ah$dataprovider)
#>  [1] "UCSC"                                                                                                      
#>  [2] "Ensembl"                                                                                                   
#>  [3] "RefNet"                                                                                                    
#>  [4] "Inparanoid8"                                                                                               
#>  [5] "NHLBI"                                                                                                     
#>  [6] "ChEA"                                                                                                      
#>  [7] "Pazar"                                                                                                     
#>  [8] "NIH Pathway Interaction Database"                                                                          
#>  [9] "Haemcode"                                                                                                  
#> [10] "BroadInstitute"                                                                                            
#> [11] "PRIDE"                                                                                                     
#> [12] "Gencode"                                                                                                   
#> [13] "CRIBI"                                                                                                     
#> [14] "Genoscope"                                                                                                 
#> [15] "MISO, VAST-TOOLS, UCSC"                                                                                    
#> [16] "Stanford"                                                                                                  
#> [17] "dbSNP"                                                                                                     
#> [18] "BioMart"                                                                                                   
#> [19] "GeneOntology"                                                                                              
#> [20] "KEGG"                                                                                                      
#> [21] "URGI"                                                                                                      
#> [22] "EMBL-EBI"                                                                                                  
#> [23] "MicrosporidiaDB"                                                                                           
#> [24] "FungiDB"                                                                                                   
#> [25] "TriTrypDB"                                                                                                 
#> [26] "ToxoDB"                                                                                                    
#> [27] "AmoebaDB"                                                                                                  
#> [28] "PlasmoDB"                                                                                                  
#> [29] "PiroplasmaDB"                                                                                              
#> [30] "CryptoDB"                                                                                                  
#> [31] "TrichDB"                                                                                                   
#> [32] "GiardiaDB"                                                                                                 
#> [33] "The Gene Ontology Consortium"                                                                              
#> [34] "ENCODE Project"                                                                                            
#> [35] "SchistoDB"                                                                                                 
#> [36] "NCBI/UniProt"                                                                                              
#> [37] "GENCODE"                                                                                                   
#> [38] "http://www.pantherdb.org"                                                                                  
#> [39] "RMBase v2.0"                                                                                               
#> [40] "snoRNAdb"                                                                                                  
#> [41] "tRNAdb"                                                                                                    
#> [42] "NCBI"                                                                                                      
#> [43] "DrugAge, DrugBank, Broad Institute"                                                                        
#> [44] "DrugAge"                                                                                                   
#> [45] "DrugBank"                                                                                                  
#> [46] "Broad Institute"                                                                                           
#> [47] "HMDB, EMBL-EBI, EPA"                                                                                       
#> [48] "STRING"                                                                                                    
#> [49] "OMA"                                                                                                       
#> [50] "OrthoDB"                                                                                                   
#> [51] "PathBank"                                                                                                  
#> [52] "EBI/EMBL"                                                                                                  
#> [53] "NCBI,DBCLS"                                                                                                
#> [54] "FANTOM5,DLRP,IUPHAR,HPRD,STRING,SWISSPROT,TREMBL,ENSEMBL,CELLPHONEDB,BADERLAB,SINGLECELLSIGNALR,HOMOLOGENE"
#> [55] "WikiPathways"                                                                                              
#> [56] "VAST-TOOLS"                                                                                                
#> [57] "pyGenomeTracks "                                                                                           
#> [58] "NA"                                                                                                        
#> [59] "UoE"                                                                                                       
#> [60] "TargetScan,miRTarBase,USCS,ENSEMBL"                                                                        
#> [61] "TargetScan"                                                                                                
#> [62] "QuickGO"                                                                                                   
#> [63] "CIS-BP"                                                                                                    
#> [64] "CTCFBSDB 2.0"                                                                                              
#> [65] "HOCOMOCO v11"                                                                                              
#> [66] "JASPAR 2022"                                                                                               
#> [67] "Jolma 2013"                                                                                                
#> [68] "SwissRegulon"                                                                                              
#> [69] "ENCODE SCREEN v3"                                                                                          
#> [70] "MassBank"                                                                                                  
#> [71] "excluderanges"                                                                                             
#> [72] "ENCODE"                                                                                                    
#> [73] "GitHub"                                                                                                    
#> [74] "Stanford.edu"                                                                                              
#> [75] "Publication"                                                                                               
#> [76] "CHM13"                                                                                                     
#> [77] "UCSChub"                                                                                                   
#> [78] "Google DeepMind"                                                                                           
#> [79] "UWashington"                                                                                               
#> [80] "Bioconductor"                                                                                              
#> [81] "ENCODE cCREs"                                                                                              
#> [82] "The Human Phenotype Ontology"                                                                              
#> [83] "MGI"                                                                                                       
#> [84] "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/"
Code
# Distinct species in the hub, shown as an interactive one-column table
unique(ah$species) %>% as.data.frame() %>% DT::datatable()
Code
# Distinct R classes of the objects the hub records load as
unique(ah$rdataclass) 
#>  [1] "GRanges"                           "data.frame"                       
#>  [3] "Inparanoid8Db"                     "TwoBitFile"                       
#>  [5] "ChainFile"                         "SQLiteConnection"                 
#>  [7] "biopax"                            "BigWigFile"                       
#>  [9] "AAStringSet"                       "MSnSet"                           
#> [11] "mzRident"                          "list"                             
#> [13] "TxDb"                              "Rle"                              
#> [15] "EnsDb"                             "VcfFile"                          
#> [17] "igraph"                            "data.frame, DNAStringSet, GRanges"
#> [19] "sqlite"                            "data.table"                       
#> [21] "character"                         "SQLite"                           
#> [23] "SQLiteFile"                        "Tibble"                           
#> [25] "Rda"                               "FaFile"                           
#> [27] "String"                            "CompDb"                           
#> [29] "OrgDb"

3.2.1 子集

Code
# Keep only the records whose data provider is exactly "UCSC".
ah[ah$dataprovider == "UCSC"]
#> AnnotationHub with 11204 records
#> # snapshotDate(): 2024-10-28
#> # $dataprovider: UCSC
#> # $species: Homo sapiens, Mus musculus, Drosophila melanogaster, Bos taurus,...
#> # $rdataclass: GRanges, Rle, ChainFile, TxDb, TwoBitFile
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["AH5012"]]' 
#> 
#>              title                                    
#>   AH5012   | Chromosome Band                          
#>   AH5013   | STS Markers                              
#>   AH5014   | FISH Clones                              
#>   AH5015   | Recomb Rate                              
#>   AH5016   | ENCODE Pilot                             
#>   ...        ...                                      
#>   AH114096 | TxDb.Mmusculus.UCSC.mm39.refGene.sqlite  
#>   AH116719 | TxDb.Hsapiens.UCSC.hg38.refGene.sqlite   
#>   AH116720 | TxDb.Mmusculus.UCSC.mm39.refGene.sqlite  
#>   AH117076 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite 
#>   AH117077 | TxDb.Mmusculus.UCSC.mm39.knownGene.sqlite

# Keep only records whose species is exactly "homo sapiens" (string `==` is
# case-sensitive). NOTE(review): this lowercase spelling matches just 71
# records; the hub also lists "Homo sapiens" as a separate species value,
# which is presumably where most human resources sit -- confirm which
# spelling is intended.
subset(ah, species == "homo sapiens")
#> AnnotationHub with 71 records
#> # snapshotDate(): 2024-10-28
#> # $dataprovider: Ensembl
#> # $species: homo sapiens
#> # $rdataclass: GRanges, TwoBitFile
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["AH83649"]]' 
#> 
#>              title                                           
#>   AH83649  | Homo_sapiens.GRCh38.101.abinitio.gtf            
#>   AH83650  | Homo_sapiens.GRCh38.101.chr.gtf                 
#>   AH83651  | Homo_sapiens.GRCh38.101.chr_patch_hapl_scaff.gtf
#>   AH83652  | Homo_sapiens.GRCh38.101.gtf                     
#>   AH84627  | Homo_sapiens.GRCh38.cdna.all.2bit               
#>   ...        ...                                             
#>   AH110100 | Homo_sapiens.GRCh38.108.gtf                     
#>   AH110866 | Homo_sapiens.GRCh38.109.abinitio.gtf            
#>   AH110867 | Homo_sapiens.GRCh38.109.chr.gtf                 
#>   AH110868 | Homo_sapiens.GRCh38.109.chr_patch_hapl_scaff.gtf
#>   AH110869 | Homo_sapiens.GRCh38.109.gtf

3.2.2 查询

Code
# Search the hub metadata for records matching "Ensembl".
query(ah, "Ensembl")
#> AnnotationHub with 37283 records
#> # snapshotDate(): 2024-10-28
#> # $dataprovider: Ensembl, FANTOM5,DLRP,IUPHAR,HPRD,STRING,SWISSPROT,TREMBL,E...
#> # $species: Mus musculus, Sus scrofa, Homo sapiens, Rattus norvegicus, Danio...
#> # $rdataclass: TwoBitFile, GRanges, EnsDb, SQLiteFile, data.frame, OrgDb, li...
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["AH5046"]]' 
#> 
#>              title                                             
#>   AH5046   | Ensembl Genes                                     
#>   AH5160   | Ensembl Genes                                     
#>   AH5311   | Ensembl Genes                                     
#>   AH5434   | Ensembl Genes                                     
#>   AH5435   | Ensembl EST Genes                                 
#>   ...        ...                                               
#>   AH119504 | Ensembl 113 EnsDb for Xiphophorus maculatus       
#>   AH119505 | Ensembl 113 EnsDb for Xenopus tropicalis          
#>   AH119506 | Ensembl 113 EnsDb for Zonotrichia albicollis      
#>   AH119507 | Ensembl 113 EnsDb for Zalophus californianus      
#>   AH119508 | Ensembl 113 EnsDb for Zosterops lateralis melanops
# Search the hub metadata for records matching "Gencode".
query(ah, "Gencode")
#> AnnotationHub with 5426 records
#> # snapshotDate(): 2024-10-28
#> # $dataprovider: UCSC, GENCODE, Gencode, ENCODE, BroadInstitute
#> # $species: Homo sapiens, Mus musculus, Rattus norvegicus, Pan troglodytes, ...
#> # $rdataclass: GRanges, list, TxDb, SQLiteConnection
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["AH5073"]]' 
#> 
#>              title                                                
#>   AH5073   | Affy RNA Loc                                         
#>   AH5079   | CSHL Small RNA-seq                                   
#>   AH5080   | GIS RNA PET                                          
#>   AH5084   | RIKEN CAGE Loc                                       
#>   AH5088   | SUNY SwitchGear                                      
#>   ...        ...                                                  
#>   AH107317 | hg19.Crawford.wgEncodeDukeMapabilityRegionsExcludable
#>   AH116727 | ENCODE_dELS_regions                                  
#>   AH116728 | ENCODE_pELS_regions                                  
#>   AH116729 | ENCODE_PLS_regions                                   
#>   AH116730 | GENCODE basic gene annotation v40
Code
# Web interface
BiocHubsShiny::BiocHubsShiny()

3.3 BSgenome

Code
# Attach BSgenome.
library(BSgenome)
# Resolve the setdiff() name clash in favour of Biostrings.
# NOTE(review): conflicts_prefer() presumably comes from the conflicted
# package, which is not attached in the visible code -- confirm.
conflicts_prefer(Biostrings::setdiff)
# List the names of the BSgenome data packages that are available.
available.genomes()
#>   [1] "BSgenome.Alyrata.JGI.v1"                           
#>   [2] "BSgenome.Amellifera.BeeBase.assembly4"             
#>   [3] "BSgenome.Amellifera.NCBI.AmelHAv3.1"               
#>   [4] "BSgenome.Amellifera.UCSC.apiMel2"                  
#>   [5] "BSgenome.Amellifera.UCSC.apiMel2.masked"           
#>   [6] "BSgenome.Aofficinalis.NCBI.V1"                     
#>   [7] "BSgenome.Athaliana.TAIR.04232008"                  
#>   [8] "BSgenome.Athaliana.TAIR.TAIR9"                     
#>   [9] "BSgenome.Btaurus.UCSC.bosTau3"                     
#>  [10] "BSgenome.Btaurus.UCSC.bosTau3.masked"              
#>  [11] "BSgenome.Btaurus.UCSC.bosTau4"                     
#>  [12] "BSgenome.Btaurus.UCSC.bosTau4.masked"              
#>  [13] "BSgenome.Btaurus.UCSC.bosTau6"                     
#>  [14] "BSgenome.Btaurus.UCSC.bosTau6.masked"              
#>  [15] "BSgenome.Btaurus.UCSC.bosTau8"                     
#>  [16] "BSgenome.Btaurus.UCSC.bosTau9"                     
#>  [17] "BSgenome.Btaurus.UCSC.bosTau9.masked"              
#>  [18] "BSgenome.Carietinum.NCBI.v1"                       
#>  [19] "BSgenome.Celegans.UCSC.ce10"                       
#>  [20] "BSgenome.Celegans.UCSC.ce11"                       
#>  [21] "BSgenome.Celegans.UCSC.ce2"                        
#>  [22] "BSgenome.Celegans.UCSC.ce6"                        
#>  [23] "BSgenome.Cfamiliaris.UCSC.canFam2"                 
#>  [24] "BSgenome.Cfamiliaris.UCSC.canFam2.masked"          
#>  [25] "BSgenome.Cfamiliaris.UCSC.canFam3"                 
#>  [26] "BSgenome.Cfamiliaris.UCSC.canFam3.masked"          
#>  [27] "BSgenome.Cjacchus.UCSC.calJac3"                    
#>  [28] "BSgenome.Cjacchus.UCSC.calJac4"                    
#>  [29] "BSgenome.CneoformansVarGrubiiKN99.NCBI.ASM221672v1"
#>  [30] "BSgenome.Creinhardtii.JGI.v5.6"                    
#>  [31] "BSgenome.Dmelanogaster.UCSC.dm2"                   
#>  [32] "BSgenome.Dmelanogaster.UCSC.dm2.masked"            
#>  [33] "BSgenome.Dmelanogaster.UCSC.dm3"                   
#>  [34] "BSgenome.Dmelanogaster.UCSC.dm3.masked"            
#>  [35] "BSgenome.Dmelanogaster.UCSC.dm6"                   
#>  [36] "BSgenome.Drerio.UCSC.danRer10"                     
#>  [37] "BSgenome.Drerio.UCSC.danRer11"                     
#>  [38] "BSgenome.Drerio.UCSC.danRer5"                      
#>  [39] "BSgenome.Drerio.UCSC.danRer5.masked"               
#>  [40] "BSgenome.Drerio.UCSC.danRer6"                      
#>  [41] "BSgenome.Drerio.UCSC.danRer6.masked"               
#>  [42] "BSgenome.Drerio.UCSC.danRer7"                      
#>  [43] "BSgenome.Drerio.UCSC.danRer7.masked"               
#>  [44] "BSgenome.Dvirilis.Ensembl.dvircaf1"                
#>  [45] "BSgenome.Ecoli.NCBI.20080805"                      
#>  [46] "BSgenome.Gaculeatus.UCSC.gasAcu1"                  
#>  [47] "BSgenome.Gaculeatus.UCSC.gasAcu1.masked"           
#>  [48] "BSgenome.Ggallus.UCSC.galGal3"                     
#>  [49] "BSgenome.Ggallus.UCSC.galGal3.masked"              
#>  [50] "BSgenome.Ggallus.UCSC.galGal4"                     
#>  [51] "BSgenome.Ggallus.UCSC.galGal4.masked"              
#>  [52] "BSgenome.Ggallus.UCSC.galGal5"                     
#>  [53] "BSgenome.Ggallus.UCSC.galGal6"                     
#>  [54] "BSgenome.Gmax.NCBI.Gmv40"                          
#>  [55] "BSgenome.Hsapiens.1000genomes.hs37d5"              
#>  [56] "BSgenome.Hsapiens.NCBI.GRCh38"                     
#>  [57] "BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0"              
#>  [58] "BSgenome.Hsapiens.UCSC.hg17"                       
#>  [59] "BSgenome.Hsapiens.UCSC.hg17.masked"                
#>  [60] "BSgenome.Hsapiens.UCSC.hg18"                       
#>  [61] "BSgenome.Hsapiens.UCSC.hg18.masked"                
#>  [62] "BSgenome.Hsapiens.UCSC.hg19"                       
#>  [63] "BSgenome.Hsapiens.UCSC.hg19.masked"                
#>  [64] "BSgenome.Hsapiens.UCSC.hg38"                       
#>  [65] "BSgenome.Hsapiens.UCSC.hg38.dbSNP151.major"        
#>  [66] "BSgenome.Hsapiens.UCSC.hg38.dbSNP151.minor"        
#>  [67] "BSgenome.Hsapiens.UCSC.hg38.masked"                
#>  [68] "BSgenome.Hsapiens.UCSC.hs1"                        
#>  [69] "BSgenome.Mdomestica.UCSC.monDom5"                  
#>  [70] "BSgenome.Mfascicularis.NCBI.5.0"                   
#>  [71] "BSgenome.Mfascicularis.NCBI.6.0"                   
#>  [72] "BSgenome.Mfuro.UCSC.musFur1"                       
#>  [73] "BSgenome.Mmulatta.UCSC.rheMac10"                   
#>  [74] "BSgenome.Mmulatta.UCSC.rheMac2"                    
#>  [75] "BSgenome.Mmulatta.UCSC.rheMac2.masked"             
#>  [76] "BSgenome.Mmulatta.UCSC.rheMac3"                    
#>  [77] "BSgenome.Mmulatta.UCSC.rheMac3.masked"             
#>  [78] "BSgenome.Mmulatta.UCSC.rheMac8"                    
#>  [79] "BSgenome.Mmusculus.UCSC.mm10"                      
#>  [80] "BSgenome.Mmusculus.UCSC.mm10.masked"               
#>  [81] "BSgenome.Mmusculus.UCSC.mm39"                      
#>  [82] "BSgenome.Mmusculus.UCSC.mm8"                       
#>  [83] "BSgenome.Mmusculus.UCSC.mm8.masked"                
#>  [84] "BSgenome.Mmusculus.UCSC.mm9"                       
#>  [85] "BSgenome.Mmusculus.UCSC.mm9.masked"                
#>  [86] "BSgenome.Osativa.MSU.MSU7"                         
#>  [87] "BSgenome.Ppaniscus.UCSC.panPan1"                   
#>  [88] "BSgenome.Ppaniscus.UCSC.panPan2"                   
#>  [89] "BSgenome.Ptroglodytes.UCSC.panTro2"                
#>  [90] "BSgenome.Ptroglodytes.UCSC.panTro2.masked"         
#>  [91] "BSgenome.Ptroglodytes.UCSC.panTro3"                
#>  [92] "BSgenome.Ptroglodytes.UCSC.panTro3.masked"         
#>  [93] "BSgenome.Ptroglodytes.UCSC.panTro5"                
#>  [94] "BSgenome.Ptroglodytes.UCSC.panTro6"                
#>  [95] "BSgenome.Rnorvegicus.UCSC.rn4"                     
#>  [96] "BSgenome.Rnorvegicus.UCSC.rn4.masked"              
#>  [97] "BSgenome.Rnorvegicus.UCSC.rn5"                     
#>  [98] "BSgenome.Rnorvegicus.UCSC.rn5.masked"              
#>  [99] "BSgenome.Rnorvegicus.UCSC.rn6"                     
#> [100] "BSgenome.Rnorvegicus.UCSC.rn7"                     
#> [101] "BSgenome.Scerevisiae.UCSC.sacCer1"                 
#> [102] "BSgenome.Scerevisiae.UCSC.sacCer2"                 
#> [103] "BSgenome.Scerevisiae.UCSC.sacCer3"                 
#> [104] "BSgenome.Sscrofa.UCSC.susScr11"                    
#> [105] "BSgenome.Sscrofa.UCSC.susScr3"                     
#> [106] "BSgenome.Sscrofa.UCSC.susScr3.masked"              
#> [107] "BSgenome.Tgondii.ToxoDB.7.0"                       
#> [108] "BSgenome.Tguttata.UCSC.taeGut1"                    
#> [109] "BSgenome.Tguttata.UCSC.taeGut1.masked"             
#> [110] "BSgenome.Tguttata.UCSC.taeGut2"                    
#> [111] "BSgenome.Vvinifera.URGI.IGGP12Xv0"                 
#> [112] "BSgenome.Vvinifera.URGI.IGGP12Xv2"                 
#> [113] "BSgenome.Vvinifera.URGI.IGGP8X"

从UCSC hg38中加载智人的全基因组序列,计算14号染色体GC含量

BSgenome.Hsapiens.UCSC.hg38

Code
library(BSgenome.Hsapiens.UCSC.hg38)

# Build a range covering all of chr14: position 1 up to the chromosome length.
chr14_end <- seqlengths(Hsapiens)["chr14"]
chr14_range <- GRanges(
  seqnames = "chr14",
  ranges = IRanges(start = 1, end = chr14_end)
)

# Extract the chr14 sequence and report the combined G|C fraction.
# NOTE(review): as.prob = TRUE appears to divide by the full sequence length,
# so unsequenced N bases would count in the denominator and lower the
# reported GC fraction -- confirm against the Biostrings documentation if
# GC content over called bases (A/C/G/T only) is what is wanted.
chr14_dna <- getSeq(Hsapiens, chr14_range)
letterFrequency(chr14_dna, letters = "GC", as.prob = TRUE)
#>            G|C
#> [1,] 0.3454924

3.4 AnnotationDbi

https://bioconductor.org/packages/release/bioc/html/AnnotationDbi.html

Package Name Contents
org.Hs.eg.db Genome-wide annotation for Human; useful for mapping between gene IDs, Names, Symbols, GO and KEGG identifiers, etc.
org.Mm.eg.db Genome wide annotation for Mouse
GO.db A set of annotation maps describing the entire Gene Ontology
HPO.db A set of annotation maps describing the entire Human Phenotype Ontology
MPO.db A set of annotation maps describing the Mouse Phenotype Ontology

3.4.1 OrgDb

org.Hs.eg.db

Code
# Load the annotation query interface and the human OrgDb annotation package.
library(AnnotationDbi)
library(org.Hs.eg.db)


# List the identifier types that can be used as keys when querying org.Hs.eg.db.
keytypes(org.Hs.eg.db) 
#>  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
#>  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
#> [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MAP"         
#> [16] "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"        
#> [21] "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
#> [26] "UNIPROT"
# Equivalent to keytypes() here: AnnotationDbi::columns(org.Hs.eg.db)

# Example Entrez Gene IDs -- replace with your own.
gene_ids <- c("1017", "1018", "1019")

# Look up gene name, symbol and GO annotation for each Entrez ID and show
# only the first rows of the result (one row per ID/GO-term combination).
head(
  AnnotationDbi::select(
    org.Hs.eg.db,
    keys = gene_ids,
    columns = c("GENENAME", "SYMBOL", "GO", "ONTOLOGY"),
    keytype = "ENTREZID"
  )
)
ENTREZID GENENAME SYMBOL GO EVIDENCE ONTOLOGY
1017 cyclin dependent kinase 2 CDK2 GO:0000082 IBA BP
1017 cyclin dependent kinase 2 CDK2 GO:0000082 NAS BP
1017 cyclin dependent kinase 2 CDK2 GO:0000086 NAS BP
1017 cyclin dependent kinase 2 CDK2 GO:0000122 IEA BP
1017 cyclin dependent kinase 2 CDK2 GO:0000287 IEA MF
1017 cyclin dependent kinase 2 CDK2 GO:0000307 IBA CC
Code


# mapIds() maps the keys to exactly one column (unlike select()).
# The magrittr pipe is kept on purpose: as.data.frame() names the resulting
# column after its argument expression, which is `.` when piped with %>%.
mapIds(
  org.Hs.eg.db,
  keys = gene_ids,
  column = "GENENAME",
  keytype = "ENTREZID"
) %>%
  as.data.frame()
.
1017 cyclin dependent kinase 2
1018 cyclin dependent kinase 3
1019 cyclin dependent kinase 4

3.5 ExperimentHub