注释包
https://bioconductor.org/packages/3.22/data/annotation/
https://bioconductor.org/packages/release/workflows/html/annotation.html
https://bioconductor.org/packages/release/bioc/html/AnnotationDbi.html
BSgenome
Code
# Load BSgenome, which provides available.genomes() and installed.genomes().
library(BSgenome)
# List the BSgenome data packages that can be installed. With
# splitNameParts = TRUE the result is a data frame whose columns split each
# package name into organism / provider / genome / masked parts.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
bs <- available.genomes(splitNameParts = TRUE)
# Keep only the rows whose organism part is "Hsapiens" (human).
human_genomes <- bs[bs$organism == "Hsapiens", ]
human_genomes
#> pkgname organism provider
#> 55 BSgenome.Hsapiens.1000genomes.hs37d5 Hsapiens 1000genomes
#> 56 BSgenome.Hsapiens.NCBI.GRCh38 Hsapiens NCBI
#> 57 BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0 Hsapiens NCBI
#> 58 BSgenome.Hsapiens.UCSC.hg17 Hsapiens UCSC
#> 59 BSgenome.Hsapiens.UCSC.hg17.masked Hsapiens UCSC
#> 60 BSgenome.Hsapiens.UCSC.hg18 Hsapiens UCSC
#> 61 BSgenome.Hsapiens.UCSC.hg18.masked Hsapiens UCSC
#> 62 BSgenome.Hsapiens.UCSC.hg19 Hsapiens UCSC
#> 63 BSgenome.Hsapiens.UCSC.hg19.masked Hsapiens UCSC
#> 64 BSgenome.Hsapiens.UCSC.hg38 Hsapiens UCSC
#> 65 BSgenome.Hsapiens.UCSC.hg38.dbSNP151.major Hsapiens UCSC
#> 66 BSgenome.Hsapiens.UCSC.hg38.dbSNP151.minor Hsapiens UCSC
#> 67 BSgenome.Hsapiens.UCSC.hg38.masked Hsapiens UCSC
#> 68 BSgenome.Hsapiens.UCSC.hs1 Hsapiens UCSC
#> genome masked
#> 55 hs37d5 FALSE
#> 56 GRCh38 FALSE
#> 57 T2T.CHM13v2.0 FALSE
#> 58 hg17 FALSE
#> 59 hg17 TRUE
#> 60 hg18 FALSE
#> 61 hg18 TRUE
#> 62 hg19 FALSE
#> 63 hg19 TRUE
#> 64 hg38 FALSE
#> 65 hg38.dbSNP151.major FALSE
#> 66 hg38.dbSNP151.minor FALSE
#> 67 hg38 TRUE
#> 68 hs1 FALSE
# Show which BSgenome data packages are already installed locally.
installed.genomes()
#> [1] "BSgenome.Hsapiens.UCSC.hg38"
从 UCSC hg38 中加载智人(Homo sapiens)的全基因组序列,并计算 14 号染色体的 GC 含量
Code
# Install the genome package once if it is missing:
# pak::pak("BSgenome.Hsapiens.UCSC.hg38")
library(BSgenome.Hsapiens.UCSC.hg38)

# Build a single range covering chr14 from position 1 to its full length,
# taking the length from the genome object's seqlengths().
chr14_range <- GRanges(
  seqnames = "chr14",
  ranges = IRanges(
    start = 1,
    end = seqlengths(Hsapiens)["chr14"]
  )
)

# Extract the chr14 sequence for that range.
chr14_dna <- getSeq(Hsapiens, chr14_range)

# letters = "GC" counts G and C together (the "G|C" column in the output);
# as.prob = TRUE reports the proportion instead of the raw count.
letterFrequency(chr14_dna, letters = "GC", as.prob = TRUE)
#> G|C
#> [1,] 0.3454924
OrgDb
org.Hs.eg.db
Code
# Attach the annotation interface and the human OrgDb annotation package.
library(AnnotationDbi)
library(org.Hs.eg.db)

# List the identifier types that can be used as keys for this database.
# Original author's note: equivalent to AnnotationDbi::columns(org.Hs.eg.db).
AnnotationDbi::keytypes(org.Hs.eg.db)
#> [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
#> [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
#> [11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
#> [16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
#> [21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
#> [26] "UNIPROT"
# Example keys; replace with your own Entrez gene IDs.
entrez_id <- c("1017", "1018", "1019")

# Map each Entrez ID to several annotation columns at once. select() returns
# a data frame with one row per key/value combination, so keys with many GO
# terms or aliases appear on many rows; head() shows only the first six.
# The call is namespace-qualified so a select() from another attached package
# cannot mask it.
# multiVals is an argument of mapIds(), not select(), so it is not passed here.
AnnotationDbi::select(
  org.Hs.eg.db,
  keys = entrez_id,
  columns = c("GENENAME", "ALIAS", "SYMBOL", "GO", "ONTOLOGY"),
  keytype = "ENTREZID"
) |>
  head()
#> ENTREZID GENENAME ALIAS SYMBOL GO EVIDENCE ONTOLOGY
#> 1 1017 cyclin dependent kinase 2 CDKN2 CDK2 GO:0000082 IBA BP
#> 2 1017 cyclin dependent kinase 2 CDKN2 CDK2 GO:0000082 IEA BP
#> 3 1017 cyclin dependent kinase 2 CDKN2 CDK2 GO:0000082 NAS BP
#> 4 1017 cyclin dependent kinase 2 CDKN2 CDK2 GO:0000086 NAS BP
#> 5 1017 cyclin dependent kinase 2 CDKN2 CDK2 GO:0000122 IEA BP
#> 6 1017 cyclin dependent kinase 2 CDKN2 CDK2 GO:0000166 IEA MF
# Unlike select(), mapIds() looks up exactly one column per call.
# multiVals = "CharacterList" keeps every alias for each key instead of
# collapsing to a single value.
mapIds(
  org.Hs.eg.db,
  keys = entrez_id,
  keytype = "ENTREZID",
  column = "ALIAS", # gene aliases
  multiVals = "CharacterList"
)
#> CharacterList of length 3
#> [["1017"]] CDKN2 p33(CDK2) CDK2
#> [["1018"]] CDK3
#> [["1019"]] CMM3 PSK-J3 CDK4