2  生物数据结构

https://bioconductor.org/packages/release/workflows/html/sequencing.html

2.1 S4 类

2.1.1 DataFrame

Code
library(S4Vectors)
conflicts_prefer(S4Vectors::setdiff)
# 创建一个 DataFrame 对象
df <- DataFrame(a = 1:3, b = letters[1:3])
df
#> DataFrame with 3 rows and 2 columns
#>           a           b
#>   <integer> <character>
#> 1         1           a
#> 2         2           b
#> 3         3           c

2.1.2 Rle

Run-Length Encoding (RLE) 是一种简单且常用的数据压缩方法,特别适用于存储和处理重复数据序列。class?Rle

Code
# 创建一个 Rle 对象

rle_obj <- Rle(c(rep("A", 5), rep("B", 3), rep("C", 2), "D", rep("A", 2)))

print(rle_obj)
#> character-Rle of length 13 with 5 runs
#>   Lengths:   5   3   2   1   2
#>   Values : "A" "B" "C" "D" "A"

runLength(rle_obj)
#> [1] 5 3 2 1 2

在 RLE 中,连续的重复元素(称为“运行”)被压缩成一个值和一个计数对。这个值表示重复的元素,计数表示它们的数量。例如,序列 [AAAAABBBCCDAA] 可以被编码为 (A5, B3, C2, D1, A2)

2.2 Biostrings

Biological strings

DNAstringSetRNAStringSetAAStringSetBStringSet

2.2.1 DNA

Code
# BiocManager::install("Biostrings")
library(Biostrings) %>% suppressMessages()

Biostrings::DNA_ALPHABET
#>  [1] "A" "C" "G" "T" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+" "."
IUPAC_CODE_MAP
#>      A      C      G      T      M      R      W      S      Y      K      V 
#>    "A"    "C"    "G"    "T"   "AC"   "AG"   "AT"   "CG"   "CT"   "GT"  "ACG" 
#>      H      D      B      N 
#>  "ACT"  "AGT"  "CGT" "ACGT"

d <- DNAStringSet(c("TGCACGTGCATT","ACTGCA"))
d
#> DNAStringSet object of length 2:
#>     width seq
#> [1]    12 TGCACGTGCATT
#> [2]     6 ACTGCA
length(d)
#> [1] 2

width(d)
#> [1] 12  6
rev(d)
#> DNAStringSet object of length 2:
#>     width seq
#> [1]     6 ACTGCA
#> [2]    12 TGCACGTGCATT
reverse(d)
#> DNAStringSet object of length 2:
#>     width seq
#> [1]    12 TTACGTGCACGT
#> [2]     6 ACGTCA
reverseComplement(d)
#> DNAStringSet object of length 2:
#>     width seq
#> [1]    12 AATGCACGTGCA
#> [2]     6 TGCAGT
translate(d)
#> AAStringSet object of length 2:
#>     width seq
#> [1]     4 CTCI
#> [2]     2 TA

alphabetFrequency(d)
#>      A C G T M R W S Y K V H D B N - + .
#> [1,] 2 3 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#> [2,] 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
letterFrequency(d,letters = "GC")
#>      G|C
#> [1,]   6
#> [2,]   3
dinucleotideFrequency(d)
#>      AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
#> [1,]  0  1  0  1  2  0  1  0  0  2  0  1  0  0  2  1
#> [2,]  0  1  0  0  1  0  0  1  0  1  0  0  0  0  1  0

# 共识矩阵

consensusMatrix(d)
#>   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#> A    1    0    0    1    0    1    0    0    0     1     0     0
#> C    0    1    1    0    2    0    0    0    1     0     0     0
#> G    0    1    0    1    0    1    0    1    0     0     0     0
#> T    1    0    1    0    0    0    1    0    0     0     1     1
#> M    0    0    0    0    0    0    0    0    0     0     0     0
#> R    0    0    0    0    0    0    0    0    0     0     0     0
#> W    0    0    0    0    0    0    0    0    0     0     0     0
#> S    0    0    0    0    0    0    0    0    0     0     0     0
#> Y    0    0    0    0    0    0    0    0    0     0     0     0
#> K    0    0    0    0    0    0    0    0    0     0     0     0
#> V    0    0    0    0    0    0    0    0    0     0     0     0
#> H    0    0    0    0    0    0    0    0    0     0     0     0
#> D    0    0    0    0    0    0    0    0    0     0     0     0
#> B    0    0    0    0    0    0    0    0    0     0     0     0
#> N    0    0    0    0    0    0    0    0    0     0     0     0
#> -    0    0    0    0    0    0    0    0    0     0     0     0
#> +    0    0    0    0    0    0    0    0    0     0     0     0
#> .    0    0    0    0    0    0    0    0    0     0     0     0

从 Ensembl 的FASTA文件’Homo_sapiens.GRCh38.cdna.all.fa’中下载所有智人cDNA序列

Code
library(AnnotationHub) %>% suppressMessages()
ah <- AnnotationHub(cache = "D:/AnnotationHub")
ah
ah2 <- query(ah, c("fasta", "homo sapiens", "Ensembl", "cdna"))
ah2

# 下载为TwoBitFile文件
dna <- ah2[["AH68262"]]

dna@resource
#> [1]"C:\\Users\\WANGAN~1\\AppData\\Local\\Temp\\RtmpeUSnD9/BiocFileCache/40e014f2123b_75008"

Biostrings::getSeq(dna)
Code
library(rtracklayer)
dna <- import.2bit(con = "C:\\Users\\WANGAN~1\\AppData\\Local\\Temp\\RtmpeUSnD9/BiocFileCache/40e014f2123b_75008")

dna
#> DNAStringSet object of length 187626:
#>           width seq                                         names               
#>      [1]     12 GGGACAGGGGGC                                ENST00000632684.1
#>      [2]      9 CCTTCCTAC                                   ENST00000434970.2
#>      [3]     13 ACTGGGGGATACG                               ENST00000448914.1
#>      [4]      8 GAAATAGT                                    ENST00000415118.1
#>      [5]     12 GGGACAGGGGGC                                ENST00000631435.1
#>      ...    ... ...
#> [187622]   1370 GGCTGAGTCTGGGCCCCAGG...AAGATGCAGCCGGGAGGTGA ENST00000639790.1
#> [187623]    284 GGCGTCTACAAGAGACCTTC...GCCCTAGATGACCACTGTTA ENST00000639660.1
#> [187624]    105 TGCATCACTCTGGCATTGAC...GTGGACCCCAGAAAGTTAAT ENST00000643577.1
#> [187625]    900 ATGGGATGTCACCAATCAAT...GGAAGGAGAGGCTGATGTAA ENST00000646356.1
#> [187626]    930 ATGGGAGTCAACCAATCATG...TGCAGAGGACGCTGTCTATG ENST00000645792.1

2.2.2 RNA

Code
Biostrings::GENETIC_CODE
#> TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TAA TAG TGT TGC TGA TGG CTT CTC CTA CTG 
#> "F" "F" "L" "L" "S" "S" "S" "S" "Y" "Y" "*" "*" "C" "C" "*" "W" "L" "L" "L" "L" 
#> CCT CCC CCA CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG 
#> "P" "P" "P" "P" "H" "H" "Q" "Q" "R" "R" "R" "R" "I" "I" "I" "M" "T" "T" "T" "T" 
#> AAT AAC AAA AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG 
#> "N" "N" "K" "K" "S" "S" "R" "R" "V" "V" "V" "V" "A" "A" "A" "A" "D" "D" "E" "E" 
#> GGT GGC GGA GGG 
#> "G" "G" "G" "G" 
#> attr(,"alt_init_codons")
#> [1] "TTG" "CTG"
Biostrings::RNA_GENETIC_CODE
#> UUU UUC UUA UUG UCU UCC UCA UCG UAU UAC UAA UAG UGU UGC UGA UGG CUU CUC CUA CUG 
#> "F" "F" "L" "L" "S" "S" "S" "S" "Y" "Y" "*" "*" "C" "C" "*" "W" "L" "L" "L" "L" 
#> CCU CCC CCA CCG CAU CAC CAA CAG CGU CGC CGA CGG AUU AUC AUA AUG ACU ACC ACA ACG 
#> "P" "P" "P" "P" "H" "H" "Q" "Q" "R" "R" "R" "R" "I" "I" "I" "M" "T" "T" "T" "T" 
#> AAU AAC AAA AAG AGU AGC AGA AGG GUU GUC GUA GUG GCU GCC GCA GCG GAU GAC GAA GAG 
#> "N" "N" "K" "K" "S" "S" "R" "R" "V" "V" "V" "V" "A" "A" "A" "A" "D" "D" "E" "E" 
#> GGU GGC GGA GGG 
#> "G" "G" "G" "G" 
#> attr(,"alt_init_codons")
#> [1] "UUG" "CUG"
RNA_ALPHABET
#>  [1] "A" "C" "G" "U" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+" "."
r <- RNAStringSet(c("AUCG", "GCAU", "AUCGAU", "GCUA"))
r
#> RNAStringSet object of length 4:
#>     width seq
#> [1]     4 AUCG
#> [2]     4 GCAU
#> [3]     6 AUCGAU
#> [4]     4 GCUA

2.2.3 蛋白质

组成人体蛋白质的21种氨基酸,20 + 硒代半胱氨酸(UGA-终止密码子)

Code
Biostrings::AMINO_ACID_CODE
#>     A     R     N     D     C     Q     E     G     H     I     L     K     M 
#> "Ala" "Arg" "Asn" "Asp" "Cys" "Gln" "Glu" "Gly" "His" "Ile" "Leu" "Lys" "Met" 
#>     F     P     S     T     W     Y     V     U     O     B     J     Z     X 
#> "Phe" "Pro" "Ser" "Thr" "Trp" "Tyr" "Val" "Sec" "Pyl" "Asx" "Xle" "Glx" "Xaa"
AA_ALPHABET
#>  [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
#> [20] "V" "U" "O" "B" "J" "Z" "X" "*" "-" "+" "."

protein_strings <- AAStringSet(c("MATH", "GCAU", "MATHMATH", "CUMA"))
protein_strings
#> AAStringSet object of length 4:
#>     width seq
#> [1]     4 MATH
#> [2]     4 GCAU
#> [3]     8 MATHMATH
#> [4]     4 CUMA

# 二进制字符串存储 如seq_id、quality
binary_strings <- BStringSet(c("ERCC010101", "CRCC110011", "101010", "111000"))
binary_strings
#> BStringSet object of length 4:
#>     width seq
#> [1]    10 ERCC010101
#> [2]    10 CRCC110011
#> [3]     6 101010
#> [4]     6 111000

2.3 Ranges

2.3.1 IntervalRanges

IRanges

Code
library(IRanges)
set.seed(10)
ranges <- IRanges::IRanges(
  start = round(runif(10, 1, 100)),
  width = round(runif(10, 0, 50)),
  names = paste0("exton_", letters[sample(1:26, 10)])
)
ranges
#> IRanges object with 10 ranges and 0 metadata columns:
#>               start       end     width
#>           <integer> <integer> <integer>
#>   exton_h        51        83        33
#>   exton_n        31        58        28
#>   exton_g        43        48         6
#>   exton_f        70        99        30
#>   exton_x         9        26        18
#>   exton_r        23        43        21
#>   exton_u        28        30         3
#>   exton_m        28        40        13
#>   exton_e        62        81        20
#>   exton_a        44        85        42


start(ranges)
#>  [1] 51 31 43 70  9 23 28 28 62 44
width(ranges)
#>  [1] 33 28  6 30 18 21  3 13 20 42
end(ranges)
#>  [1] 83 58 48 99 26 43 30 40 81 85

没有维度,但有长度

Code
dim(ranges)
#> NULL
length(ranges)
#> [1] 10

重叠检测: 检查不同区间是否有重叠。

Code
query <- IRanges(1, 10)
subject <- IRanges(c(5, 15), c(10, 20))
overlaps <- findOverlaps(query, subject)
overlaps
#> Hits object with 1 hit and 0 metadata columns:
#>       queryHits subjectHits
#>       <integer>   <integer>
#>   [1]         1           1
#>   -------
#>   queryLength: 1 / subjectLength: 2

queryHits(overlaps)
#> [1] 1

合并区间: 将重叠或相邻的区间合并为一个更大的区间。

Code
merged <- GenomicRanges::reduce(IRanges(start = c(1, 5), end = c(10, 20)))
print(merged)
#> IRanges object with 1 range and 0 metadata columns:
#>           start       end     width
#>       <integer> <integer> <integer>
#>   [1]         1        20        20
Code
plotRanges <- function(x, xlim = x , 
                       main =deparse(substitute(x)),
                       col = "black", sep = 0.5, ...){
  height = 1
  if(is(xlim, class2 = "Ranges"))
    xlim = c(min(start(xlim)), max(end(xlim)))
  bins <- disjointBins(IRanges(start(x),end(x)+1))
  plot.new()
  plot.window(xlim, c(0, max(bins)*(height + sep)))
  ybottom <- bins * (sep + height) - height
  rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height,
       col = col, ...)
  title(main)
  axis(1)
  
}
Code
ir <- IRanges::IRanges(
  start =c(1, 1, 4, 10),
  end =  c(6, 3, 8, 10)
)
ir
#> IRanges object with 4 ranges and 0 metadata columns:
#>           start       end     width
#>       <integer> <integer> <integer>
#>   [1]         1         6         6
#>   [2]         1         3         3
#>   [3]         4         8         5
#>   [4]        10        10         1
disjoin(ir)
#> IRanges object with 4 ranges and 0 metadata columns:
#>           start       end     width
#>       <integer> <integer> <integer>
#>   [1]         1         3         3
#>   [2]         4         6         3
#>   [3]         7         8         2
#>   [4]        10        10         1

disjointBins(IRanges(start(ir), end(ir) + 1)) # 1,2,3,1 放于第1,2,3,1层
#> [1] 1 2 3 1
disjointBins(IRanges(start(disjoin(ir)), end(disjoin(ir)) + 1)) #放于第1,2,1,1层
#> [1] 1 2 1 1

par(mfrow = c(2,1))
plotRanges(ir)
plotRanges(disjoin(ir))

2.3.2 GenomicRanges

GRanges:用于表示基因组(染色体)范围的数据结构,例如Promoters,Genes,SNPs,CpG Islands,……

Code
library(GenomicRanges)
# help("GRanges-class")

set.seed(10)
Granges <- GRanges(
  seqnames = Rle( values =  c('chr1', 'chr2', 'chr3'), lengths =  c(3, 3, 3)),
  ranges = IRanges(start = seq(1,18,2), width = 3),
  strand = rep(c("+","-","*"),each=3),
  score = 101:109,
  GC = runif(9)
)
Granges
#> GRanges object with 9 ranges and 2 metadata columns:
#>       seqnames    ranges strand |     score        GC
#>          <Rle> <IRanges>  <Rle> | <integer> <numeric>
#>   [1]     chr1       1-3      + |       101  0.507478
#>   [2]     chr1       3-5      + |       102  0.306769
#>   [3]     chr1       5-7      + |       103  0.426908
#>   [4]     chr2       7-9      - |       104  0.693102
#>   [5]     chr2      9-11      - |       105  0.085136
#>   [6]     chr2     11-13      - |       106  0.225437
#>   [7]     chr3     13-15      * |       107  0.274531
#>   [8]     chr3     15-17      * |       108  0.272305
#>   [9]     chr3     17-19      * |       109  0.615829
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome; no seqlengths
Code
sort(Granges)
#> GRanges object with 9 ranges and 2 metadata columns:
#>       seqnames    ranges strand |     score        GC
#>          <Rle> <IRanges>  <Rle> | <integer> <numeric>
#>   [1]     chr1       1-3      + |       101  0.507478
#>   [2]     chr1       3-5      + |       102  0.306769
#>   [3]     chr1       5-7      + |       103  0.426908
#>   [4]     chr2       7-9      - |       104  0.693102
#>   [5]     chr2      9-11      - |       105  0.085136
#>   [6]     chr2     11-13      - |       106  0.225437
#>   [7]     chr3     13-15      * |       107  0.274531
#>   [8]     chr3     15-17      * |       108  0.272305
#>   [9]     chr3     17-19      * |       109  0.615829
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome; no seqlengths

values(Granges) <- DataFrame(score = rnorm(9), GC = runif(9))
Granges
#> GRanges object with 9 ranges and 2 metadata columns:
#>       seqnames    ranges strand |     score        GC
#>          <Rle> <IRanges>  <Rle> | <numeric> <numeric>
#>   [1]     chr1       1-3      + | -0.177211 0.2395891
#>   [2]     chr1       3-5      + |  0.170618 0.7707715
#>   [3]     chr1       5-7      + |  0.242814 0.3558977
#>   [4]     chr2       7-9      - | -0.179406 0.5355970
#>   [5]     chr2      9-11      - | -0.630519 0.0930881
#>   [6]     chr2     11-13      - |  0.978693 0.1698030
#>   [7]     chr3     13-15      * |  0.293297 0.8998325
#>   [8]     chr3     15-17      * | -0.370329 0.4226376
#>   [9]     chr3     17-19      * |  0.543615 0.7477465
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome; no seqlengths

flank(x, width, start = TRUE, both = FALSE) 用于在基因组范围对象 (GRanges) 上扩展或调整范围。它会基于输入的基因组范围返回其上游或下游的邻近区域。flanking sequence

  • x: 一个 GRanges 对象。
  • width: 你想要的上下游区域的宽度(如上例中的 5)。
  • start: 这个参数决定了是否使用起始端(TRUE)还是结束端(FALSE)来生成上下游区域。如果你的基因组是正链,start = TRUE 表示上游。如果是负链,start = TRUE 则表示下游。默认为 TRUE。
  • both: 如果设置为 TRUE,则会在范围的两端生成 width 宽度的区域。
Code
flank(Granges,5)
#> GRanges object with 9 ranges and 2 metadata columns:
#>       seqnames    ranges strand |     score        GC
#>          <Rle> <IRanges>  <Rle> | <numeric> <numeric>
#>   [1]     chr1      -4-0      + | -0.177211 0.2395891
#>   [2]     chr1      -2-2      + |  0.170618 0.7707715
#>   [3]     chr1       0-4      + |  0.242814 0.3558977
#>   [4]     chr2     10-14      - | -0.179406 0.5355970
#>   [5]     chr2     12-16      - | -0.630519 0.0930881
#>   [6]     chr2     14-18      - |  0.978693 0.1698030
#>   [7]     chr3      8-12      * |  0.293297 0.8998325
#>   [8]     chr3     10-14      * | -0.370329 0.4226376
#>   [9]     chr3     12-16      * |  0.543615 0.7477465
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome; no seqlengths

promoters(x, upstream = 2000, downstream = 200) 通常用于提取基因的启动子区域(promoter regions)。在基因组分析中,启动子区域是靠近基因起始位点的区域,通常被认为是调控基因表达的关键区域。启动子区域是基于基因组范围的起始位置提取的,上游(upstream)和下游(downstream)的长度可以自定义。

Code
promoters(Granges)
#> GRanges object with 9 ranges and 2 metadata columns:
#>       seqnames    ranges strand |     score        GC
#>          <Rle> <IRanges>  <Rle> | <numeric> <numeric>
#>   [1]     chr1 -1999-200      + | -0.177211 0.2395891
#>   [2]     chr1 -1997-202      + |  0.170618 0.7707715
#>   [3]     chr1 -1995-204      + |  0.242814 0.3558977
#>   [4]     chr2 -190-2009      - | -0.179406 0.5355970
#>   [5]     chr2 -188-2011      - | -0.630519 0.0930881
#>   [6]     chr2 -186-2013      - |  0.978693 0.1698030
#>   [7]     chr3 -1987-212      * |  0.293297 0.8998325
#>   [8]     chr3 -1985-214      * | -0.370329 0.4226376
#>   [9]     chr3 -1983-216      * |  0.543615 0.7477465
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome; no seqlengths
Code
library(GenomeInfoDb)

genome(Granges) <- "hg19"

seqinfo(Granges)
#> Seqinfo object with 3 sequences from hg19 genome; no seqlengths:
#>   seqnames seqlengths isCircular genome
#>   chr1             NA         NA   hg19
#>   chr2             NA         NA   hg19
#>   chr3             NA         NA   hg19
seqlevels(Granges)
#> [1] "chr1" "chr2" "chr3"
seqlengths(Granges)
#> chr1 chr2 chr3 
#>   NA   NA   NA

gaps(x)用于计算给定的 GRanges 对象中缺失(gaps)的基因组区域。它返回的是不被输入的 GRanges 对象覆盖的范围(即空白区域),通常用于确定在某些染色体区域中没有基因或特征的部分。

Code
gaps(Granges)
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr2       1-6      -
#>   [2]     chr3      1-12      *
#>   -------
#>   seqinfo: 3 sequences from hg19 genome; no seqlengths
Code
library(gggenes)
Granges %>% as.data.frame() %>% 
  rownames_to_column(var = "gene") %>% 
  ggplot(aes(xmin = start, xmax= end,y =seqnames,fill = gene))+
  geom_gene_arrow()+
  facet_wrap(~ seqnames, scales = "free", ncol = 1) +
  scale_fill_brewer(palette = "Set3")+
  theme_genes()

Code

gr1 <- GRanges(seqnames = "chr1", 
               ranges = IRanges(start = c(1, 20), end = c(10, 30)))
gr2 <- GRanges(seqnames = "chr1", 
               ranges = IRanges(start = c(5, 25), end = c(15, 35)))
gr1
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1      1-10      *
#>   [2]     chr1     20-30      *
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
gr2
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1      5-15      *
#>   [2]     chr1     25-35      *
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths
GenomicRanges::setdiff(gr1, gr2)
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1       1-4      *
#>   [2]     chr1     20-24      *
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths

findOverlaps(gr1,gr2)
#> Hits object with 2 hits and 0 metadata columns:
#>       queryHits subjectHits
#>       <integer>   <integer>
#>   [1]         1           1
#>   [2]         2           2
#>   -------
#>   queryLength: 2 / subjectLength: 2

subsetByOverlaps(gr1, gr2)
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1      1-10      *
#>   [2]     chr1     20-30      *
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths