https://bioconductor.org/packages/release/workflows/html/sequencing.html
S4 类
DataFrame
Code
library (S4Vectors)
# 创建一个 DataFrame 对象
df <- DataFrame (a = 1 : 3 , b = letters[1 : 3 ])
df
#> DataFrame with 3 rows and 2 columns
#> a b
#> <integer> <character>
#> 1 1 a
#> 2 2 b
#> 3 3 c
Rle
Run-Length Encoding (RLE) 是一种简单且常用的数据压缩方法,特别适用于存储和处理重复数据序列。class?Rle
Code
# 创建一个 Rle 对象
rle_obj <- Rle (c (rep ("A" , 5 ), rep ("B" , 3 ), rep ("C" , 2 ), "D" , rep ("A" , 2 )))
print (rle_obj)
#> character-Rle of length 13 with 5 runs
#> Lengths: 5 3 2 1 2
#> Values : "A" "B" "C" "D" "A"
runLength (rle_obj)
#> [1] 5 3 2 1 2
在 RLE 中,连续的重复元素(称为“run”)被压缩成一个值和一个计数对。这个值表示重复的元素,计数表示它们的数量。例如,序列 [AAAAABBBCCDAA] 可以被编码为 (A5, B3, C2, D1, A2)。
Biostrings
Biological strings
DNAstringSet、RNAStringSet、AAStringSet、BStringSet
DNA
Code
# BiocManager::install("Biostrings")
library (Biostrings)
Biostrings:: DNA_ALPHABET
#> [1] "A" "C" "G" "T" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+" "."
IUPAC_CODE_MAP
#> A C G T M R W S Y K V
#> "A" "C" "G" "T" "AC" "AG" "AT" "CG" "CT" "GT" "ACG"
#> H D B N
#> "ACT" "AGT" "CGT" "ACGT"
d <- DNAStringSet (c ("TGCACGTGCATT" ,"ACTGCA" ))
d
#> DNAStringSet object of length 2:
#> width seq
#> [1] 12 TGCACGTGCATT
#> [2] 6 ACTGCA
length (d)
#> [1] 2
width (d)
#> [1] 12 6
rev (d)
#> DNAStringSet object of length 2:
#> width seq
#> [1] 6 ACTGCA
#> [2] 12 TGCACGTGCATT
reverse (d)
#> DNAStringSet object of length 2:
#> width seq
#> [1] 12 TTACGTGCACGT
#> [2] 6 ACGTCA
reverseComplement (d)
#> DNAStringSet object of length 2:
#> width seq
#> [1] 12 AATGCACGTGCA
#> [2] 6 TGCAGT
translate (d)
#> AAStringSet object of length 2:
#> width seq
#> [1] 4 CTCI
#> [2] 2 TA
alphabetFrequency (d)
#> A C G T M R W S Y K V H D B N - + .
#> [1,] 2 3 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#> [2,] 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
letterFrequency (d,letters = "GC" )
#> G|C
#> [1,] 6
#> [2,] 3
dinucleotideFrequency (d)
#> AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
#> [1,] 0 1 0 1 2 0 1 0 0 2 0 1 0 0 2 1
#> [2,] 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0
# 共识矩阵
consensusMatrix (d)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#> A 1 0 0 1 0 1 0 0 0 1 0 0
#> C 0 1 1 0 2 0 0 0 1 0 0 0
#> G 0 1 0 1 0 1 0 1 0 0 0 0
#> T 1 0 1 0 0 0 1 0 0 0 1 1
#> M 0 0 0 0 0 0 0 0 0 0 0 0
#> R 0 0 0 0 0 0 0 0 0 0 0 0
#> W 0 0 0 0 0 0 0 0 0 0 0 0
#> S 0 0 0 0 0 0 0 0 0 0 0 0
#> Y 0 0 0 0 0 0 0 0 0 0 0 0
#> K 0 0 0 0 0 0 0 0 0 0 0 0
#> V 0 0 0 0 0 0 0 0 0 0 0 0
#> H 0 0 0 0 0 0 0 0 0 0 0 0
#> D 0 0 0 0 0 0 0 0 0 0 0 0
#> B 0 0 0 0 0 0 0 0 0 0 0 0
#> N 0 0 0 0 0 0 0 0 0 0 0 0
#> - 0 0 0 0 0 0 0 0 0 0 0 0
#> + 0 0 0 0 0 0 0 0 0 0 0 0
#> . 0 0 0 0 0 0 0 0 0 0 0 0
RNA
Code
Biostrings:: GENETIC_CODE
#> TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TAA TAG TGT TGC TGA TGG CTT CTC CTA CTG
#> "F" "F" "L" "L" "S" "S" "S" "S" "Y" "Y" "*" "*" "C" "C" "*" "W" "L" "L" "L" "L"
#> CCT CCC CCA CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG
#> "P" "P" "P" "P" "H" "H" "Q" "Q" "R" "R" "R" "R" "I" "I" "I" "M" "T" "T" "T" "T"
#> AAT AAC AAA AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG
#> "N" "N" "K" "K" "S" "S" "R" "R" "V" "V" "V" "V" "A" "A" "A" "A" "D" "D" "E" "E"
#> GGT GGC GGA GGG
#> "G" "G" "G" "G"
#> attr(,"alt_init_codons")
#> [1] "TTG" "CTG"
Biostrings:: RNA_GENETIC_CODE
#> UUU UUC UUA UUG UCU UCC UCA UCG UAU UAC UAA UAG UGU UGC UGA UGG CUU CUC CUA CUG
#> "F" "F" "L" "L" "S" "S" "S" "S" "Y" "Y" "*" "*" "C" "C" "*" "W" "L" "L" "L" "L"
#> CCU CCC CCA CCG CAU CAC CAA CAG CGU CGC CGA CGG AUU AUC AUA AUG ACU ACC ACA ACG
#> "P" "P" "P" "P" "H" "H" "Q" "Q" "R" "R" "R" "R" "I" "I" "I" "M" "T" "T" "T" "T"
#> AAU AAC AAA AAG AGU AGC AGA AGG GUU GUC GUA GUG GCU GCC GCA GCG GAU GAC GAA GAG
#> "N" "N" "K" "K" "S" "S" "R" "R" "V" "V" "V" "V" "A" "A" "A" "A" "D" "D" "E" "E"
#> GGU GGC GGA GGG
#> "G" "G" "G" "G"
#> attr(,"alt_init_codons")
#> [1] "UUG" "CUG"
RNA_ALPHABET
#> [1] "A" "C" "G" "U" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+" "."
r <- RNAStringSet (c ("AUCG" , "GCAU" , "AUCGAU" , "GCUA" ))
r
#> RNAStringSet object of length 4:
#> width seq
#> [1] 4 AUCG
#> [2] 4 GCAU
#> [3] 6 AUCGAU
#> [4] 4 GCUA
蛋白质
组成人体蛋白质的21种氨基酸,20 + 硒代半胱氨酸(UGA-终止密码子)
Code
Biostrings:: AMINO_ACID_CODE
#> A R N D C Q E G H I L K M
#> "Ala" "Arg" "Asn" "Asp" "Cys" "Gln" "Glu" "Gly" "His" "Ile" "Leu" "Lys" "Met"
#> F P S T W Y V U O B J Z X
#> "Phe" "Pro" "Ser" "Thr" "Trp" "Tyr" "Val" "Sec" "Pyl" "Asx" "Xle" "Glx" "Xaa"
AA_ALPHABET
#> [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
#> [20] "V" "U" "O" "B" "J" "Z" "X" "*" "-" "+" "."
protein_strings <- AAStringSet (c ("MATH" , "GCAU" , "MATHMATH" , "CUMA" ))
protein_strings
#> AAStringSet object of length 4:
#> width seq
#> [1] 4 MATH
#> [2] 4 GCAU
#> [3] 8 MATHMATH
#> [4] 4 CUMA
# 二进制字符串存储 如seq_id、quality
binary_strings <- BStringSet (c ("ERCC010101" , "CRCC110011" , "101010" , "111000" ))
binary_strings
#> BStringSet object of length 4:
#> width seq
#> [1] 10 ERCC010101
#> [2] 10 CRCC110011
#> [3] 6 101010
#> [4] 6 111000
Ranges
IntervalRanges
IRanges
Code
library (IRanges)
set.seed (10 )
ranges <- IRanges:: IRanges (
start = round (runif (10 , 1 , 100 )),
width = round (runif (10 , 0 , 50 )),
names = paste0 ("exton_" , letters[sample (1 : 26 , 10 )])
)
ranges
#> IRanges object with 10 ranges and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> exton_h 51 83 33
#> exton_n 31 58 28
#> exton_g 43 48 6
#> exton_f 70 99 30
#> exton_x 9 26 18
#> exton_r 23 43 21
#> exton_u 28 30 3
#> exton_m 28 40 13
#> exton_e 62 81 20
#> exton_a 44 85 42
start (ranges)
#> [1] 51 31 43 70 9 23 28 28 62 44
width (ranges)
#> [1] 33 28 6 30 18 21 3 13 20 42
end (ranges)
#> [1] 83 58 48 99 26 43 30 40 81 85
没有维度,但有长度
Code
dim (ranges)
#> NULL
length (ranges)
#> [1] 10
重叠检测: 检查不同区间是否有重叠。
Code
query <- IRanges (1 , 10 )
subject <- IRanges (c (5 , 15 ), c (10 , 20 ))
overlaps <- findOverlaps (query, subject)
overlaps
#> Hits object with 1 hit and 0 metadata columns:
#> queryHits subjectHits
#> <integer> <integer>
#> [1] 1 1
#> -------
#> queryLength: 1 / subjectLength: 2
queryHits (overlaps)
#> [1] 1
合并区间: 将重叠或相邻的区间合并为一个更大的区间。
Code
merged <- GenomicRanges:: reduce (IRanges (start = c (1 , 5 ), end = c (10 , 20 )))
print (merged)
#> IRanges object with 1 range and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> [1] 1 20 20
Code
plotRanges <- function (x, xlim = x ,
main = deparse (substitute (x)),
col = "black" , sep = 0.5 , ...){
height = 1
if (is (xlim, class2 = "Ranges" ))
xlim = c (min (start (xlim)), max (end (xlim)))
bins <- disjointBins (IRanges (start (x),end (x)+ 1 ))
plot.new ()
plot.window (xlim, c (0 , max (bins)* (height + sep)))
ybottom <- bins * (sep + height) - height
rect (start (x)- 0.5 , ybottom, end (x)+ 0.5 , ybottom + height,
col = col, ...)
title (main)
axis (1 )
}
Code
ir <- IRanges:: IRanges (
start = c (1 , 1 , 4 , 10 ),
end = c (6 , 3 , 8 , 10 )
)
ir
#> IRanges object with 4 ranges and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> [1] 1 6 6
#> [2] 1 3 3
#> [3] 4 8 5
#> [4] 10 10 1
disjoin (ir)
#> IRanges object with 4 ranges and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> [1] 1 3 3
#> [2] 4 6 3
#> [3] 7 8 2
#> [4] 10 10 1
disjointBins (IRanges (start (ir), end (ir) + 1 )) # 1,2,3,1 放于第1,2,3,1层
#> [1] 1 2 3 1
disjointBins (IRanges (start (disjoin (ir)), end (disjoin (ir)) + 1 )) #放于第1,2,1,1层
#> [1] 1 2 1 1
par (mfrow = c (2 ,1 ))
plotRanges (ir)
plotRanges (disjoin (ir))
GenomicRanges
GRanges :用于表示基因组(染色体)范围的数据结构,例如Promoters,Genes,SNPs,CpG Islands,……
Code
library (GenomicRanges)
# help("GRanges-class")
set.seed (10 )
Granges <- GRanges (
seqnames = Rle ( values = c ('chr1' , 'chr2' , 'chr3' ), lengths = c (3 , 3 , 3 )),
ranges = IRanges (start = seq (1 ,18 ,2 ), width = 3 ),
strand = rep (c ("+" ,"-" ,"*" ),each= 3 ),
score = 101 : 109 ,
GC = runif (9 )
)
Granges
#> GRanges object with 9 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> [1] chr1 1-3 + | 101 0.507478
#> [2] chr1 3-5 + | 102 0.306769
#> [3] chr1 5-7 + | 103 0.426908
#> [4] chr2 7-9 - | 104 0.693102
#> [5] chr2 9-11 - | 105 0.085136
#> [6] chr2 11-13 - | 106 0.225437
#> [7] chr3 13-15 * | 107 0.274531
#> [8] chr3 15-17 * | 108 0.272305
#> [9] chr3 17-19 * | 109 0.615829
#> -------
#> seqinfo: 3 sequences from an unspecified genome; no seqlengths
Code
sort (Granges)
#> GRanges object with 9 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> [1] chr1 1-3 + | 101 0.507478
#> [2] chr1 3-5 + | 102 0.306769
#> [3] chr1 5-7 + | 103 0.426908
#> [4] chr2 7-9 - | 104 0.693102
#> [5] chr2 9-11 - | 105 0.085136
#> [6] chr2 11-13 - | 106 0.225437
#> [7] chr3 13-15 * | 107 0.274531
#> [8] chr3 15-17 * | 108 0.272305
#> [9] chr3 17-19 * | 109 0.615829
#> -------
#> seqinfo: 3 sequences from an unspecified genome; no seqlengths
values (Granges) <- DataFrame (score = rnorm (9 ), GC = runif (9 ))
Granges
#> GRanges object with 9 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <numeric> <numeric>
#> [1] chr1 1-3 + | -0.177211 0.2395891
#> [2] chr1 3-5 + | 0.170618 0.7707715
#> [3] chr1 5-7 + | 0.242814 0.3558977
#> [4] chr2 7-9 - | -0.179406 0.5355970
#> [5] chr2 9-11 - | -0.630519 0.0930881
#> [6] chr2 11-13 - | 0.978693 0.1698030
#> [7] chr3 13-15 * | 0.293297 0.8998325
#> [8] chr3 15-17 * | -0.370329 0.4226376
#> [9] chr3 17-19 * | 0.543615 0.7477465
#> -------
#> seqinfo: 3 sequences from an unspecified genome; no seqlengths
flank(x, width, start = TRUE, both = FALSE) 用于在基因组范围对象 (GRanges) 上扩展或调整范围。它会基于输入的基因组范围返回其上游或下游的邻近区域。flanking sequence
x: 一个 GRanges 对象。
width: 你想要的上下游区域的宽度(如上例中的 5)。
start: 这个参数决定了是否使用起始端(TRUE)还是结束端(FALSE)来生成上下游区域。如果你的基因组是正链,start = TRUE 表示上游。如果是负链,start = TRUE 则表示下游。默认为 TRUE。
both: 如果设置为 TRUE,则会在范围的两端生成 width 宽度的区域。
Code
flank (Granges,5 )
#> GRanges object with 9 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <numeric> <numeric>
#> [1] chr1 -4-0 + | -0.177211 0.2395891
#> [2] chr1 -2-2 + | 0.170618 0.7707715
#> [3] chr1 0-4 + | 0.242814 0.3558977
#> [4] chr2 10-14 - | -0.179406 0.5355970
#> [5] chr2 12-16 - | -0.630519 0.0930881
#> [6] chr2 14-18 - | 0.978693 0.1698030
#> [7] chr3 8-12 * | 0.293297 0.8998325
#> [8] chr3 10-14 * | -0.370329 0.4226376
#> [9] chr3 12-16 * | 0.543615 0.7477465
#> -------
#> seqinfo: 3 sequences from an unspecified genome; no seqlengths
promoters(x, upstream = 2000, downstream = 200) 通常用于提取基因的启动子区域(promoter regions)。在基因组分析中,启动子区域是靠近基因起始位点的区域,通常被认为是调控基因表达的关键区域。启动子区域是基于基因组范围的起始位置提取的,上游(upstream)和下游(downstream)的长度可以自定义。
Code
promoters (Granges)
#> GRanges object with 9 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <numeric> <numeric>
#> [1] chr1 -1999-200 + | -0.177211 0.2395891
#> [2] chr1 -1997-202 + | 0.170618 0.7707715
#> [3] chr1 -1995-204 + | 0.242814 0.3558977
#> [4] chr2 -190-2009 - | -0.179406 0.5355970
#> [5] chr2 -188-2011 - | -0.630519 0.0930881
#> [6] chr2 -186-2013 - | 0.978693 0.1698030
#> [7] chr3 -1987-212 * | 0.293297 0.8998325
#> [8] chr3 -1985-214 * | -0.370329 0.4226376
#> [9] chr3 -1983-216 * | 0.543615 0.7477465
#> -------
#> seqinfo: 3 sequences from an unspecified genome; no seqlengths
Code
genome (Granges) <- "hg19"
seqinfo (Granges)
#> Seqinfo object with 3 sequences from hg19 genome; no seqlengths:
#> seqnames seqlengths isCircular genome
#> chr1 NA NA hg19
#> chr2 NA NA hg19
#> chr3 NA NA hg19
seqlevels (Granges)
#> [1] "chr1" "chr2" "chr3"
seqlengths (Granges)
#> chr1 chr2 chr3
#> NA NA NA
gaps(x)用于计算给定的 GRanges 对象中缺失(gaps)的基因组区域。它返回的是不被输入的 GRanges 对象覆盖的范围(即空白区域),通常用于确定在某些染色体区域中没有基因或特征的部分。
Code
gaps (Granges)
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr2 1-6 -
#> [2] chr3 1-12 *
#> -------
#> seqinfo: 3 sequences from hg19 genome; no seqlengths
Code
gr1 <- GRanges (seqnames = "chr1" ,
ranges = IRanges (start = c (1 , 20 ), end = c (10 , 30 )))
gr2 <- GRanges (seqnames = "chr1" ,
ranges = IRanges (start = c (5 , 25 ), end = c (15 , 35 )))
gr1
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 1-10 *
#> [2] chr1 20-30 *
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
gr2
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 5-15 *
#> [2] chr1 25-35 *
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
GenomicRanges:: setdiff (gr1, gr2)
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 1-4 *
#> [2] chr1 20-24 *
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths
findOverlaps (gr1,gr2)
#> Hits object with 2 hits and 0 metadata columns:
#> queryHits subjectHits
#> <integer> <integer>
#> [1] 1 1
#> [2] 2 2
#> -------
#> queryLength: 2 / subjectLength: 2
subsetByOverlaps (gr1, gr2)
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 1-10 *
#> [2] chr1 20-30 *
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths