14  差异表达基因分析

14.1 数据

肺腺癌

Lung Adenocarcinoma (LUAD) gene expression RNAseq HTSeq - Counts (n=585) GDC 中心 和Phenotype (n=877)

14.2 Assays

14.2.1 对数计数矩阵

arrorw 包

  • read_parquet():读取 Parquet 格式的文件
  • read_delim_arrow():读取带分隔符的文本文件
  • read_csv_arrow():读取逗号分隔值 (CSV) 文件
  • read_tsv_arrow():读取制表符分隔值 (TSV) 文件
Code
library(arrow, warn.conflicts = FALSE)
raw_logCount <- read_tsv_arrow("data/TCGA-LUAD.htseq_counts.tsv")
# 读取计数矩阵 log2(count+1)  # 60488 个gene ×  586 个sample


raw_logCount[1:6,1:3]
Ensembl_ID TCGA-97-7938-01A TCGA-55-7574-01A
ENSG00000000003.13 10.989394 9.967226
ENSG00000000005.5 4.000000 0.000000
ENSG00000000419.11 10.253847 9.541097
ENSG00000000457.12 9.776433 9.131857
ENSG00000000460.15 7.971544 8.348728
ENSG00000000938.11 8.724514 9.703904

14.2.2 基因编码文件

我想要分析总的转录本,不需要把mRNA、lncRNA等挑出来,所以下载的这个自带注释文件ID/Gene Mapping

Code
gtf_v22_transcripts <- read_delim("data/gencode.v22.annotation.gene.probeMap")

如果只想用蛋白质编码(protein_coding)相关基因,用全基因编码注释文件 gencode.v22.annotation

Code
# 全基因编码  BiocManager::install("rtracklayer")
gtf_v22 <- rtracklayer::import('data/gencode.v22.annotation.gtf.gz') |>
 as_tibble()

distinct(gtf_v22,gene_type)
distinct(gtf_v22,gene_name)

# 选择 蛋白编码mRNA相关基因
gtf_v22_mRNA <- dplyr::select(gtf_v22,
                              c("gene_id","gene_type", "gene_name"))|> 
    dplyr::filter(gene_type == "protein_coding") |>  
    distinct()

head(gtf_v22_mRNA )
write.csv(gtf_v22_mRNA,"data/gtf_v22_mRNA.csv")

14.2.3 根据 ENSEBML 编码内连接

Code
gene_logCount <- raw_logCount |> 
  inner_join(gtf_v22_transcripts, by = c("Ensembl_ID" = "id")) |> 
  dplyr::select(gene, starts_with("TCGA") )
Code


dim(gene_logCount)
#> [1] 60483   586
gene_logCount[1:6,1:3]
gene TCGA-97-7938-01A TCGA-55-7574-01A
TSPAN6 10.989394 9.967226
TNMD 4.000000 0.000000
DPM1 10.253847 9.541097
SCYL3 9.776433 9.131857
C1orf112 7.971544 8.348728
FGR 8.724514 9.703904

14.2.4 去对数

先去对数 log2(count+1)

Code
gene_Count <- gene_logCount |> 
 mutate_if(is.numeric,~2^(.)-1)
gene_Count[1:6,1:3]
gene TCGA-97-7938-01A TCGA-55-7574-01A
TSPAN6 2032 1000
TNMD 15 0
DPM1 1220 744
SCYL3 876 560
C1orf112 250 325
FGR 422 833

14.2.5 去重

基因名是有重复的,查看重复情况

Code
gene_Count |> distinct(gene) |> dim()
#> [1] 58387     1
repeat_n <- gene_Count |> summarise(n=n(),.by = gene) |> ungroup() |> 
  summarise(重复n次的基因个数=n(),.by =n ) |> arrange(n)
# 唯一基因个数
sum(repeat_n$重复n次的基因个数)
#> [1] 58387

根据基因名分组,重复则取平均值去重,去掉重复的行

Code
library(data.table)
data.table::setDT(gene_Count)
Counts <- gene_Count[,lapply(.SD,mean),by=gene, .SDcols=patterns("^(TCGA)")]

Counts <- Counts |> column_to_rownames("gene")
dim(Counts)
#> [1] 58387   585

提取lncRNA、miRNA等也是类似的操作

Code
df <- tibble(
    g=c('A','B','A','C','C'),
    x=c(1:5),
    y=c(3:7),
    
)
df
g x y
A 1 3
B 2 4
A 3 5
C 4 6
C 5 7
Code
# 数据多太慢了
df|> group_by(g) |> summarise_all(list(mean))
g x y
A 2.0 4.0
B 2.0 4.0
C 4.5 6.5
Code


# 转化成data.table
# library(data.table) 非常快

setDT(df)
# 对基因g进行分组,并对x和y列计算均值
df[, lapply(.SD, mean), by = g, .SDcols = patterns("x|y")]
g x y
A 2.0 4.0
B 2.0 4.0
C 4.5 6.5

14.3 Sample metadata

14.3.1 表型注释文件

Code
#首先载入下载好胆管癌的样本信息文件
# 877 个sample × 125 个样本变量
colData <- fread("data/TCGA-LUAD.GDC_phenotype.tsv.gz", sep = "\t", header = TRUE)



# 选择样本  放疗与kras基因
table(colData$radiation_therapy,colData$kras_mutation_found)
#>      
#>            NO YES
#>       230   1   3
#>   NO  491  41  24
#>   YES  72  11   4

pf <- colData |> dplyr::filter(kras_mutation_found!= "") |> 
  dplyr::filter(radiation_therapy!= "") |> 
  dplyr::select(submitter_id.samples,submitter_id,radiation_therapy,kras_mutation_found)


xtabs(formula = ~radiation_therapy+kras_mutation_found,data = pf,subset = NULL)
#>                  kras_mutation_found
#> radiation_therapy NO YES
#>               NO  41  24
#>               YES 11   4

到这里,就从877个样本中挑出了80个样本,但是,我们只要这80个样本中成对样本的表达数据,即同一个病人既有癌旁正常细胞的表达蛋白,又有肿瘤细胞的异常表达蛋白。

14.3.2 colData的成对样本

Code
ID <- pf$submitter_id  # 前1~12字符


# 21对配对样本
table(ID)[table(ID)==2] |> length()
#> [1] 21

PairedSample <- pf |> summarise(n=n(),.by = "submitter_id")|> dplyr::filter(n==2)
PairedSample 
submitter_id n
TCGA-50-8457 2
TCGA-55-7913 2
TCGA-55-8089 2
TCGA-44-6148 2
TCGA-50-6593 2
TCGA-99-8028 2
TCGA-50-8459 2
TCGA-50-6594 2
TCGA-50-5936 2
TCGA-55-8205 2
TCGA-49-6744 2
TCGA-50-8460 2
TCGA-49-6742 2
TCGA-99-8033 2
TCGA-49-6745 2
TCGA-91-6848 2
TCGA-50-5066 2
TCGA-50-5935 2
TCGA-J2-8192 2
TCGA-44-6145 2
TCGA-55-7914 2
Code
sample <-  left_join(PairedSample,pf,by=join_by("submitter_id"))
sample
submitter_id n submitter_id.samples radiation_therapy kras_mutation_found
TCGA-50-8457 2 TCGA-50-8457-01A NO YES
TCGA-50-8457 2 TCGA-50-8457-11A NO YES
TCGA-55-7913 2 TCGA-55-7913-01B NO NO
TCGA-55-7913 2 TCGA-55-7913-11A NO NO
TCGA-55-8089 2 TCGA-55-8089-01A NO NO
TCGA-55-8089 2 TCGA-55-8089-11A NO NO
TCGA-44-6148 2 TCGA-44-6148-01A NO NO
TCGA-44-6148 2 TCGA-44-6148-11A NO NO
TCGA-50-6593 2 TCGA-50-6593-01A YES NO
TCGA-50-6593 2 TCGA-50-6593-11A YES NO
TCGA-99-8028 2 TCGA-99-8028-01A NO YES
TCGA-99-8028 2 TCGA-99-8028-11A NO YES
TCGA-50-8459 2 TCGA-50-8459-01A NO YES
TCGA-50-8459 2 TCGA-50-8459-11A NO YES
TCGA-50-6594 2 TCGA-50-6594-01A NO NO
TCGA-50-6594 2 TCGA-50-6594-11A NO NO
TCGA-50-5936 2 TCGA-50-5936-01A NO YES
TCGA-50-5936 2 TCGA-50-5936-11A NO YES
TCGA-55-8205 2 TCGA-55-8205-01A NO NO
TCGA-55-8205 2 TCGA-55-8205-11A NO NO
TCGA-49-6744 2 TCGA-49-6744-01A NO YES
TCGA-49-6744 2 TCGA-49-6744-11A NO YES
TCGA-50-8460 2 TCGA-50-8460-01A YES NO
TCGA-50-8460 2 TCGA-50-8460-11A YES NO
TCGA-49-6742 2 TCGA-49-6742-01A NO NO
TCGA-49-6742 2 TCGA-49-6742-11A NO NO
TCGA-99-8033 2 TCGA-99-8033-01A YES NO
TCGA-99-8033 2 TCGA-99-8033-11A YES NO
TCGA-49-6745 2 TCGA-49-6745-01A NO NO
TCGA-49-6745 2 TCGA-49-6745-11A NO NO
TCGA-91-6848 2 TCGA-91-6848-01A NO NO
TCGA-91-6848 2 TCGA-91-6848-11A NO NO
TCGA-50-5066 2 TCGA-50-5066-01A NO NO
TCGA-50-5066 2 TCGA-50-5066-02A NO NO
TCGA-50-5935 2 TCGA-50-5935-01A YES YES
TCGA-50-5935 2 TCGA-50-5935-11A YES YES
TCGA-J2-8192 2 TCGA-J2-8192-01A NO NO
TCGA-J2-8192 2 TCGA-J2-8192-11A NO NO
TCGA-44-6145 2 TCGA-44-6145-01A NO NO
TCGA-44-6145 2 TCGA-44-6145-11A NO NO
TCGA-55-7914 2 TCGA-55-7914-01A NO YES
TCGA-55-7914 2 TCGA-55-7914-11A NO YES
Code
table(sample$radiation_therapy,sample$kras_mutation_found)
#>      
#>       NO YES
#>   NO  22  12
#>   YES  6   2

14.3.3 匹配 assays的成对样本

并且colData 中出现的成对样本(21对)要匹配到assays 中的成对样本(60对)。(取交集)

Code
# 表达矩阵  
patient_of_counts <- str_sub(colnames(Counts),1,12)


paired_patient <- table(patient_of_counts)[table(patient_of_counts)==2]|> 
 names()  # 筛选出60对成对数据
paired_patient |> length()
#> [1] 60

samplepaired_patient的交集,有8对成对数据

Code
# 取交集
final_sample <- lubridate::intersect(sample$submitter_id,paired_patient)
final_sample
#> [1] "TCGA-44-6148" "TCGA-50-5936" "TCGA-49-6744" "TCGA-49-6742" "TCGA-49-6745"
#> [6] "TCGA-50-5066" "TCGA-50-5935" "TCGA-44-6145"

sample <- sample |>
  dplyr::filter(sample$submitter_id %in% final_sample) |>
  dplyr::select(-submitter_id, -n) |>
  dplyr::rename("sample_id" = "submitter_id.samples") |>
  arrange(sample_id)

sample
sample_id radiation_therapy kras_mutation_found
TCGA-44-6145-01A NO NO
TCGA-44-6145-11A NO NO
TCGA-44-6148-01A NO NO
TCGA-44-6148-11A NO NO
TCGA-49-6742-01A NO NO
TCGA-49-6742-11A NO NO
TCGA-49-6744-01A NO YES
TCGA-49-6744-11A NO YES
TCGA-49-6745-01A NO NO
TCGA-49-6745-11A NO NO
TCGA-50-5066-01A NO NO
TCGA-50-5066-02A NO NO
TCGA-50-5935-01A YES YES
TCGA-50-5935-11A YES YES
TCGA-50-5936-01A NO YES
TCGA-50-5936-11A NO YES
Code
table(sample$radiation_therapy,sample$kras_mutation_found)
#>      
#>       NO YES
#>   NO  10   4
#>   YES  0   2




# 最后的表达矩阵
exprCounts <- Counts  |> dplyr::select(which(colnames(Counts) %in% sample$sample_id)) 

exprCounts <- dplyr::select(exprCounts,order(colnames(exprCounts)))
colnames(exprCounts)
#>  [1] "TCGA-44-6145-01A" "TCGA-44-6145-11A" "TCGA-44-6148-01A" "TCGA-44-6148-11A"
#>  [5] "TCGA-49-6742-01A" "TCGA-49-6742-11A" "TCGA-49-6744-01A" "TCGA-49-6744-11A"
#>  [9] "TCGA-49-6745-01A" "TCGA-49-6745-11A" "TCGA-50-5066-01A" "TCGA-50-5066-02A"
#> [13] "TCGA-50-5935-01A" "TCGA-50-5935-11A" "TCGA-50-5936-01A" "TCGA-50-5936-11A"
dim(exprCounts)
#> [1] 58387    16

# 去除所有样本都不表达的基因
exprCounts <- exprCounts |>
    mutate(rowsum = apply(exprCounts,1,sum)) |>
    dplyr::filter(rowsum!=0) |> 
    dplyr::select(-rowsum)

dim(exprCounts)
#> [1] 46488    16
Code
# 检查计数矩阵和列数据,看看它们在样本顺序方面是否一致。
colnames(exprCounts)
#>  [1] "TCGA-44-6145-01A" "TCGA-44-6145-11A" "TCGA-44-6148-01A" "TCGA-44-6148-11A"
#>  [5] "TCGA-49-6742-01A" "TCGA-49-6742-11A" "TCGA-49-6744-01A" "TCGA-49-6744-11A"
#>  [9] "TCGA-49-6745-01A" "TCGA-49-6745-11A" "TCGA-50-5066-01A" "TCGA-50-5066-02A"
#> [13] "TCGA-50-5935-01A" "TCGA-50-5935-11A" "TCGA-50-5936-01A" "TCGA-50-5936-11A"
sample$sample_id
#>  [1] "TCGA-44-6145-01A" "TCGA-44-6145-11A" "TCGA-44-6148-01A" "TCGA-44-6148-11A"
#>  [5] "TCGA-49-6742-01A" "TCGA-49-6742-11A" "TCGA-49-6744-01A" "TCGA-49-6744-11A"
#>  [9] "TCGA-49-6745-01A" "TCGA-49-6745-11A" "TCGA-50-5066-01A" "TCGA-50-5066-02A"
#> [13] "TCGA-50-5935-01A" "TCGA-50-5935-11A" "TCGA-50-5936-01A" "TCGA-50-5936-11A"

all(sample$sample_id%in% colnames(exprCounts))
#> [1] TRUE
all(sample$SampleID == colnames(exprCounts))
#> [1] TRUE

14.4 构造DEG实例

Code
dim(exprCounts)
#> [1] 46488    16
dim(sample)
#> [1] 16  3
exprCounts <- ceiling(exprCounts)

根据TCGA样本的命名可以区分正常组织和肿瘤样本的测序结果 其中编号01-09表示肿瘤,10-19表示正常

Code
# 字符14~15,
sample$condition <- factor(
 ifelse(as.numeric(str_sub(sample$sample_id,14,15)) < 10,'tumor','normal'),
 levels = c("normal","tumor"),
 ) 

sample <- column_to_rownames(sample,var ="sample_id" )
sample
radiation_therapy kras_mutation_found condition
TCGA-44-6145-01A NO NO tumor
TCGA-44-6145-11A NO NO normal
TCGA-44-6148-01A NO NO tumor
TCGA-44-6148-11A NO NO normal
TCGA-49-6742-01A NO NO tumor
TCGA-49-6742-11A NO NO normal
TCGA-49-6744-01A NO YES tumor
TCGA-49-6744-11A NO YES normal
TCGA-49-6745-01A NO NO tumor
TCGA-49-6745-11A NO NO normal
TCGA-50-5066-01A NO NO tumor
TCGA-50-5066-02A NO NO tumor
TCGA-50-5935-01A YES YES tumor
TCGA-50-5935-11A YES YES normal
TCGA-50-5936-01A NO YES tumor
TCGA-50-5936-11A NO YES normal
Code
library(DESeq2)

dds <- DESeqDataSetFromMatrix(countData = exprCounts,
                              colData = sample,
                              design = ~ condition)
dds
#> class: DESeqDataSet 
#> dim: 46488 16 
#> metadata(1): version
#> assays(1): counts
#> rownames(46488): TSPAN6 TNMD ... LINC01144 RP11-418H16.1
#> rowData names(0):
#> colnames(16): TCGA-44-6145-01A TCGA-44-6145-11A ... TCGA-50-5936-01A
#>   TCGA-50-5936-11A
#> colData names(3): radiation_therapy kras_mutation_found condition

14.4.1 预过滤

Code
smallestGroupSize <- 8

keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep,]
dds
#> class: DESeqDataSet 
#> dim: 19562 16 
#> metadata(1): version
#> assays(1): counts
#> rownames(19562): TSPAN6 DPM1 ... RP11-274B21.13 LINC01144
#> rowData names(0):
#> colnames(16): TCGA-44-6145-01A TCGA-44-6145-11A ... TCGA-50-5936-01A
#>   TCGA-50-5936-11A
#> colData names(3): radiation_therapy kras_mutation_found condition
Code
dds$condition
#>  [1] tumor  normal tumor  normal tumor  normal tumor  normal tumor  normal
#> [11] tumor  tumor  tumor  normal tumor  normal
#> Levels: normal tumor

dds@colData
#> DataFrame with 16 rows and 3 columns
#>                  radiation_therapy kras_mutation_found condition
#>                        <character>         <character>  <factor>
#> TCGA-44-6145-01A                NO                  NO    tumor 
#> TCGA-44-6145-11A                NO                  NO    normal
#> TCGA-44-6148-01A                NO                  NO    tumor 
#> TCGA-44-6148-11A                NO                  NO    normal
#> TCGA-49-6742-01A                NO                  NO    tumor 
#> ...                            ...                 ...       ...
#> TCGA-50-5066-02A                NO                  NO    tumor 
#> TCGA-50-5935-01A               YES                 YES    tumor 
#> TCGA-50-5935-11A               YES                 YES    normal
#> TCGA-50-5936-01A                NO                 YES    tumor 
#> TCGA-50-5936-11A                NO                 YES    normal
head(counts(dds)) #  19562   ×   16 
#>          TCGA-44-6145-01A TCGA-44-6145-11A TCGA-44-6148-01A TCGA-44-6148-11A
#> TSPAN6               1407              782             4794             3163
#> DPM1                 1293             1003              982             1158
#> SCYL3                 758              389              985              885
#> C1orf112              340               74              173              183
#> FGR                   747             3617              759             3076
#> CFH                  5718             2611            30270             8704
#>          TCGA-49-6742-01A TCGA-49-6742-11A TCGA-49-6744-01A TCGA-49-6744-11A
#> TSPAN6               3275             1697             2798             1360
#> DPM1                  922              986             1453              846
#> SCYL3                 838              498              808              514
#> C1orf112              261               95              271              119
#> FGR                   286             2126             1470             2712
#> CFH                  1574             5208             5880             3922
#>          TCGA-49-6745-01A TCGA-49-6745-11A TCGA-50-5066-01A TCGA-50-5066-02A
#> TSPAN6               2699             2193             3589             8759
#> DPM1                 1159             1584             2848             4398
#> SCYL3                 556              570              602             2118
#> C1orf112              386               98              483             1518
#> FGR                  1064             1631              541             1170
#> CFH                  6554             4154             2071             9081
#>          TCGA-50-5935-01A TCGA-50-5935-11A TCGA-50-5936-01A TCGA-50-5936-11A
#> TSPAN6               2199             1202             2537              725
#> DPM1                 1128              715              621              769
#> SCYL3                1349              441              556              553
#> C1orf112              386               60              346               80
#> FGR                   560             1831              519             3599
#> CFH                  1869             3667             4488             1881

14.5 差异分析

Code
des <- DESeq(dds)

results <- results(
  object = des,
  contrast = c("condition", "tumor", "normal"),
  alpha = 0.05,
  filter = rowMeans(counts(des, normalized = TRUE)),# 独立过滤
  theta = c(0.025, 0.975),
  pAdjustMethod = "BH",
)
metadata(results)["alpha"]
#> $alpha
#> [1] 0.05

results[1:6,]
#> log2 fold change (MLE): condition tumor vs normal 
#> Wald test p-value: condition tumor vs normal 
#> DataFrame with 6 rows and 6 columns
#>           baseMean log2FoldChange     lfcSE      stat      pvalue        padj
#>          <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
#> TSPAN6    2491.467       1.009841  0.284397   3.55082 3.84037e-04 2.72281e-03
#> DPM1      1318.537       0.500298  0.336581   1.48641 1.37170e-01 2.73195e-01
#> SCYL3      734.677       0.605586  0.202364   2.99256 2.76646e-03 1.38374e-02
#> C1orf112   268.629       1.951605  0.287018   6.79959 1.04916e-11 8.03601e-10
#> FGR       1655.054      -1.948066  0.290578  -6.70412 2.02629e-11 1.43131e-09
#> CFH       5564.097       0.608538  0.464534   1.31000 1.90197e-01 3.45701e-01
summary(results)
#> 
#> out of 19562 with nonzero total read count
#> adjusted p-value < 0.05
#> LFC > 0 (up)       : 3248, 17%
#> LFC < 0 (down)     : 2162, 11%
#> outliers [1]       : 0, 0%
#> low counts [2]     : 490, 2.5%
#> (mean count < 13)
#> [1] see 'cooksCutoff' argument of ?results
#> [2] see 'independentFiltering' argument of ?results
sum(results$padj < 0.05, na.rm=TRUE)
#> [1] 5410

# 根据padj 升序
resOrdered <- results[order(results$padj),]

x <- as.data.frame(resOrdered) |> rownames_to_column(var = "gene")
Code
# 导出到csv
write_csv(x, file="data/resOrdered.csv")
Code
mcols(resOrdered)$description
#> [1] "mean of normalized counts for all samples"        
#> [2] "log2 fold change (MLE): condition tumor vs normal"
#> [3] "standard error: condition tumor vs normal"        
#> [4] "Wald statistic: condition tumor vs normal"        
#> [5] "Wald test p-value: condition tumor vs normal"     
#> [6] "BH adjusted p-values"

pvalue,是统计学检验变量,代表差异显著性,一般认为P < 0.05 为显著, P <0.01 为非常显著。其含义为:由抽样误差导致样本间差异的概率小于0.05 或0.01。

padj,是对pvalue进行多重假设检验校正后的结果。转录组测序的差异表达分析是对大量的基因表达值进行的独立统计假设检验,存在假阳性(Family-Wise Error Rate,FWER),Bonferroni矫正

qvalue,False Discovery Rate,FDR,所有假阳性结果占所有拒绝零假设结果的比例,假阳性结果的比例不超过预设的FDR水平(0.05/0.1),基于pvalue的分布。多重假设检验校正方法:Benjamini-Hochberg,

log2FoldChange:对数倍数变化,1.5倍差异即0.585,2倍差异即1。

Code
DEG <- read_csv("data/resOrdered.csv")

DESeq2_DEG <- na.omit(DEG) 
nrDEG <- DESeq2_DEG
nrDEG[1:25,]
gene baseMean log2FoldChange lfcSE stat pvalue padj
MGAT3 1412.48784 -4.320512 0.2750351 -15.708950 0 0
RTKN2 5131.30675 -4.640581 0.3175376 -14.614275 0 0
GPM6A 1228.76968 -5.145913 0.3599325 -14.296883 0 0
SERTM1 298.26711 -6.886846 0.5456395 -12.621604 0 0
LIMS2 1926.16225 -2.660491 0.2146865 -12.392447 0 0
EMP2 31712.01191 -2.672054 0.2229267 -11.986249 0 0
CNTN6 352.34354 -4.394006 0.3796170 -11.574840 0 0
KANK3 494.99214 -2.848104 0.2522619 -11.290266 0 0
RP1-78O14.1 240.68800 -4.394981 0.3899333 -11.271108 0 0
GPM6B 796.46130 -2.528190 0.2297936 -11.002004 0 0
WNT3A 280.10537 -5.271204 0.4854972 -10.857331 0 0
PDLIM2 1757.52070 -2.208155 0.2065943 -10.688365 0 0
ITLN2 655.74413 -5.911975 0.5599572 -10.557904 0 0
SLC6A13 35.47249 -3.659554 0.3472313 -10.539239 0 0
GRK5 2390.05577 -2.795299 0.2713274 -10.302310 0 0
EDNRB 3236.67412 -3.288727 0.3211718 -10.239777 0 0
KCNN4 1240.49343 3.836694 0.3783690 10.140085 0 0
CLIC5 8327.53172 -3.732958 0.3685035 -10.130047 0 0
RASAL1 261.37053 4.545893 0.4524802 10.046612 0 0
RP11-371A19.2 54.45064 -5.168343 0.5162747 -10.010838 0 0
PTPN21 2327.32072 -2.528024 0.2537924 -9.960991 0 0
CLEC3B 2091.83893 -4.223886 0.4295406 -9.833498 0 0
CTD-2135D7.5 38.21502 -3.366427 0.3434194 -9.802669 0 0
FAM83A 2565.79810 6.377766 0.6515020 9.789327 0 0
PTPRH 490.58126 6.130308 0.6282370 9.757954 0 0

14.5.1 热图

Code

log2exprCounts <- log2(counts(des)+1)
library(pheatmap)

pheatmap(log2exprCounts[(head(DESeq2_DEG$gene,n=25)) , ])

14.5.2 挑选指定第一个基因看它在同一个病人的配对表达情况

Code
library(ggpubr)

df <- tibble(
  group = sample$condition,
  patient = colnames(exprCounts),
  expressionValue = as.numeric(exprCounts[DESeq2_DEG$gene[1],]),
  )

ggpubr::ggpaired(df, x = "group", y = "expressionValue",color = "group", 
                 line.color = "gray", line.size = 0.4,palette = "npg")+
    stat_compare_means()

14.5.3 火山图

Code
nrDEG$group <- factor(
    if_else(nrDEG$padj<0.05& abs(nrDEG$log2FoldChange)>=1,
            if_else(nrDEG$log2FoldChange>=1,"up","down"),
            "NS",),
    levels =c("up","down","NS") )
table(nrDEG$group)
#> 
#>    up  down    NS 
#>  2075  1641 15356

gene_labels <- dplyr::filter(nrDEG,nrDEG$padj<0.05 & 
                                 abs(nrDEG$log2FoldChange)>=1)|> 
    slice_head(n=25) 


ggplot(nrDEG,aes( x = log2FoldChange,y = -log10(padj),color=group,shape=group))+
    geom_point(alpha=0.5,size=1.5)+
    scale_color_manual(values = c("red","green","gray"))+
    #scale_shape_manual(values = c(2,25,4))+
    geom_vline(xintercept = c(-1,0,1),lty=3,color="grey25",lwd=0.8)+
    geom_hline(yintercept = -log10(0.05),lty=3,color="grey25",lwd=0.8)+
    ggrepel::geom_text_repel(
        data = gene_labels,
        aes(label=gene),
        color="black",
        size=2)+
    theme_pubr()

14.5.4 LFC收缩和pagj

Code
resultsNames(des)
#> [1] "Intercept"                 "condition_tumor_vs_normal"
# BiocManager::install("apeglm")  自适应 t 先验收缩估计器  默认
resLFC <- lfcShrink(des, coef="condition_tumor_vs_normal", type="apeglm")

# `normal`是原始的 DESeq2 收缩估计器,是先验的自适应正态分布
resNorm <- lfcShrink(des, coef=2, type="normal")

# BiocManager::install("ashr")
resAsh <- lfcShrink(des, coef=2, type="ashr")


## 先验信息

priorInfo(resLFC)
#> $type
#> [1] "apeglm"
#> 
#> $package
#> [1] "apeglm"
#> 
#> $version
#> [1] '1.24.0'
#> 
#> $prior.control
#> $prior.control$no.shrink
#> [1] 1
#> 
#> $prior.control$prior.mean
#> [1] 0
#> 
#> $prior.control$prior.scale
#> [1] 0.6068578
#> 
#> $prior.control$prior.df
#> [1] 1
#> 
#> $prior.control$prior.no.shrink.mean
#> [1] 0
#> 
#> $prior.control$prior.no.shrink.scale
#> [1] 15
#> 
#> $prior.control$prior.var
#> [1] 0.3682764
priorInfo(resNorm)
#> $type
#> [1] "normal"
#> 
#> $package
#> [1] "DESeq2"
#> 
#> $version
#> [1] '1.42.1'
#> 
#> $betaPriorVar
#>      Intercept conditiontumor 
#>   1.000000e+06   1.659073e+00
priorInfo(resAsh)
#> $type
#> [1] "ashr"
#> 
#> $package
#> [1] "ashr"
#> 
#> $version
#> [1] '2.2.63'
#> 
#> $fitted_g
#> $pi
#>  [1] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
#>  [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.05790045 0.49310211
#> [13] 0.16319352 0.00000000 0.00000000 0.20429865 0.08150528 0.00000000
#> [19] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
#> 
#> $mean
#>  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 
#> $sd
#>  [1]  0.008276313  0.011704474  0.016552626  0.023408948  0.033105252
#>  [6]  0.046817896  0.066210503  0.093635792  0.132421007  0.187271584
#> [11]  0.264842013  0.374543167  0.529684027  0.749086335  1.059368054
#> [16]  1.498172669  2.118736108  2.996345339  4.237472216  5.992690678
#> [21]  8.474944432 11.985381355 16.949888863
#> 
#> attr(,"class")
#> [1] "normalmix"
#> attr(,"row.names")
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23

14.5.5 MA图 Mean-AbsDeviation plot

Code
# 对gene添加“是否呈现显著差异表达”的标签
DESeq2_DEG$significant <- factor(ifelse(DESeq2_DEG$padj <0.05, "Significant", "NS"),
levels = c("Significant", "NS"))

table(DESeq2_DEG$significant)
#> 
#> Significant          NS 
#>        5410       13662
# 以baseMean作为x,log2FoldChange作为y   以significant作为分类变量
ggplot(DESeq2_DEG, aes(baseMean, log2FoldChange, colour=significant)) + 
 geom_point(size=1) + 
 scale_y_continuous(limits=c(-3, 3)) + 
 scale_x_log10() + 
 geom_hline(yintercept = 0, colour="black", linewidth=1) + 
 labs(x="mean of normalized counts", y="log2FoldChange") + 
 scale_colour_manual(name="padj", 
                     values=c("Significant"="blue","NS"="grey50")) +
 theme_classic()

Code
xlim <- c(1,10e6)
ylim <- c(-3,3)
plotMA(resOrdered,  xlim=xlim, ylim=ylim,main="none")

Code
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")

Code
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")

Code
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

14.5.6 readCounts

Code
#  the counts of reads for a single gene across the groups
plotCounts(des, gene=which.min(resOrdered$padj), 
  intgroup="condition",returnData=TRUE) |> 
 ggplot(aes(x=condition, y=count)) + 
 geom_point(position=position_jitter(w=0.1,h=0))

14.5.7 异常观测

Code
W <- resOrdered$stat
maxCooks <- apply(assays(des)[["cooks"]],1,max)
idx <- !is.na(W)
plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic", 
  ylab="maximum Cook's distance per gene",
  ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3))
m <- ncol(des)
p <- 3
abline(h=qf(.99, p, m - p))

14.5.8 多因素设计

Code
colData(dds)
#> DataFrame with 16 rows and 3 columns
#>                  radiation_therapy kras_mutation_found condition
#>                        <character>         <character>  <factor>
#> TCGA-44-6145-01A                NO                  NO    tumor 
#> TCGA-44-6145-11A                NO                  NO    normal
#> TCGA-44-6148-01A                NO                  NO    tumor 
#> TCGA-44-6148-11A                NO                  NO    normal
#> TCGA-49-6742-01A                NO                  NO    tumor 
#> ...                            ...                 ...       ...
#> TCGA-50-5066-02A                NO                  NO    tumor 
#> TCGA-50-5935-01A               YES                 YES    tumor 
#> TCGA-50-5935-11A               YES                 YES    normal
#> TCGA-50-5936-01A                NO                 YES    tumor 
#> TCGA-50-5936-11A                NO                 YES    normal