Code
library(arrow, warn.conflicts = FALSE)
logCount <- read_tsv_arrow("data/DEGs_Analysis/TCGA-LUAD.htseq_counts.tsv.gz")
# 读取计数矩阵 log2(count+1) # 60488 个gene × 586 个sample
https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
肺腺癌
Lung Adenocarcinoma (LUAD) gene expression RNAseq HTSeq - Counts (n=585) GDC 中心 和Phenotype (n=877)
library(arrow, warn.conflicts = FALSE)
logCount <- read_tsv_arrow("data/DEGs_Analysis/TCGA-LUAD.htseq_counts.tsv.gz")
# 读取计数矩阵 log2(count+1) # 60488 个gene × 586 个sample
如果分析总的转录本,不需要把mRNA、lncRNA等挑出来,所以下载的这个自带注释文件ID/Gene Mapping
gtf_v22_transcripts <- read_delim("data/DEGs_Analysis/gencode.v22.annotation.gene.probeMap")
如果只想用蛋白质编码(protein_coding)相关基因,用全基因编码注释文件 gencode.v46.annotation.gtf.gz
# 全基因编码 BiocManager::install("rtracklayer")
gtf_v46 <- rtracklayer::import('data/DEGs_Analysis/gencode.v46.annotation.gtf.gz') |>
as_tibble()
distinct(gtf_v46,gene_type)
distinct(gtf_v46,gene_name)
# 选择 蛋白编码mRNA相关基因
gtf_v46_mRNA <- dplyr::select(gtf_v46,
c("gene_id","gene_type", "gene_name"))|>
dplyr::filter(gene_type == "protein_coding") |>
distinct()
head(gtf_v46_mRNA )
write.csv(gtf_v46_mRNA,"data/gtf_v46_mRNA.csv")
gene_logCount <- logCount |>
inner_join(gtf_v22_transcripts, by = c("Ensembl_ID" = "id")) |>
dplyr::select(gene, starts_with("TCGA") )
dim(gene_logCount)
#> [1] 60483 586
gene_logCount[1:6,1:3]
gene | TCGA-97-7938-01A | TCGA-55-7574-01A |
---|---|---|
TSPAN6 | 10.989394 | 9.967226 |
TNMD | 4.000000 | 0.000000 |
DPM1 | 10.253847 | 9.541097 |
SCYL3 | 9.776433 | 9.131857 |
C1orf112 | 7.971544 | 8.348728 |
FGR | 8.724514 | 9.703904 |
先去对数 log2(count+1)
基因名是有重复的,查看重复情况
根据基因名分组,重复则取平均值去重,去掉重复的行
library(data.table)
setDT(gene_Count)
Counts <- gene_Count[,lapply(.SD,mean),by=gene, .SDcols=patterns("^(TCGA)")]
Counts <- Counts |> column_to_rownames("gene")
dim(Counts)
#> [1] 58387 585
g | x | y |
---|---|---|
A | 1 | 3 |
B | 2 | 4 |
A | 3 | 5 |
C | 4 | 6 |
C | 5 | 7 |
# 数据多太慢了
df|> group_by(g) |> summarise_all(list(mean))
g | x | y |
---|---|---|
A | 2.0 | 4.0 |
B | 2.0 | 4.0 |
C | 4.5 | 6.5 |
g | x | y |
---|---|---|
A | 2.0 | 4.0 |
B | 2.0 | 4.0 |
C | 4.5 | 6.5 |
提取lncRNA、miRNA等也是类似的操作
#首先载入下载好胆管癌的样本信息文件
# 877 个sample × 125 个样本变量
colData <- fread("data/DEGs_Analysis/TCGA-LUAD.GDC_phenotype.tsv.gz", sep = "\t", header = TRUE)
# 选择样本 放疗与kras基因
table(colData$radiation_therapy,colData$kras_mutation_found)
#>
#> NO YES
#> 230 1 3
#> NO 491 41 24
#> YES 72 11 4
pf <- colData |> dplyr::filter(kras_mutation_found!= "") |>
dplyr::filter(radiation_therapy!= "") |>
dplyr::select(submitter_id.samples,submitter_id,radiation_therapy,kras_mutation_found)
xtabs(formula = ~radiation_therapy+kras_mutation_found,data = pf,subset = NULL)
#> kras_mutation_found
#> radiation_therapy NO YES
#> NO 41 24
#> YES 11 4
到这里,就从877个样本中挑出了80个样本,但是,我们只要这80个样本中成对样本的表达数据,即同一个病人既有癌旁正常细胞的表达蛋白,又有肿瘤细胞的异常表达蛋白。
submitter_id | n |
---|---|
TCGA-50-8457 | 2 |
TCGA-55-7913 | 2 |
TCGA-55-8089 | 2 |
TCGA-44-6148 | 2 |
TCGA-50-6593 | 2 |
TCGA-99-8028 | 2 |
TCGA-50-8459 | 2 |
TCGA-50-6594 | 2 |
TCGA-50-5936 | 2 |
TCGA-55-8205 | 2 |
TCGA-49-6744 | 2 |
TCGA-50-8460 | 2 |
TCGA-49-6742 | 2 |
TCGA-99-8033 | 2 |
TCGA-49-6745 | 2 |
TCGA-91-6848 | 2 |
TCGA-50-5066 | 2 |
TCGA-50-5935 | 2 |
TCGA-J2-8192 | 2 |
TCGA-44-6145 | 2 |
TCGA-55-7914 | 2 |
sample <- left_join(PairedSample,pf,by=join_by("submitter_id"))
sample
submitter_id | n | submitter_id.samples | radiation_therapy | kras_mutation_found |
---|---|---|---|---|
TCGA-50-8457 | 2 | TCGA-50-8457-01A | NO | YES |
TCGA-50-8457 | 2 | TCGA-50-8457-11A | NO | YES |
TCGA-55-7913 | 2 | TCGA-55-7913-01B | NO | NO |
TCGA-55-7913 | 2 | TCGA-55-7913-11A | NO | NO |
TCGA-55-8089 | 2 | TCGA-55-8089-01A | NO | NO |
TCGA-55-8089 | 2 | TCGA-55-8089-11A | NO | NO |
TCGA-44-6148 | 2 | TCGA-44-6148-01A | NO | NO |
TCGA-44-6148 | 2 | TCGA-44-6148-11A | NO | NO |
TCGA-50-6593 | 2 | TCGA-50-6593-01A | YES | NO |
TCGA-50-6593 | 2 | TCGA-50-6593-11A | YES | NO |
TCGA-99-8028 | 2 | TCGA-99-8028-01A | NO | YES |
TCGA-99-8028 | 2 | TCGA-99-8028-11A | NO | YES |
TCGA-50-8459 | 2 | TCGA-50-8459-01A | NO | YES |
TCGA-50-8459 | 2 | TCGA-50-8459-11A | NO | YES |
TCGA-50-6594 | 2 | TCGA-50-6594-01A | NO | NO |
TCGA-50-6594 | 2 | TCGA-50-6594-11A | NO | NO |
TCGA-50-5936 | 2 | TCGA-50-5936-01A | NO | YES |
TCGA-50-5936 | 2 | TCGA-50-5936-11A | NO | YES |
TCGA-55-8205 | 2 | TCGA-55-8205-01A | NO | NO |
TCGA-55-8205 | 2 | TCGA-55-8205-11A | NO | NO |
TCGA-49-6744 | 2 | TCGA-49-6744-01A | NO | YES |
TCGA-49-6744 | 2 | TCGA-49-6744-11A | NO | YES |
TCGA-50-8460 | 2 | TCGA-50-8460-01A | YES | NO |
TCGA-50-8460 | 2 | TCGA-50-8460-11A | YES | NO |
TCGA-49-6742 | 2 | TCGA-49-6742-01A | NO | NO |
TCGA-49-6742 | 2 | TCGA-49-6742-11A | NO | NO |
TCGA-99-8033 | 2 | TCGA-99-8033-01A | YES | NO |
TCGA-99-8033 | 2 | TCGA-99-8033-11A | YES | NO |
TCGA-49-6745 | 2 | TCGA-49-6745-01A | NO | NO |
TCGA-49-6745 | 2 | TCGA-49-6745-11A | NO | NO |
TCGA-91-6848 | 2 | TCGA-91-6848-01A | NO | NO |
TCGA-91-6848 | 2 | TCGA-91-6848-11A | NO | NO |
TCGA-50-5066 | 2 | TCGA-50-5066-01A | NO | NO |
TCGA-50-5066 | 2 | TCGA-50-5066-02A | NO | NO |
TCGA-50-5935 | 2 | TCGA-50-5935-01A | YES | YES |
TCGA-50-5935 | 2 | TCGA-50-5935-11A | YES | YES |
TCGA-J2-8192 | 2 | TCGA-J2-8192-01A | NO | NO |
TCGA-J2-8192 | 2 | TCGA-J2-8192-11A | NO | NO |
TCGA-44-6145 | 2 | TCGA-44-6145-01A | NO | NO |
TCGA-44-6145 | 2 | TCGA-44-6145-11A | NO | NO |
TCGA-55-7914 | 2 | TCGA-55-7914-01A | NO | YES |
TCGA-55-7914 | 2 | TCGA-55-7914-11A | NO | YES |
table(sample$radiation_therapy,sample$kras_mutation_found)
#>
#> NO YES
#> NO 22 12
#> YES 6 2
并且colData
中出现的成对样本(21对)要匹配到assays
中的成对样本(60对)。(取交集)
取sample
和paired_patient
的交集,有8对成对数据
# 取交集
final_sample <- lubridate::intersect(sample$submitter_id,paired_patient)
final_sample
#> [1] "TCGA-44-6148" "TCGA-50-5936" "TCGA-49-6744" "TCGA-49-6742" "TCGA-49-6745"
#> [6] "TCGA-50-5066" "TCGA-50-5935" "TCGA-44-6145"
sample <- sample |>
dplyr::filter(sample$submitter_id %in% final_sample) |>
dplyr::select(-submitter_id, -n) |>
dplyr::rename("sample_id" = "submitter_id.samples") |>
arrange(sample_id)
sample
sample_id | radiation_therapy | kras_mutation_found |
---|---|---|
TCGA-44-6145-01A | NO | NO |
TCGA-44-6145-11A | NO | NO |
TCGA-44-6148-01A | NO | NO |
TCGA-44-6148-11A | NO | NO |
TCGA-49-6742-01A | NO | NO |
TCGA-49-6742-11A | NO | NO |
TCGA-49-6744-01A | NO | YES |
TCGA-49-6744-11A | NO | YES |
TCGA-49-6745-01A | NO | NO |
TCGA-49-6745-11A | NO | NO |
TCGA-50-5066-01A | NO | NO |
TCGA-50-5066-02A | NO | NO |
TCGA-50-5935-01A | YES | YES |
TCGA-50-5935-11A | YES | YES |
TCGA-50-5936-01A | NO | YES |
TCGA-50-5936-11A | NO | YES |
table(sample$radiation_therapy,sample$kras_mutation_found)
#>
#> NO YES
#> NO 10 4
#> YES 0 2
# 最后的表达矩阵
exprCounts <- Counts |> dplyr::select(which(colnames(Counts) %in% sample$sample_id))
exprCounts <- dplyr::select(exprCounts,order(colnames(exprCounts)))
colnames(exprCounts)
#> [1] "TCGA-44-6145-01A" "TCGA-44-6145-11A" "TCGA-44-6148-01A" "TCGA-44-6148-11A"
#> [5] "TCGA-49-6742-01A" "TCGA-49-6742-11A" "TCGA-49-6744-01A" "TCGA-49-6744-11A"
#> [9] "TCGA-49-6745-01A" "TCGA-49-6745-11A" "TCGA-50-5066-01A" "TCGA-50-5066-02A"
#> [13] "TCGA-50-5935-01A" "TCGA-50-5935-11A" "TCGA-50-5936-01A" "TCGA-50-5936-11A"
dim(exprCounts)
#> [1] 58387 16
# 去除所有样本都不表达的基因
exprCounts <- exprCounts |>
mutate(rowsum = apply(exprCounts,1,sum)) |>
dplyr::filter(rowsum!=0) |>
dplyr::select(-rowsum)
dim(exprCounts)
#> [1] 46488 16
# 检查计数矩阵和列数据,看看它们在样本顺序方面是否一致。
colnames(exprCounts)
#> [1] "TCGA-44-6145-01A" "TCGA-44-6145-11A" "TCGA-44-6148-01A" "TCGA-44-6148-11A"
#> [5] "TCGA-49-6742-01A" "TCGA-49-6742-11A" "TCGA-49-6744-01A" "TCGA-49-6744-11A"
#> [9] "TCGA-49-6745-01A" "TCGA-49-6745-11A" "TCGA-50-5066-01A" "TCGA-50-5066-02A"
#> [13] "TCGA-50-5935-01A" "TCGA-50-5935-11A" "TCGA-50-5936-01A" "TCGA-50-5936-11A"
sample$sample_id
#> [1] "TCGA-44-6145-01A" "TCGA-44-6145-11A" "TCGA-44-6148-01A" "TCGA-44-6148-11A"
#> [5] "TCGA-49-6742-01A" "TCGA-49-6742-11A" "TCGA-49-6744-01A" "TCGA-49-6744-11A"
#> [9] "TCGA-49-6745-01A" "TCGA-49-6745-11A" "TCGA-50-5066-01A" "TCGA-50-5066-02A"
#> [13] "TCGA-50-5935-01A" "TCGA-50-5935-11A" "TCGA-50-5936-01A" "TCGA-50-5936-11A"
all(sample$sample_id%in% colnames(exprCounts))
#> [1] TRUE
all(sample$SampleID == colnames(exprCounts))
#> [1] TRUE
根据TCGA样本的命名可以区分正常组织和肿瘤样本的测序结果 其中编号01-09表示肿瘤,10-19表示正常
# 字符14~15,
sample$condition <- factor(
ifelse(as.numeric(str_sub(sample$sample_id,14,15)) < 10,'tumor','normal'),
levels = c("normal","tumor"),
)
sample <- column_to_rownames(sample,var ="sample_id" )
sample
radiation_therapy | kras_mutation_found | condition | |
---|---|---|---|
TCGA-44-6145-01A | NO | NO | tumor |
TCGA-44-6145-11A | NO | NO | normal |
TCGA-44-6148-01A | NO | NO | tumor |
TCGA-44-6148-11A | NO | NO | normal |
TCGA-49-6742-01A | NO | NO | tumor |
TCGA-49-6742-11A | NO | NO | normal |
TCGA-49-6744-01A | NO | YES | tumor |
TCGA-49-6744-11A | NO | YES | normal |
TCGA-49-6745-01A | NO | NO | tumor |
TCGA-49-6745-11A | NO | NO | normal |
TCGA-50-5066-01A | NO | NO | tumor |
TCGA-50-5066-02A | NO | NO | tumor |
TCGA-50-5935-01A | YES | YES | tumor |
TCGA-50-5935-11A | YES | YES | normal |
TCGA-50-5936-01A | NO | YES | tumor |
TCGA-50-5936-11A | NO | YES | normal |
# BiocManager::install("DESeq2")
library(DESeq2)
conflicts_prefer(GenomicRanges::setdiff)
dds <- DESeqDataSetFromMatrix(countData = exprCounts,
colData = sample,
design = ~ condition)
dds
#> class: DESeqDataSet
#> dim: 46488 16
#> metadata(1): version
#> assays(1): counts
#> rownames(46488): TSPAN6 TNMD ... LINC01144 RP11-418H16.1
#> rowData names(0):
#> colnames(16): TCGA-44-6145-01A TCGA-44-6145-11A ... TCGA-50-5936-01A
#> TCGA-50-5936-11A
#> colData names(3): radiation_therapy kras_mutation_found condition
smallestGroupSize <- 8
keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep,]
dds
#> class: DESeqDataSet
#> dim: 19562 16
#> metadata(1): version
#> assays(1): counts
#> rownames(19562): TSPAN6 DPM1 ... RP11-274B21.13 LINC01144
#> rowData names(0):
#> colnames(16): TCGA-44-6145-01A TCGA-44-6145-11A ... TCGA-50-5936-01A
#> TCGA-50-5936-11A
#> colData names(3): radiation_therapy kras_mutation_found condition
dds$condition
#> [1] tumor normal tumor normal tumor normal tumor normal tumor normal
#> [11] tumor tumor tumor normal tumor normal
#> Levels: normal tumor
dds@colData
#> DataFrame with 16 rows and 3 columns
#> radiation_therapy kras_mutation_found condition
#> <character> <character> <factor>
#> TCGA-44-6145-01A NO NO tumor
#> TCGA-44-6145-11A NO NO normal
#> TCGA-44-6148-01A NO NO tumor
#> TCGA-44-6148-11A NO NO normal
#> TCGA-49-6742-01A NO NO tumor
#> ... ... ... ...
#> TCGA-50-5066-02A NO NO tumor
#> TCGA-50-5935-01A YES YES tumor
#> TCGA-50-5935-11A YES YES normal
#> TCGA-50-5936-01A NO YES tumor
#> TCGA-50-5936-11A NO YES normal
head(counts(dds)) # 19562 × 16
#> TCGA-44-6145-01A TCGA-44-6145-11A TCGA-44-6148-01A TCGA-44-6148-11A
#> TSPAN6 1407 783 4794 3163
#> DPM1 1293 1003 982 1158
#> SCYL3 758 389 985 885
#> C1orf112 340 74 173 183
#> FGR 747 3617 759 3076
#> CFH 5718 2611 30270 8703
#> TCGA-49-6742-01A TCGA-49-6742-11A TCGA-49-6744-01A TCGA-49-6744-11A
#> TSPAN6 3275 1696 2798 1360
#> DPM1 922 986 1453 846
#> SCYL3 838 498 808 514
#> C1orf112 261 95 271 119
#> FGR 286 2126 1470 2712
#> CFH 1574 5208 5880 3922
#> TCGA-49-6745-01A TCGA-49-6745-11A TCGA-50-5066-01A TCGA-50-5066-02A
#> TSPAN6 2700 2194 3589 8759
#> DPM1 1159 1584 2848 4398
#> SCYL3 555 570 602 2118
#> C1orf112 386 98 483 1518
#> FGR 1065 1631 541 1169
#> CFH 6554 4153 2071 9081
#> TCGA-50-5935-01A TCGA-50-5935-11A TCGA-50-5936-01A TCGA-50-5936-11A
#> TSPAN6 2199 1202 2537 725
#> DPM1 1129 715 621 769
#> SCYL3 1349 441 556 553
#> C1orf112 386 60 346 80
#> FGR 560 1831 519 3600
#> CFH 1869 3667 4488 1881
des <- DESeq(dds)
results <- results(
object = des,
contrast = c("condition", "tumor", "normal"),
alpha = 0.05,
filter = rowMeans(counts(des, normalized = TRUE)),# 独立过滤
theta = c(0.025, 0.975),
pAdjustMethod = "BH",
)
metadata(results)["alpha"]
#> $alpha
#> [1] 0.05
results[1:6,]
#> log2 fold change (MLE): condition tumor vs normal
#> Wald test p-value: condition tumor vs normal
#> DataFrame with 6 rows and 6 columns
#> baseMean log2FoldChange lfcSE stat pvalue padj
#> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
#> TSPAN6 2491.784 1.009957 0.284407 3.55110 3.83631e-04 2.72195e-03
#> DPM1 1318.670 0.500631 0.336588 1.48737 1.36918e-01 2.72805e-01
#> SCYL3 734.633 0.605618 0.202489 2.99087 2.78184e-03 1.38997e-02
#> C1orf112 268.645 1.951846 0.287025 6.80026 1.04432e-11 7.96688e-10
#> FGR 1655.039 -1.947828 0.290586 -6.70310 2.04049e-11 1.44134e-09
#> CFH 5564.337 0.608888 0.464523 1.31078 1.89932e-01 3.45383e-01
summary(results)
#>
#> out of 19562 with nonzero total read count
#> adjusted p-value < 0.05
#> LFC > 0 (up) : 3248, 17%
#> LFC < 0 (down) : 2160, 11%
#> outliers [1] : 0, 0%
#> low counts [2] : 490, 2.5%
#> (mean count < 13)
#> [1] see 'cooksCutoff' argument of ?results
#> [2] see 'independentFiltering' argument of ?results
sum(results$padj < 0.05, na.rm=TRUE)
#> [1] 5408
# 根据padj 升序
resOrdered <- results[order(results$padj),]
保存csv以备热图、火山图、富集分析。
x <- as.data.frame(resOrdered) |> rownames_to_column(var = "gene")
write_csv(x, file="data/DEGs_Analysis/resOrdered.csv")
mcols(resOrdered)$description
#> [1] "mean of normalized counts for all samples"
#> [2] "log2 fold change (MLE): condition tumor vs normal"
#> [3] "standard error: condition tumor vs normal"
#> [4] "Wald statistic: condition tumor vs normal"
#> [5] "Wald test p-value: condition tumor vs normal"
#> [6] "BH adjusted p-values"
pvalue,是统计学检验变量,代表差异显著性,一般认为P < 0.05 为显著, P <0.01 为非常显著。其含义为:由抽样误差导致样本间差异的概率小于0.05 或0.01。
padj,是对pvalue进行多重假设检验校正后的结果。转录组测序的差异表达分析是对大量的基因表达值进行的独立统计假设检验,存在假阳性(Family-Wise Error Rate,FWER),Bonferroni矫正
qvalue,False Discovery Rate,FDR,所有假阳性结果占所有拒绝零假设结果的比例,假阳性结果的比例不超过预设的FDR水平(0.05/0.1),基于pvalue的分布。多重假设检验校正方法:Benjamini-Hochberg,
log2FoldChange:对数倍数变化,1.5倍差异即0.585,2倍差异即1。
DEG <- read_csv("data/DEGs_Analysis/resOrdered.csv")
DESeq2_DEG <- na.omit(DEG)
nrDEG <- DESeq2_DEG
nrDEG[1:25,]
gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
---|---|---|---|---|---|---|
MGAT3 | 1412.43862 | -4.320278 | 0.2750470 | -15.707414 | 0 | 0 |
RTKN2 | 5130.81804 | -4.640307 | 0.3175103 | -14.614667 | 0 | 0 |
GPM6A | 1228.70589 | -5.145635 | 0.3599700 | -14.294621 | 0 | 0 |
SERTM1 | 298.17214 | -6.886260 | 0.5455626 | -12.622309 | 0 | 0 |
LIMS2 | 1926.05684 | -2.660279 | 0.2146619 | -12.392878 | 0 | 0 |
EMP2 | 31710.06920 | -2.671839 | 0.2229468 | -11.984199 | 0 | 0 |
CNTN6 | 352.36029 | -4.393899 | 0.3796112 | -11.574736 | 0 | 0 |
KANK3 | 495.11283 | -2.848364 | 0.2522794 | -11.290513 | 0 | 0 |
RP1-78O14.1 | 240.67218 | -4.394661 | 0.3899644 | -11.269390 | 0 | 0 |
GPM6B | 796.26123 | -2.528567 | 0.2297754 | -11.004520 | 0 | 0 |
WNT3A | 280.08606 | -5.270964 | 0.4854622 | -10.857621 | 0 | 0 |
PDLIM2 | 1757.53896 | -2.207799 | 0.2064731 | -10.692915 | 0 | 0 |
ITLN2 | 655.86747 | -5.904468 | 0.5588234 | -10.565893 | 0 | 0 |
SLC6A13 | 35.46976 | -3.659195 | 0.3472796 | -10.536740 | 0 | 0 |
GRK5 | 2389.94916 | -2.795107 | 0.2712779 | -10.303480 | 0 | 0 |
EDNRB | 3236.38793 | -3.288481 | 0.3211332 | -10.240242 | 0 | 0 |
KCNN4 | 1240.55071 | 3.836884 | 0.3783283 | 10.141679 | 0 | 0 |
CLIC5 | 8327.11939 | -3.732657 | 0.3685299 | -10.128505 | 0 | 0 |
RASAL1 | 261.44562 | 4.538838 | 0.4504928 | 10.075272 | 0 | 0 |
RP11-371A19.2 | 54.44687 | -5.168047 | 0.5162906 | -10.009957 | 0 | 0 |
PTPN21 | 2327.21817 | -2.527771 | 0.2538084 | -9.959365 | 0 | 0 |
CLEC3B | 2091.74017 | -4.223742 | 0.4294719 | -9.834733 | 0 | 0 |
CTD-2135D7.5 | 38.21341 | -3.366170 | 0.3434858 | -9.800029 | 0 | 0 |
FAM83A | 2565.85011 | 6.377845 | 0.6514036 | 9.790927 | 0 | 0 |
PTPRH | 490.58361 | 6.130422 | 0.6281505 | 9.759480 | 0 | 0 |
挑选指定第一个基因看它在同一个病人的配对表达情况
library(ggpubr)
df <- tibble(
group = sample$condition,
patient = colnames(exprCounts),
expressionValue = as.numeric(exprCounts[DESeq2_DEG$gene[1],]),
)
ggpubr::ggpaired(df, x = "group", y = "expressionValue",color = "group",
line.color = "gray", line.size = 0.4,palette = "npg")+
stat_compare_means()
nrDEG$group <- factor(
if_else(nrDEG$padj<0.05& abs(nrDEG$log2FoldChange)>=1,
if_else(nrDEG$log2FoldChange>=1,"up","down"),
"NS",),
levels =c("up","down","NS") )
table(nrDEG$group)
#>
#> up down NS
#> 2073 1641 15358
gene_labels <- dplyr::filter(nrDEG,nrDEG$padj<0.05 &
abs(nrDEG$log2FoldChange)>=1)|>
slice_head(n=25)
ggplot(nrDEG,aes( x = log2FoldChange,y = -log10(padj),color=group,shape=group))+
geom_point(alpha=0.5,size=1.5)+
scale_color_manual(values = c("red","green","gray"))+
#scale_shape_manual(values = c(2,25,4))+
geom_vline(xintercept = c(-1,0,1),lty=3,color="grey25",lwd=0.8)+
geom_hline(yintercept = -log10(0.05),lty=3,color="grey25",lwd=0.8)+
ggrepel::geom_text_repel(
data = gene_labels,
aes(label=gene),
color="black",
size=2)+
theme_pubr()
if(!require(EnhancedVolcano))
BiocManager::install('EnhancedVolcano')
resultsNames(des)
#> [1] "Intercept" "condition_tumor_vs_normal"
# BiocManager::install("apeglm") 自适应 t 先验收缩估计器 默认
# resLFC <- lfcShrink(des, coef="condition_tumor_vs_normal", type="apeglm")
# `normal`是原始的 DESeq2 收缩估计器,是先验的自适应正态分布
resNorm <- lfcShrink(des, coef=2, type="normal")
# BiocManager::install("ashr")
# resAsh <- lfcShrink(des, coef=2, type="ashr")
## 先验信息
# priorInfo(resLFC)
priorInfo(resNorm)
#> $type
#> [1] "normal"
#>
#> $package
#> [1] "DESeq2"
#>
#> $version
#> [1] '1.44.0'
#>
#> $betaPriorVar
#> Intercept conditiontumor
#> 1.000000e+06 1.658755e+00
# priorInfo(resAsh)
# 对gene添加“是否呈现显著差异表达”的标签
DESeq2_DEG$significant <- factor(ifelse(DESeq2_DEG$padj <0.05, "Significant", "NS"),
levels = c("Significant", "NS"))
table(DESeq2_DEG$significant)
#>
#> Significant NS
#> 5408 13664
# 以baseMean作为x,log2FoldChange作为y 以significant作为分类变量
ggplot(DESeq2_DEG, aes(baseMean, log2FoldChange, colour=significant)) +
geom_point(size=1) +
scale_y_continuous(limits=c(-3, 3)) +
scale_x_log10() +
geom_hline(yintercept = 0, colour="black", linewidth=1) +
labs(x="mean of normalized counts", y="log2FoldChange") +
scale_colour_manual(name="padj",
values=c("Significant"="blue","NS"="grey50")) +
theme_classic()
# the counts of reads for a single gene across the groups
plotCounts(des, gene=which.min(resOrdered$padj),
intgroup="condition",returnData=TRUE) |>
ggplot(aes(x=condition, y=count)) +
geom_point(position=position_jitter(w=0.1,h=0))
colData(dds)
#> DataFrame with 16 rows and 3 columns
#> radiation_therapy kras_mutation_found condition
#> <character> <character> <factor>
#> TCGA-44-6145-01A NO NO tumor
#> TCGA-44-6145-11A NO NO normal
#> TCGA-44-6148-01A NO NO tumor
#> TCGA-44-6148-11A NO NO normal
#> TCGA-49-6742-01A NO NO tumor
#> ... ... ... ...
#> TCGA-50-5066-02A NO NO tumor
#> TCGA-50-5935-01A YES YES tumor
#> TCGA-50-5935-11A YES YES normal
#> TCGA-50-5936-01A NO YES tumor
#> TCGA-50-5936-11A NO YES normal