11  标记基因检测

标记基因检测最直接的方法包括检测簇之间的差异表达。

Code


#--- loading ---#
library(DropletUtils)
sce.pbmc <- DropletUtils::read10xCounts("data/OSCA/raw_gene_bc_matrices/GRCh38",
                                        col.names = TRUE)

#--- gene-annotation ---#
library(scater)
rownames(sce.pbmc) <- uniquifyFeatureNames(
    rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol)

library(EnsDb.Hsapiens.v86)
location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID, 
    column="SEQNAME", keytype="GENEID")

#--- cell-detection ---#
set.seed(100)
e.out <- emptyDrops(counts(sce.pbmc))
sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)]

#--- quality-control ---#
stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=which(location=="MT")))
high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
sce.pbmc <- sce.pbmc[,!high.mito]

#--- normalization ---#
library(scran)
set.seed(1000)
clusters <- quickCluster(sce.pbmc)
sce.pbmc <- computeSumFactors(sce.pbmc, cluster=clusters)
sce.pbmc <- logNormCounts(sce.pbmc)

#--- variance-modelling ---#
set.seed(1001)
dec.pbmc <- modelGeneVarByPoisson(sce.pbmc)
top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)

#--- dimensionality-reduction ---#
set.seed(10000)
sce.pbmc <- denoisePCA(sce.pbmc, subset.row=top.pbmc, technical=dec.pbmc)

set.seed(100000)
sce.pbmc <- runTSNE(sce.pbmc, dimred="PCA")

set.seed(1000000)
sce.pbmc <- runUMAP(sce.pbmc, dimred="PCA")

#--- clustering ---#
g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce.pbmc) <- factor(clust)
Code
sce.pbmc
#> class: SingleCellExperiment 
#> dim: 33694 3985 
#> metadata(1): Samples
#> assays(2): counts logcounts
#> rownames(33694): RP11-34P13.3 FAM138A ... AC213203.1 FAM231B
#> rowData names(2): ID Symbol
#> colnames(3985): AAACCTGAGAAGGCCT-1 AAACCTGAGACAGACC-1 ...
#>   TTTGTCAGTTAAGACA-1 TTTGTCATCCCAAGAT-1
#> colData names(4): Sample Barcode sizeFactor label
#> reducedDimNames(3): PCA TSNE UMAP
#> mainExpName: NULL
#> altExpNames(0):

11.1 通过成对比较对标记基因计分

scoreMarkers()

self.average 在X簇中的某基因平均对数表达值

other.average 所有其他簇的总体平均值

self.detected 在X簇中检测到某基因表达的细胞比例

other.detected 所有其他簇的平均检测到的细胞比例

AUClogFC.cohenlog.C.detected 成对比较中生成的效应值汇总

AUC 或 Cohen’s d 通常是通用标记检测的最佳选择,因为无论表达值的大小如何,它们都是有效的。 检测到表达值的细胞比例的对数倍数变化对于识别表达的binary changes特别有用。

Code
library(scran)
marker.info <- scoreMarkers(sce.pbmc, colLabels(sce.pbmc))

marker.info
#> List of length 16
#> names(16): 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16

colnames(marker.info[["1"]]) # statistics for cluster 1.
#>  [1] "self.average"          "other.average"         "self.detected"        
#>  [4] "other.detected"        "mean.logFC.cohen"      "min.logFC.cohen"      
#>  [7] "median.logFC.cohen"    "max.logFC.cohen"       "rank.logFC.cohen"     
#> [10] "mean.AUC"              "min.AUC"               "median.AUC"           
#> [13] "max.AUC"               "rank.AUC"              "mean.logFC.detected"  
#> [16] "min.logFC.detected"    "median.logFC.detected" "max.logFC.detected"   
#> [19] "rank.logFC.detected"

例如,根据簇1mean.AUC作为排序

Code
chosen <- marker.info[["1"]]
ordered <- chosen[order(chosen$mean.AUC, decreasing=TRUE),]
head(ordered[,1:4]) # showing basic stats only, for brevity.
#> DataFrame with 6 rows and 4 columns
#>       self.average other.average self.detected other.detected
#>          <numeric>     <numeric>     <numeric>      <numeric>
#> NKG7       4.39503     0.7486412      0.985366      0.3768438
#> GNLY       4.40275     0.3489056      0.956098      0.2010344
#> CTSW       2.60281     0.4618797      0.960976      0.2898534
#> HOPX       1.99060     0.1607469      0.907317      0.1160658
#> PRF1       2.22297     0.1659853      0.887805      0.1263180
#> KLRF1      1.60598     0.0379703      0.858537      0.0346691
Code
library(scater)
plotExpression(sce.pbmc, features=head(rownames(ordered)), 
    x="label", colour_by="label")
Figure 11.1

11.2 效应值

Effect sizes for pairwise comparisons

11.2.1 AUC

AUC 表示X簇中随机选择的观测值大于从其他簇中随机选择的观测值的概率。 值1对应于上调,其中X簇的所有值都大于其他簇的任何值;值0.5意味着分布的位置没有净差异;值0对应于下调。

AUC 与 Wilcoxon ranked sum test(又名 Mann-Whitney U 检验)中的U-statistic 密切相关。

Code
chosen
#> DataFrame with 33694 rows and 19 columns
#>              self.average other.average self.detected other.detected
#>                 <numeric>     <numeric>     <numeric>      <numeric>
#> RP11-34P13.3            0   0.000000000             0    0.000000000
#> FAM138A                 0   0.000000000             0    0.000000000
#> OR4F5                   0   0.000000000             0    0.000000000
#> RP11-34P13.7            0   0.002147662             0    0.002845981
#> RP11-34P13.8            0   0.000383217             0    0.000375072
#> ...                   ...           ...           ...            ...
#> AC233755.2       0.000000    0.00000000     0.0000000      0.0000000
#> AC233755.1       0.000000    0.00000000     0.0000000      0.0000000
#> AC240274.1       0.012495    0.00779986     0.0146341      0.0086369
#> AC213203.1       0.000000    0.00000000     0.0000000      0.0000000
#> FAM231B          0.000000    0.00000000     0.0000000      0.0000000
#>              mean.logFC.cohen min.logFC.cohen median.logFC.cohen
#>                     <numeric>       <numeric>          <numeric>
#> RP11-34P13.3       0.00000000       0.0000000                  0
#> FAM138A            0.00000000       0.0000000                  0
#> OR4F5              0.00000000       0.0000000                  0
#> RP11-34P13.7      -0.04251980      -0.2062842                  0
#> RP11-34P13.8      -0.00996135      -0.0813788                  0
#> ...                       ...             ...                ...
#> AC233755.2          0.0000000          0.0000          0.0000000
#> AC233755.1          0.0000000          0.0000          0.0000000
#> AC240274.1          0.0640153         -0.1179          0.0486982
#> AC213203.1          0.0000000          0.0000          0.0000000
#> FAM231B             0.0000000          0.0000          0.0000000
#>              max.logFC.cohen rank.logFC.cohen  mean.AUC   min.AUC median.AUC
#>                    <numeric>        <integer> <numeric> <numeric>  <numeric>
#> RP11-34P13.3               0             5838  0.500000  0.500000        0.5
#> FAM138A                    0             5838  0.500000  0.500000        0.5
#> OR4F5                      0             5838  0.500000  0.500000        0.5
#> RP11-34P13.7               0             5838  0.498577  0.489362        0.5
#> RP11-34P13.8               0             5838  0.499812  0.498344        0.5
#> ...                      ...              ...       ...       ...        ...
#> AC233755.2          0.000000             5838  0.500000  0.500000   0.500000
#> AC233755.1          0.000000             5838  0.500000  0.500000   0.500000
#> AC240274.1          0.169635             1831  0.502997  0.489547   0.503794
#> AC213203.1          0.000000             5838  0.500000  0.500000   0.500000
#> FAM231B             0.000000             5838  0.500000  0.500000   0.500000
#>                max.AUC  rank.AUC mean.logFC.detected min.logFC.detected
#>              <numeric> <integer>           <numeric>          <numeric>
#> RP11-34P13.3       0.5      2978        -4.27124e-17       -3.20343e-16
#> FAM138A            0.5      2978        -4.27124e-17       -3.20343e-16
#> OR4F5              0.5      2978        -4.27124e-17       -3.20343e-16
#> RP11-34P13.7       0.5      2978        -3.45538e-01       -1.60237e+00
#> RP11-34P13.8       0.5      2978        -8.71800e-02       -7.47437e-01
#> ...                ...       ...                 ...                ...
#> AC233755.2    0.500000      2978        -4.27124e-17       -3.20343e-16
#> AC233755.1    0.500000      2978        -4.27124e-17       -3.20343e-16
#> AC240274.1    0.507317      1649         4.64243e-01       -8.43430e-01
#> AC213203.1    0.500000      2978        -4.27124e-17       -3.20343e-16
#> FAM231B       0.500000      2978        -4.27124e-17       -3.20343e-16
#>              median.logFC.detected max.logFC.detected rank.logFC.detected
#>                          <numeric>          <numeric>           <integer>
#> RP11-34P13.3                     0        3.20343e-16                2645
#> FAM138A                          0        3.20343e-16                2645
#> OR4F5                            0        3.20343e-16                2645
#> RP11-34P13.7                     0        1.60171e-16                2645
#> RP11-34P13.8                     0        3.20343e-16                2645
#> ...                            ...                ...                 ...
#> AC233755.2                0.000000        3.20343e-16                2645
#> AC233755.1                0.000000        3.20343e-16                2645
#> AC240274.1                0.613645        9.20433e-01                 835
#> AC213203.1                0.000000        3.20343e-16                2645
#> FAM231B                   0.000000        3.20343e-16                2645
auc.only <- chosen[,grepl("AUC", colnames(chosen))]
auc.only[order(auc.only$mean.AUC,decreasing=TRUE),]
#> DataFrame with 33694 rows and 5 columns
#>         mean.AUC    min.AUC median.AUC   max.AUC  rank.AUC
#>        <numeric>  <numeric>  <numeric> <numeric> <integer>
#> NKG7    0.963726   0.805885   0.988947  0.991803         1
#> GNLY    0.958868   0.877404   0.973347  0.974913         2
#> CTSW    0.932352   0.709079   0.966638  0.978873         2
#> HOPX    0.928138   0.780016   0.949129  0.953659         3
#> PRF1    0.924690   0.837420   0.940300  0.943902         5
#> ...          ...        ...        ...       ...       ...
#> RPS13   0.174413 0.00684690  0.0806435  0.695732       426
#> RPL26   0.170493 0.01334117  0.0724876  0.813720       104
#> RPL18A  0.169516 0.01090122  0.0760620  0.749735       264
#> RPL39   0.152346 0.00405525  0.0626341  0.774848       185
#> FTH1    0.147289 0.00121951  0.0639979  0.645732       766

11.2.2 LogFC.cohen

Cohen’s d是一种标准化的对数倍数变化(log-fold change),

正值表示该基因在簇中上调,负值表示下调,接近零的值表示差异不大。Cohen’s d大致类似于双样本t-检验的t-statistic

Code
cohen.only <- chosen[,grepl("logFC.cohen", colnames(chosen))]
cohen.only[order(cohen.only$mean.logFC.cohen,decreasing=TRUE),]
#> DataFrame with 33694 rows and 5 columns
#>         mean.logFC.cohen min.logFC.cohen median.logFC.cohen max.logFC.cohen
#>                <numeric>       <numeric>          <numeric>       <numeric>
#> NKG7             4.84346        1.025337            5.84606         6.30987
#> GNLY             3.71574        1.793853            4.04410         4.36929
#> CTSW             2.96940        0.699433            3.19968         3.96973
#> GZMA             2.69683        0.399487            3.18392         3.44040
#> HOPX             2.67330        1.108548            2.92242         3.06690
#> ...                  ...             ...                ...             ...
#> FTH1            -2.28562        -5.19176          -2.533685       0.2995322
#> HLA-DRA         -2.39933        -7.13493          -2.032812       0.0146072
#> FTL             -2.40544        -5.82525          -1.285601       0.2569453
#> CST3            -2.56767        -7.92584          -0.950339       0.0336350
#> LYZ             -2.84045        -9.00815          -0.341198      -0.1797338
#>         rank.logFC.cohen
#>                <integer>
#> NKG7                   1
#> GNLY                   1
#> CTSW                   3
#> GZMA                   2
#> HOPX                   4
#> ...                  ...
#> FTH1                4362
#> HLA-DRA             6202
#> FTL                 5319
#> CST3                5608
#> LYZ                30966

11.2.3 logFC.detected

成对比较检测到表达值的细胞比例的对数倍数变化,这忽略了有关表达量的任何信息,只考虑是否检测到表达值。 正值表示与其他簇相比,X簇中表达基因的细胞比例更大。

Code
detect.only <- chosen[,grepl("logFC.detected", colnames(chosen))]
detect.only[order(detect.only$mean.logFC.detected,decreasing=TRUE),]
#> DataFrame with 33694 rows and 5 columns
#>        mean.logFC.detected min.logFC.detected median.logFC.detected
#>                  <numeric>          <numeric>             <numeric>
#> KLRF1              4.80539           2.532196               5.33959
#> PRSS23             4.43836           2.354538               4.57967
#> XCL1               4.42946           1.398559               4.91134
#> XCL2               4.42099           1.304208               4.87151
#> SH2D1B             4.17329           0.804099               4.54737
#> ...                    ...                ...                   ...
#> MARCKS            -3.14645           -6.96050              -2.00000
#> RAB31             -3.23328           -6.63557              -2.58496
#> SLC7A7            -3.28244           -6.95262              -2.64295
#> RAB32             -3.42074           -6.72160              -3.32193
#> NCF2              -3.76139           -7.00693              -3.38863
#>        max.logFC.detected rank.logFC.detected
#>                 <numeric>           <integer>
#> KLRF1             6.50482                   1
#> PRSS23            5.98530                   2
#> XCL1              6.16993                   1
#> XCL2              6.21524                   1
#> SH2D1B            5.85007                   3
#> ...                   ...                 ...
#> MARCKS                  0               11805
#> RAB31                  -1               31423
#> SLC7A7                  0               11796
#> RAB32                   0               11805
#> NCF2                    0               11805

11.3 summary

mean,与其他簇的平均值相比,对于簇X,较大的均值(>0 for the log-fold changes, >0.5 for the AUCs)表示在簇X中该基因上调。

median,与大多数 (>50%) 其他簇相比,较大的值表示该基因在X簇中上调。对于异常值, 中位数比均值提供更大的稳健性robustness,而均值可能是可取的,也可能是不可取的。 一方面,如果只有少数成对比较具有较大的效应,则中位数可以避免夸大效应值; 另一方面,它也会通过忽略少数具有相反效果的成对比较来夸大效应值。

minimum value,是鉴定上调基因最严格的。因为最小值是个大值表示与所有其他集群相比,该基因在X簇中上调。 相反,如果最小值很小(<0 for the log-fold changes, <0.5 for the AUCs),与至少一个其他簇相比,该基因在X簇中下调。

maximum value, 是识别上调基因最不严格的,因为与任何一个其他集群相比,如果在X簇中存在强烈上调,最大值可以是个大值。 相反,如果最大值很小,与所有其他集群相比,该基因在X簇中下调。

minimum rank,是所有成对比较中每个基因的最小排序。具体来说,根据效应值降序,在每个成对比较中对基因进行排序,然后为每个基因报告所有成对比较中的最小排序。如果一个基因的最小秩很小,在至少一个簇X与另一个簇的成对比较中,它是最上调的基因之一。

Code
chosen <- marker.info[["4"]] # using another cluster, for some variety.
ordered <- chosen[order(chosen$median.logFC.cohen,decreasing=TRUE),]
head(ordered[,1:4]) # showing basic stats only, for brevity.
#> DataFrame with 6 rows and 4 columns
#>        self.average other.average self.detected other.detected
#>           <numeric>     <numeric>     <numeric>      <numeric>
#> LYZ         4.97589      2.188256      1.000000       0.657832
#> S100A9      4.94461      2.059524      1.000000       0.705775
#> FTL         5.58358      4.140706      1.000000       0.961428
#> S100A8      4.61729      1.834030      1.000000       0.639031
#> CTSS        3.28383      1.568992      1.000000       0.630574
#> CSTA        1.76876      0.685007      0.982143       0.337978
Code
plotExpression(sce.pbmc, features=head(rownames(ordered)), 
    x="label", colour_by="label")
Figure 11.2
Code
ordered <- chosen[order(chosen$rank.logFC.cohen),]
top.ranked <- ordered[ordered$rank.logFC.cohen <= 5,]  # 最小排序小于或等于 5
top.ranked$rank.logFC.cohen
#>        S100A8        S100A4         RPS27        MALAT1           LYZ 
#>             1             1             1             1             1 
#>        TYROBP          CTSS         RPL31        EEF1B2           LTB 
#>             1             2             2             2             2 
#>         RPS21           FTL        S100A9          RPSA          GPX1 
#>             2             2             3             3             3 
#>         RPS29         RPLP1         RPL17          CST3       S100A11 
#>             3             3             3             3             4 
#>        RPS27A        RPS15A        MT-ND2        FCER1G         RPS12 
#>             4             4             4             5             5 
#>        RPL36A RP11-1143G9.4          IL32        RPL23A 
#>             5             5             5             5
rownames(top.ranked)
#>  [1] "S100A8"        "S100A4"        "RPS27"         "MALAT1"       
#>  [5] "LYZ"           "TYROBP"        "CTSS"          "RPL31"        
#>  [9] "EEF1B2"        "LTB"           "RPS21"         "FTL"          
#> [13] "S100A9"        "RPSA"          "GPX1"          "RPS29"        
#> [17] "RPLP1"         "RPL17"         "CST3"          "S100A11"      
#> [21] "RPS27A"        "RPS15A"        "MT-ND2"        "FCER1G"       
#> [25] "RPS12"         "RPL36A"        "RP11-1143G9.4" "IL32"         
#> [29] "RPL23A"
plotGroupedHeatmap(sce.pbmc, features=rownames(top.ranked), group="label", 
    center=TRUE, zlim=c(-3, 3))
Figure 11.3
Code
# Omitting the decreasing=TRUE to focus on negative effects.
ordered <- chosen[order(chosen$median.logFC.cohen),1:4]  #检查一些升序靠前的基因,看看与其他簇相比是否有任何一致的下调
head(ordered)
#> DataFrame with 6 rows and 4 columns
#>       self.average other.average self.detected other.detected
#>          <numeric>     <numeric>     <numeric>      <numeric>
#> HLA-B     4.013806      4.438053      1.000000       0.971760
#> HLA-E     1.789158      2.093695      0.982143       0.871183
#> RPSA      2.685950      2.563913      1.000000       0.829648
#> HLA-C     3.116699      3.527693      1.000000       0.934941
#> BIN2      0.410471      0.667019      0.464286       0.456757
#> RAC2      0.823508      0.982662      0.714286       0.630389

11.4 full effects

设置 full.stats=TRUE 获取涉及特定聚类的所有成对比较的效应值

Code
# cluster 4 vs any other cluster
marker.info <- scoreMarkers(sce.pbmc, colLabels(sce.pbmc), full.stats=TRUE)
chosen <- marker.info[["4"]]
chosen$full.AUC
#> DataFrame with 33694 rows and 15 columns
#>                      1         2         3         5         6         7
#>              <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
#> RP11-34P13.3       0.5  0.500000  0.500000  0.500000       0.5       0.5
#> FAM138A            0.5  0.500000  0.500000  0.500000       0.5       0.5
#> OR4F5              0.5  0.500000  0.500000  0.500000       0.5       0.5
#> RP11-34P13.7       0.5  0.499016  0.499076  0.497326       0.5       0.5
#> RP11-34P13.8       0.5  0.500000  0.500000  0.500000       0.5       0.5
#> ...                ...       ...       ...       ...       ...       ...
#> AC233755.2    0.500000  0.500000  0.500000  0.500000     0.500       0.5
#> AC233755.1    0.500000  0.500000  0.500000  0.500000     0.500       0.5
#> AC240274.1    0.492683  0.496063  0.494455  0.495989     0.496       0.5
#> AC213203.1    0.500000  0.500000  0.500000  0.500000     0.500       0.5
#> FAM231B       0.500000  0.500000  0.500000  0.500000     0.500       0.5
#>                      8         9        10        11        12        13
#>              <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
#> RP11-34P13.3  0.500000  0.500000       0.5  0.500000       0.5       0.5
#> FAM138A       0.500000  0.500000       0.5  0.500000       0.5       0.5
#> OR4F5         0.500000  0.500000       0.5  0.500000       0.5       0.5
#> RP11-34P13.7  0.498843  0.495033       0.5  0.489362       0.5       0.5
#> RP11-34P13.8  0.498843  0.498344       0.5  0.500000       0.5       0.5
#> ...                ...       ...       ...       ...       ...       ...
#> AC233755.2    0.500000  0.500000  0.500000  0.500000  0.500000  0.500000
#> AC233755.1    0.500000  0.500000  0.500000  0.500000  0.500000  0.500000
#> AC240274.1    0.496528  0.496689  0.494233  0.489362  0.496774  0.496988
#> AC213203.1    0.500000  0.500000  0.500000  0.500000  0.500000  0.500000
#> FAM231B       0.500000  0.500000  0.500000  0.500000  0.500000  0.500000
#>                     14        15        16
#>              <numeric> <numeric> <numeric>
#> RP11-34P13.3       0.5       0.5       0.5
#> FAM138A            0.5       0.5       0.5
#> OR4F5              0.5       0.5       0.5
#> RP11-34P13.7       0.5       0.5       0.5
#> RP11-34P13.8       0.5       0.5       0.5
#> ...                ...       ...       ...
#> AC233755.2         0.5  0.500000       0.5
#> AC233755.1         0.5  0.500000       0.5
#> AC240274.1         0.5  0.482143       0.5
#> AC213203.1         0.5  0.500000       0.5
#> FAM231B            0.5  0.500000       0.5

假设我们想确定将簇 4 与其他具有高 LYZ 表达的簇区分开来的基因。 对相关比较进行取子集,并对汇总统计量进行排序,以获得该子集中标记基因的排名。 这使我们能够轻松地表征密切相关的集群之间的细微差异。 为了说明这一点,与其他 高LYZ 簇相比,我们使用最小的排名computeMinRank()来识别簇 4 中排名靠前的 DE 基因。

Code
lyz.high <- c("4", "6", "8", "9", "14") # based on inspection of the previous Figure.
subset <- chosen$full.AUC[,colnames(chosen$full.AUC) %in% lyz.high]
to.show <- subset[computeMinRank(subset) <= 10,]
to.show
#> DataFrame with 27 rows and 4 columns
#>                 6         8         9        14
#>         <numeric> <numeric> <numeric> <numeric>
#> CTSS     0.942286  0.195230  0.203288  0.208138
#> S100A12  0.885571  0.204034  0.605428  0.287178
#> S100A8   0.898143  0.156457  0.639368  0.112705
#> RPS27    0.906286  0.943576  0.905866  0.949063
#> RPS27A   0.810000  0.862021  0.832663  0.932084
#> ...           ...       ...       ...       ...
#> FTL      0.941857  0.115369  0.118023 0.0954333
#> RPL13A   0.637143  0.858466  0.813683 0.9396956
#> RPL3     0.383143  0.819651  0.764250 0.8934426
#> MT-ND2   0.922143  0.576100  0.536128 0.7391686
#> MT-ND3   0.906857  0.432746  0.514487 0.5386417
Code
plotGroupedHeatmap(sce.pbmc[,colLabels(sce.pbmc) %in% lyz.high],
    features=rownames(to.show), group="label", center=TRUE, zlim=c(-3, 3))
Figure 11.4

自定义汇总统计量例如,对于相对于其他集群的某个百分比(例如80%)上调的标记物感兴趣。

Code
stat <- MatrixGenerics::rowQuantiles(as.matrix(chosen$full.AUC), p=0.2)
chosen[order(stat, decreasing=TRUE), 1:4] 
#> DataFrame with 33694 rows and 4 columns
#>         self.average other.average self.detected other.detected
#>            <numeric>     <numeric>     <numeric>      <numeric>
#> S100A12      1.94300      0.609079      0.928571       0.263675
#> JUND         1.42774      0.619368      0.910714       0.442422
#> S100A9       4.94461      2.059524      1.000000       0.705775
#> VCAN         1.63794      0.436597      0.875000       0.222359
#> S100A8       4.61729      1.834030      1.000000       0.639031
#> ...              ...           ...           ...            ...
#> RPL23A       3.96991       3.74428             1       0.905175
#> RPSA         2.68595       2.56391             1       0.829648
#> HLA-C        3.11670       3.52769             1       0.934941
#> RPS29        4.57983       4.27407             1       0.919797
#> HLA-B        4.01381       4.43805             1       0.971760

11.5 log-fold change threshold

设置 lfc= 来计算相对于对数倍数变化阈值的效应值

Code
marker.info.lfc <- scoreMarkers(sce.pbmc, colLabels(sce.pbmc), lfc=2)
chosen2 <- marker.info.lfc[["5"]] 
chosen2 <- chosen2[order(chosen2$mean.AUC, decreasing=TRUE),]
chosen2[,c("self.average", "other.average", "mean.AUC")]
#> DataFrame with 33694 rows and 3 columns
#>            self.average other.average  mean.AUC
#>               <numeric>     <numeric> <numeric>
#> CCL5            3.81358      1.047986  0.747609
#> NKG7            3.17038      0.830285  0.654736
#> IL32            3.29770      1.076383  0.633969
#> GZMA            1.92877      0.426848  0.454187
#> TRAC            2.25881      0.848025  0.434530
#> ...                 ...           ...       ...
#> AC233755.2   0.00000000    0.00000000         0
#> AC233755.1   0.00000000    0.00000000         0
#> AC240274.1   0.00942441    0.00800457         0
#> AC213203.1   0.00000000    0.00000000         0
#> FAM231B      0.00000000    0.00000000         0
Code
plotDots(sce.pbmc, rownames(chosen2)[1:10], group="label")

11.6 blocking factors

变异因素(例如,批次效应、性别差异)

block=

Code
#--- loading ---#
library(scRNAseq)
sce.416b <- LunSpikeInData(which="416b") 
sce.416b$block <- factor(sce.416b$block)

#--- gene-annotation ---#
library(AnnotationHub)
ens.mm.v97 <- AnnotationHub()[["AH73905"]]
rowData(sce.416b)$ENSEMBL <- rownames(sce.416b)
rowData(sce.416b)$SYMBOL <- mapIds(ens.mm.v97, keys=rownames(sce.416b),
    keytype="GENEID", column="SYMBOL")
rowData(sce.416b)$SEQNAME <- mapIds(ens.mm.v97, keys=rownames(sce.416b),
    keytype="GENEID", column="SEQNAME")

library(scater)
rownames(sce.416b) <- uniquifyFeatureNames(rowData(sce.416b)$ENSEMBL, 
    rowData(sce.416b)$SYMBOL)

#--- quality-control ---#
mito <- which(rowData(sce.416b)$SEQNAME=="MT")
stats <- perCellQCMetrics(sce.416b, subsets=list(Mt=mito))
qc <- quickPerCellQC(stats, percent_subsets=c("subsets_Mt_percent",
    "altexps_ERCC_percent"), batch=sce.416b$block)
sce.416b <- sce.416b[,!qc$discard]

#--- normalization ---#
library(scran)
sce.416b <- computeSumFactors(sce.416b)
sce.416b <- logNormCounts(sce.416b)

#--- variance-modelling ---#
dec.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC", block=sce.416b$block)
chosen.hvgs <- getTopHVGs(dec.416b, prop=0.1)

#--- batch-correction ---#
library(limma)
assay(sce.416b, "corrected") <- removeBatchEffect(logcounts(sce.416b), 
    design=model.matrix(~sce.416b$phenotype), batch=sce.416b$block)

#--- dimensionality-reduction ---#
sce.416b <- runPCA(sce.416b, ncomponents=10, subset_row=chosen.hvgs,
    exprs_values="corrected", BSPARAM=BiocSingular::ExactParam())

set.seed(1010)
sce.416b <- runTSNE(sce.416b, dimred="PCA", perplexity=10)

#--- clustering ---#
my.dist <- dist(reducedDim(sce.416b, "PCA"))
my.tree <- hclust(my.dist, method="ward.D2")

library(dynamicTreeCut)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist),
    minClusterSize=10, verbose=0))
colLabels(sce.416b) <- factor(my.clusters)
Code
m.out <- scoreMarkers(sce.416b, colLabels(sce.416b), block=sce.416b$block)

在每个批次内进行成对比较,抵消了任何批次效应。然后使用加权平均数对各批次的效应值进行平均以获得每个成对比较的单个值,该加权平均数考虑了每个批次中参与成对比较的细胞数。对每个簇内外的平均对数表达和检测到的细胞比例进行了类似的校正。

Code
demo <- m.out[["1"]] 
ordered <- demo[order(demo$median.logFC.cohen, decreasing=TRUE),]
ordered[,1:4]
#> DataFrame with 46604 rows and 4 columns
#>         self.average other.average self.detected other.detected
#>            <numeric>     <numeric>     <numeric>      <numeric>
#> Myh11        4.03436      0.861019      0.988132       0.303097
#> Cd200r3      7.97667      3.524762      0.977675       0.624507
#> Pi16         6.27654      2.644421      0.957126       0.530395
#> Actb        15.48533     14.808584      1.000000       1.000000
#> Ctsd        11.61247      9.130141      1.000000       1.000000
#> ...              ...           ...           ...            ...
#> Spc24      0.4772577       5.03548      0.222281       0.862153
#> Ska1       0.0787421       4.43426      0.118743       0.773950
#> Pimreg     0.5263611       5.35494      0.258150       0.910706
#> Birc5      1.5580536       7.07230      0.698746       0.976929
#> Ccna2      0.9664521       6.55243      0.554104       0.948520
Code
plotExpression(sce.416b, features=rownames(ordered)[1:6],
    x="label", colour_by="block")
Figure 11.5

该参数block=适用于上面显示的所有效应值,并且对批次之间对数倍数变化或方差的差异具有鲁棒性。 但是,它假定每对簇至少存在于一个批次中。如果来自两个簇的细胞从未在同一批次中同时出现,则无法进行关联的成对比较,并且在计算汇总统计量时会忽略该比较。