These exercises are about manipulate single-cell data with Bioconductor packages. Please download the counting matrix from DropBox and loading them into a Seurat object. Or you may also used the rds file data/scSeq_CKO_sceSub.rds.

If you want to load the whole dataset

library(DropletUtils)
library(DropletTestFiles)
fname <- "~path to the 10X counting matrix"
sce <- read10xCounts(fname, col.names=TRUE)
saveRDS(sce,"path to rds file")

If you want to load a subset of the data

library(DropletUtils)
# library(DropletTestFiles)
sce <- readRDS("data/scSeq_CKO_sceSub.rds")

Exercise 2 - Data manipulation with Bioconductor packages

Identify empty droplets

  1. Draw a knee plot and identify inflection point and knee point.
bcrank <- barcodeRanks(counts(sce))
uniq <- !duplicated(bcrank$rank)
#
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
    xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
abline(h=metadata(bcrank)$inflection, col="darkgreen", lty=2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)

legend("bottomleft", legend=c("Inflection", "Knee"), 
        col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)

  1. Identify non-empty droplets and compare to the results with hard cut-off
  • set limit as “100” (filter out droplets with UMI counts less than 100)
  • FDR cut-off for non-empty droplet is 0.001
set.seed(100)
limit <- 100   
e.out <- emptyDrops(counts(sce),lower=limit, test.ambient=TRUE)
#
e.out
## DataFrame with 400000 rows and 5 columns
##                        Total   LogProb    PValue   Limited       FDR
##                    <integer> <numeric> <numeric> <logical> <numeric>
## CCACGGAAGCTCTCGG-1         0        NA        NA        NA        NA
## TGTGGTAAGAGTCGGT-1         2  -11.1237 0.7561244     FALSE        NA
## CCCTCCTTCGTCTGCT-1         2  -18.2572 0.0727927     FALSE        NA
## ACTGAGTGTGTCGCTG-1         0        NA        NA        NA        NA
## CGTAGCGTCTCTGCTG-1         0        NA        NA        NA        NA
## ...                      ...       ...       ...       ...       ...
## AGAGCTTGTCCGACGT-1         0        NA        NA        NA        NA
## GCTGCGACAATAGCAA-1         0        NA        NA        NA        NA
## CGAACATAGTGAAGTT-1         2  -12.3017  0.632437     FALSE        NA
## TAGCCGGCATGTCGAT-1         0        NA        NA        NA        NA
## CGAGAAGAGCAGATCG-1         0        NA        NA        NA        NA
#
summary(e.out$FDR <= 0.001)
##    Mode   FALSE    TRUE    NA's 
## logical    3051    3943  393006
# Concordance by testing with FDR and limited
table(Sig=e.out$FDR <= 0.001, Limited=e.out$Limited)
##        Limited
## Sig     FALSE TRUE
##   FALSE  3051    0
##   TRUE     62 3881
#
sce2 <- sce[,which(e.out$FDR <= 0.001)]

Data normalization and cluster data

  1. Normalize and cluster now that empty droplets have been removed.
library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster=clusters)
## Warning in (function (x, sizes, min.mean = NULL, positive = FALSE, scaling =
## NULL) : encountered non-positive size factor estimates
sce2 <- logNormCounts(sce2)
#
set.seed(1000)
# modeling variables
dec.pbmc <- modelGeneVarByPoisson(sce2)
# calcualte top features
top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)
#
set.seed(1000)
# Evaluate PCs
sce2 <- denoisePCA(sce2, subset.row=top.pbmc, technical=dec.pbmc)
# make UMAP plot
sce2 <- runUMAP(sce2, dimred="PCA")
#
g <- buildSNNGraph(sce2, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
#
plotUMAP(sce2,colour_by="label")

Evaluate ambient RNA contamination

  1. Please estimate ambient RNA contamination, remove contaminants, and validate by using the Hba-a1 gene.
# evaluate ambinat RNA contamination in the empty droplets
amb <- metadata(e.out)$ambient[,1]
head(amb)
## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903 
##       1.669575e-07       1.669575e-07       1.407972e-04       5.201547e-05 
## ENSMUSG00000104217 ENSMUSG00000033813 
##       1.669575e-07       6.783772e-05
#
# remove ambient RNA 
library(scater)
stripped <- sce2[names(amb),]
out <- removeAmbience(counts(stripped), ambient=amb,groups = colLabels(stripped))
#
# load correccted counts into scce object
counts(stripped, withDimnames=FALSE) <- out
stripped <- logNormCounts(stripped)
#
ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol=="Hba-a1"]
plotExpression(sce2, x="label", colour_by="label", features=ensmbl_id) +
        ggtitle("Before")

plotExpression(stripped, x="label", colour_by="label", features=ensmbl_id) + 
        ggtitle("After")

  1. Re-cluster data after ambient RNA removal
set.seed(1000)
dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec,n=1000)
stripped <- runPCA(stripped, ncomponents=10, subset_row=hvgs)
stripped <- runUMAP(stripped, dimred="PCA")
g <- buildSNNGraph(stripped, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped,colour_by="label")

Remove doublets

  1. Please estimate doublets and evaluate if the doublets were enriched in any clusters or not. Then try to remove the doublet cells/clusters.
dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam, 
    d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2445  0.6309  0.8732  1.2381  7.3498
stripped$DoubletScore <- dbl.dens
#
plotUMAP(stripped,colour_by="DoubletScore")

#
plotColData(stripped, x="label", y="DoubletScore", colour_by="label")+
  geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,0.95),lty="dashed",color="red")

#
cut_off <- quantile(stripped$DoubletScore,0.95)
stripped$isDoublet <- c("no","yes")[factor(as.integer(stripped$DoubletScore>=cut_off),levels=c(0,1))]
table(stripped$isDoublet)
## 
##   no  yes 
## 3744  199
sce_clean <- stripped[stripped$isDoublet=="no",]
# saveRDS(sce_clean,"data/scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")

Advanced QC plots

  1. Please estimate mitochondrial contents (is.mito), read counts (sum) and gene counts (detected) for each cell. Then, draw plots.
  • violin plots of is.mito, sum, and detected
  • scatter plots for is.mito vs sum and detected vs sum
library(scater)
mtGene <- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol,pattern = "mt-")]
is.mito <- names(sce_clean) %in% mtGene
sce_clean <- addPerCellQC(sce_clean, subsets=list(Mito=is.mito))
#
plotColData(sce_clean,x="label", y="sum", colour_by="label")+ggtitle("read counts")

plotColData(sce_clean,x="label", y="detected", colour_by="label")+ggtitle("gene counts")

plotColData(sce_clean,x="label", y="subsets_Mito_percent", colour_by="label")+ggtitle("mitocondrial content")

#
plotColData(sce_clean,x="sum",y="subsets_Mito_percent",colour_by="label")+ggtitle("is.mito vs read counts")

plotColData(sce_clean,x="sum",y="detected",colour_by="label")+ggtitle("gene counts vs read counts")

  1. Estimate variance explanation and find the factor that contributes to the majority of variance.
vars <- getVarianceExplained(sce_clean, 
    variables=c("DoubletScore","label","sum","detected","subsets_Mito_percent"))
plotExplanatoryVariables(vars)
## Warning: Removed 905 rows containing non-finite values (stat_density).