Exercise 2 - Bioconductor

These exercises are about manipulate single-cell data with Bioconductor packages. Please download the counting matrix from DropBox and loading them into a Seurat object. Or you may also used the rds file data/scSeq_CKO_sceSub.rds.

If you want to load the whole dataset

library(DropletUtils)
library(DropletTestFiles)
fname <- "~path to the 10X counting matrix"
sce <- read10xCounts(fname, col.names=TRUE)
saveRDS(sce,"path to rds file")

If you want to load a subset of the data

library(DropletUtils)
# library(DropletTestFiles)
sce <- readRDS("data/scSeq_CKO_sceSub.rds")

Exercise 2 - Data manipulation with Bioconductor packages

Identify empty droplets

Draw a knee plot and identify inflection point and knee point.

bcrank <- barcodeRanks(counts(sce))
uniq <- !duplicated(bcrank$rank)
#
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
    xlab="Rank", ylab="Total UMI count", cex.lab=1.2)

## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot

abline(h=metadata(bcrank)$inflection, col="darkgreen", lty=2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)

legend("bottomleft", legend=c("Inflection", "Knee"), 
        col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)

Identify non-empty droplets and compare to the results with hard cut-off

set limit as “100” (filter out droplets with UMI counts less than 100)
FDR cut-off for non-empty droplet is 0.001

set.seed(100)
limit <- 100   
e.out <- emptyDrops(counts(sce),lower=limit, test.ambient=TRUE)
#
e.out

## DataFrame with 400000 rows and 5 columns
##                        Total   LogProb    PValue   Limited       FDR
##                    <integer> <numeric> <numeric> <logical> <numeric>
## CCACGGAAGCTCTCGG-1         0        NA        NA        NA        NA
## TGTGGTAAGAGTCGGT-1         2  -11.1237 0.7561244     FALSE        NA
## CCCTCCTTCGTCTGCT-1         2  -18.2572 0.0727927     FALSE        NA
## ACTGAGTGTGTCGCTG-1         0        NA        NA        NA        NA
## CGTAGCGTCTCTGCTG-1         0        NA        NA        NA        NA
## ...                      ...       ...       ...       ...       ...
## AGAGCTTGTCCGACGT-1         0        NA        NA        NA        NA
## GCTGCGACAATAGCAA-1         0        NA        NA        NA        NA
## CGAACATAGTGAAGTT-1         2  -12.3017  0.632437     FALSE        NA
## TAGCCGGCATGTCGAT-1         0        NA        NA        NA        NA
## CGAGAAGAGCAGATCG-1         0        NA        NA        NA        NA

#
summary(e.out$FDR <= 0.001)

##    Mode   FALSE    TRUE    NA's 
## logical    3051    3943  393006

# Concordance by testing with FDR and limited
table(Sig=e.out$FDR <= 0.001, Limited=e.out$Limited)

##        Limited
## Sig     FALSE TRUE
##   FALSE  3051    0
##   TRUE     62 3881

#
sce2 <- sce[,which(e.out$FDR <= 0.001)]

Data normalization and cluster data

Normalize and cluster now that empty droplets have been removed.

Normalize and log transform the counts
Use modeling to identify top variable features
Make UMAP plot
Graphic based clustering

library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster=clusters)

## Warning in (function (x, sizes, min.mean = NULL, positive = FALSE, scaling =
## NULL) : encountered non-positive size factor estimates

sce2 <- logNormCounts(sce2)
#
set.seed(1000)
# modeling variables
dec.pbmc <- modelGeneVarByPoisson(sce2)
# calcualte top features
top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)
#
set.seed(1000)
# Evaluate PCs
sce2 <- denoisePCA(sce2, subset.row=top.pbmc, technical=dec.pbmc)
# make UMAP plot
sce2 <- runUMAP(sce2, dimred="PCA")
#
g <- buildSNNGraph(sce2, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
#
plotUMAP(sce2,colour_by="label")

Evaluate ambient RNA contamination

Please estimate ambient RNA contamination, remove contaminants, and validate by using the Hba-a1 gene.

# evaluate ambinat RNA contamination in the empty droplets
amb <- metadata(e.out)$ambient[,1]
head(amb)

## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903 
##       1.669575e-07       1.669575e-07       1.407972e-04       5.201547e-05 
## ENSMUSG00000104217 ENSMUSG00000033813 
##       1.669575e-07       6.783772e-05

#
# remove ambient RNA 
library(scater)
stripped <- sce2[names(amb),]
out <- removeAmbience(counts(stripped), ambient=amb,groups = colLabels(stripped))
#
# load correccted counts into scce object
counts(stripped, withDimnames=FALSE) <- out
stripped <- logNormCounts(stripped)
#
ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol=="Hba-a1"]
plotExpression(sce2, x="label", colour_by="label", features=ensmbl_id) +
        ggtitle("Before")

plotExpression(stripped, x="label", colour_by="label", features=ensmbl_id) + 
        ggtitle("After")

Re-cluster data after ambient RNA removal

set.seed(1000)
dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec,n=1000)
stripped <- runPCA(stripped, ncomponents=10, subset_row=hvgs)
stripped <- runUMAP(stripped, dimred="PCA")
g <- buildSNNGraph(stripped, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped,colour_by="label")

Remove doublets

Please estimate doublets and evaluate if the doublets were enriched in any clusters or not. Then try to remove the doublet cells/clusters.

dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam, 
    d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2445  0.6309  0.8732  1.2381  7.3498

stripped$DoubletScore <- dbl.dens
#
plotUMAP(stripped,colour_by="DoubletScore")

#
plotColData(stripped, x="label", y="DoubletScore", colour_by="label")+
  geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,0.95),lty="dashed",color="red")

#
cut_off <- quantile(stripped$DoubletScore,0.95)
stripped$isDoublet <- c("no","yes")[factor(as.integer(stripped$DoubletScore>=cut_off),levels=c(0,1))]
table(stripped$isDoublet)

## 
##   no  yes 
## 3744  199

sce_clean <- stripped[stripped$isDoublet=="no",]
# saveRDS(sce_clean,"data/scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")

Advanced QC plots

Please estimate mitochondrial contents (is.mito), read counts (sum) and gene counts (detected) for each cell. Then, draw plots.

violin plots of is.mito, sum, and detected
scatter plots for is.mito vs sum and detected vs sum

library(scater)
mtGene <- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol,pattern = "mt-")]
is.mito <- names(sce_clean) %in% mtGene
sce_clean <- addPerCellQC(sce_clean, subsets=list(Mito=is.mito))
#
plotColData(sce_clean,x="label", y="sum", colour_by="label")+ggtitle("read counts")

plotColData(sce_clean,x="label", y="detected", colour_by="label")+ggtitle("gene counts")

plotColData(sce_clean,x="label", y="subsets_Mito_percent", colour_by="label")+ggtitle("mitocondrial content")

#
plotColData(sce_clean,x="sum",y="subsets_Mito_percent",colour_by="label")+ggtitle("is.mito vs read counts")

plotColData(sce_clean,x="sum",y="detected",colour_by="label")+ggtitle("gene counts vs read counts")

Estimate variance explanation and find the factor that contributes to the majority of variance.

vars <- getVarianceExplained(sce_clean, 
    variables=c("DoubletScore","label","sum","detected","subsets_Mito_percent"))
plotExplanatoryVariables(vars)

## Warning: Removed 905 rows containing non-finite values (stat_density).