There are more and more Bioconductor packages supporting single-cell data analysis. R Amezquita, A Lun, S Hicks, and R Gottardo wrote an integrated workflow, Orchestrating Single-Cell Analysis with Bioconductor, for single-cell data analysis and quality assessment. In this session, we will go through several important QC metrics which can’t be made with Seurat.
We can load the data in from 10X with the DropletUtils function, read10xCounts().
library(DropletUtils)
library(DropletTestFiles)
fname <- "~/filepath/toCellRanger/results"
sce <- read10xCounts(fname, col.names = TRUE)
cellID <- colData(sce)$BarcodeWe have a small subset version of this dataset that you can load in from the data/ directory to try this out.
read10xCounts() reads the data in as a specialist object called a SingleCellExperiment.
sce## class: SingleCellExperiment
## dim: 27998 100000
## metadata(1): Samples
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(100000): CGTGTCTTCGCATGAT-1 TTTATGCGTCGAACAG-1 ...
## TTATGCTGTGTTTGTG-1 GCTGCGACACGTTGGC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
The colData() and rowData() functions can be used to access experiment metadata.
# cell information
colData(sce)[1:2, ]## DataFrame with 2 rows and 2 columns
## Sample Barcode
## <character> <character>
## CGTGTCTTCGCATGAT-1 ~/Desktop/01_scSeq_t.. CGTGTCTTCGCATGAT-1
## TTTATGCGTCGAACAG-1 ~/Desktop/01_scSeq_t.. TTTATGCGTCGAACAG-1
# gene information
rowData(sce)[1:2, ]## DataFrame with 2 rows and 2 columns
## ID Symbol
## <character> <character>
## ENSMUSG00000051951 ENSMUSG00000051951 Xkr4
## ENSMUSG00000089699 ENSMUSG00000089699 Gm1992
bcrank <- barcodeRanks(counts(sce))
bcrank[1:2, ]## DataFrame with 2 rows and 3 columns
## rank total fitted
## <numeric> <integer> <numeric>
## CGTGTCTTCGCATGAT-1 41200.5 1 NA
## TTTATGCGTCGAACAG-1 41200.5 1 NA
Knee plot is a useful QC plot for single-cell seuqnecing. It reflects the threshold to validate cell for analysis. - x-axis is the cell barcodes (droplet) ranked by their UMI counts - y-axis is the UMI counts in each droplet - inflection point: the point while UMI counts start decreasing rapidly - knee point: the cut-off of UMI counts to differentiate cells valid for analysis
uniq <- !duplicated(bcrank$rank)
#
plot(bcrank$rank[uniq], bcrank$total[uniq], log = "xy", xlab = "Rank", ylab = "Total UMI count",
cex.lab = 1.2)
abline(h = metadata(bcrank)$inflection, col = "darkgreen", lty = 2)
abline(h = metadata(bcrank)$knee, col = "dodgerblue", lty = 2)
legend("bottomleft", legend = c("Inflection", "Knee"), col = c("darkgreen", "dodgerblue"),
lty = 2, cex = 1.2)## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
The emptydrops function can be used to identify good/bad droplets. 2 keys arguments are: - limit: lowest counts per cell - test.ambient: Could be used to estimate ambient RNA contamination
set.seed(100)
limit <- 100
e.out <- emptyDrops(counts(sce), lower = limit, test.ambient = TRUE)
#
e.out## DataFrame with 100000 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## CGTGTCTTCGCATGAT-1 1 -5.34170 0.70912909 FALSE NA
## TTTATGCGTCGAACAG-1 1 -6.44503 0.50414959 FALSE NA
## CATGACACAAGTCTGT-1 0 NA NA NA NA
## ACATACGCACCACCAG-1 58 -266.20222 0.01839816 FALSE NA
## CGTCCATAGCTGCAAG-1 1601 -2917.18710 0.00009999 TRUE 0.000200166
## ... ... ... ... ... ...
## GTGCTTCGTCACCCAG-1 0 NA NA NA NA
## GGAAAGCCAAGGCTCC-1 1 -9.06371 0.20007999 FALSE NA
## TGCCCTAGTAGCTCCG-1 1 -13.51757 0.00309969 FALSE NA
## TTATGCTGTGTTTGTG-1 0 NA NA NA NA
## GCTGCGACACGTTGGC-1 0 NA NA NA NA
# Testeed by FDR
summary(e.out$FDR <= 0.001)## Mode FALSE TRUE NA's
## logical 1072 1080 97848
# Concordance by testing with FDR and limited
table(Sig = e.out$FDR <= 0.001, Limited = e.out$Limited)## Limited
## Sig FALSE TRUE
## FALSE 1072 0
## TRUE 6 1074
We can plot the distribution of significance of non-empty reads
hist(e.out$PValue[e.out$Total <= limit & e.out$Total > 0], xlab = "P-value", main = "",
col = "grey80")We can filter to just our non-empty droplets using a simple which query on the FDR from emptyDrops(). Here we are using a 0.001 cut-off.
sce2 <- sce[, which(e.out$FDR <= 0.001)]library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster = clusters)
sce2 <- logNormCounts(sce2)
sce2## class: SingleCellExperiment
## dim: 27998 1080
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(1080): CGTCCATAGCTGCAAG-1 GCTCCTAAGGCACATG-1 ...
## ACTGAGTCACACCGAC-1 GAACGGAAGCTTATCG-1
## colData names(3): Sample Barcode sizeFactor
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
set.seed(1000)
# modeling variables
dec.pbmc <- modelGeneVarByPoisson(sce2)
# calcualte top features
top.pbmc <- getTopHVGs(dec.pbmc, prop = 0.1)set.seed(1000)
# Evaluate PCs
sce2 <- denoisePCA(sce2, subset.row = top.pbmc, technical = dec.pbmc)
# make TSNE plot
sce2 <- runTSNE(sce2, dimred = "PCA")
# make UMAP plot
sce2 <- runUMAP(sce2, dimred = "PCA")g <- buildSNNGraph(sce2, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
#
colData(sce2)## DataFrame with 1080 rows and 4 columns
## Sample Barcode sizeFactor
## <character> <character> <numeric>
## CGTCCATAGCTGCAAG-1 ~/Desktop/01_scSeq_t.. CGTCCATAGCTGCAAG-1 0.220604
## GCTCCTAAGGCACATG-1 ~/Desktop/01_scSeq_t.. GCTCCTAAGGCACATG-1 1.293722
## CGGACACAGTACGACG-1 ~/Desktop/01_scSeq_t.. CGGACACAGTACGACG-1 0.477037
## GTAACTGTCGGATGGA-1 ~/Desktop/01_scSeq_t.. GTAACTGTCGGATGGA-1 1.411181
## TGACTAGAGTAGGCCA-1 ~/Desktop/01_scSeq_t.. TGACTAGAGTAGGCCA-1 0.394843
## ... ... ... ...
## ACGATACCAATGCCAT-1 ~/Desktop/01_scSeq_t.. ACGATACCAATGCCAT-1 0.392299
## AGGTCATCATGTAAGA-1 ~/Desktop/01_scSeq_t.. AGGTCATCATGTAAGA-1 2.538163
## CCGGGATAGCCACCTG-1 ~/Desktop/01_scSeq_t.. CCGGGATAGCCACCTG-1 0.261304
## ACTGAGTCACACCGAC-1 ~/Desktop/01_scSeq_t.. ACTGAGTCACACCGAC-1 0.363430
## GAACGGAAGCTTATCG-1 ~/Desktop/01_scSeq_t.. GAACGGAAGCTTATCG-1 0.656362
## label
## <factor>
## CGTCCATAGCTGCAAG-1 4
## GCTCCTAAGGCACATG-1 3
## CGGACACAGTACGACG-1 5
## GTAACTGTCGGATGGA-1 3
## TGACTAGAGTAGGCCA-1 5
## ... ...
## ACGATACCAATGCCAT-1 2
## AGGTCATCATGTAAGA-1 3
## CCGGGATAGCCACCTG-1 2
## ACTGAGTCACACCGAC-1 1
## GAACGGAAGCTTATCG-1 3
plotUMAP(sce2, colour_by = "label")plotTSNE(sce2, colour_by = "label")# extrat potential ambient RNA and thee estimated score
amb <- metadata(e.out)$ambient[, 1]
head(amb)## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903
## 5.808442e-07 5.808442e-07 1.067188e-04 6.783426e-05
## ENSMUSG00000033813 ENSMUSG00000002459
## 5.876210e-05 5.808442e-07
library(scater)
stripped <- sce2[names(amb), ]
out <- removeAmbience(counts(stripped), ambient = amb, groups = colLabels(stripped))counts(stripped, withDimnames = FALSE) <- out
stripped <- logNormCounts(stripped)ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Hba-a1"]
plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before")
plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) +
ggtitle("After")Hemoglobin A1 (Hba-a1) as example
ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Krt17"]
plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before")
plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) +
ggtitle("After")Krt17 as example
dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec, n = 1000)
stripped <- runPCA(stripped, ncomponents = 10, subset_row = hvgs)
stripped <- runUMAP(stripped, dimred = "PCA")
g <- buildSNNGraph(stripped, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped, colour_by = "label")Again we can save our results for later using a Rds object.
saveRDS(stripped, "data/scSeq_CTRL_sceSub_rmAmbRNA.rds")The computeDoubletDensity() fucntion can be used on your SingleCellExperiment object to estimate doublets.
dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam,
d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0324 0.5011 0.9396 1.0330 1.4796 4.5122
stripped$DoubletScore <- dbl.densWe can project the doublet scores onto our UMPA to see if doublet correlates with the data distribution.
plotUMAP(stripped, colour_by = "DoubletScore")Likewise we can look for over representation of doublets in clusters.
plotColData(stripped, x = "label", y = "DoubletScore", colour_by = "label") + geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,
0.95), lty = "dashed", color = "red") - No clusters have significantly higher doublet scores than other clusters. No clusters would be removed. - Red dash line represented 95% quantile of doublet score. The cells with higher doublet score than this cut-off would be removed.
We can clean our data up based on this 95% quantile cut-off.
cut_off <- quantile(stripped$DoubletScore, 0.95)
stripped$isDoublet <- c("no", "yes")[factor(as.integer(stripped$DoubletScore >= cut_off),
levels = c(0, 1))]
table(stripped$isDoublet)##
## no yes
## 1025 55
sce_clean <- stripped[stripped$isDoublet == "no", ]
saveRDS(sce_clean, "data/scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")library(scater)
mtGene <- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol, pattern = "mt-")]
is.mito <- names(sce_clean) %in% mtGene
sce_clean <- addPerCellQC(sce_clean, subsets = list(Mito = is.mito))Post-clean
plotColData(sce_clean, x = "label", y = "sum", colour_by = "label") + ggtitle("read counts")Post-clean
plotColData(sce_clean, x = "label", y = "detected", colour_by = "label") + ggtitle("gene counts")Post-clean
plotColData(sce_clean, x = "label", y = "subsets_Mito_percent", colour_by = "label") +
ggtitle("mitocondrial content")plotColData(sce_clean, x = "sum", y = "subsets_Mito_percent", colour_by = "label") +
ggtitle("is.mito vs read counts")plotColData(sce_clean, x = "sum", y = "detected", colour_by = "label") + ggtitle("gene counts vs read counts")vars <- getVarianceExplained(sce_clean, variables = c("DoubletScore", "label", "sum",
"detected", "subsets_Mito_percent"))plotExplanatoryVariables(vars)plotExplanatoryVariables(vars)## Warning: Removed 800 rows containing non-finite values (stat_density).