There are more and more Bioconductor packages supporting single-cell data analysis. R Amezquita, A Lun, S Hicks, and R Gottardo wrote an integrated workflow, Orchestrating Single-Cell Analysis with Bioconductor, for single-cell data analysis and quality assessment. In this session, we will go through several important QC metrics which can’t be made with Seurat.
We can load the data in from 10X with the DropletUtils function, read10xCounts().
library(DropletUtils)
library(DropletTestFiles)
<- "~/filepath/toCellRanger/results"
fname <- read10xCounts(fname, col.names = TRUE)
sce <- colData(sce)$Barcode cellID
We have a small subset version of this dataset that you can load in from the data/ directory to try this out.
read10xCounts() reads the data in as a specialist object called a SingleCellExperiment.
sce
## class: SingleCellExperiment
## dim: 27998 100000
## metadata(1): Samples
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(100000): CGTGTCTTCGCATGAT-1 TTTATGCGTCGAACAG-1 ...
## TTATGCTGTGTTTGTG-1 GCTGCGACACGTTGGC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
The colData() and rowData() functions can be used to access experiment metadata.
# cell information
colData(sce)[1:2, ]
## DataFrame with 2 rows and 2 columns
## Sample Barcode
## <character> <character>
## CGTGTCTTCGCATGAT-1 ~/Desktop/01_scSeq_t.. CGTGTCTTCGCATGAT-1
## TTTATGCGTCGAACAG-1 ~/Desktop/01_scSeq_t.. TTTATGCGTCGAACAG-1
# gene information
rowData(sce)[1:2, ]
## DataFrame with 2 rows and 2 columns
## ID Symbol
## <character> <character>
## ENSMUSG00000051951 ENSMUSG00000051951 Xkr4
## ENSMUSG00000089699 ENSMUSG00000089699 Gm1992
<- barcodeRanks(counts(sce))
bcrank 1:2, ] bcrank[
## DataFrame with 2 rows and 3 columns
## rank total fitted
## <numeric> <integer> <numeric>
## CGTGTCTTCGCATGAT-1 41200.5 1 NA
## TTTATGCGTCGAACAG-1 41200.5 1 NA
Knee plot is a useful QC plot for single-cell seuqnecing. It reflects the threshold to validate cell for analysis. - x-axis is the cell barcodes (droplet) ranked by their UMI counts - y-axis is the UMI counts in each droplet - inflection point: the point while UMI counts start decreasing rapidly - knee point: the cut-off of UMI counts to differentiate cells valid for analysis
<- !duplicated(bcrank$rank)
uniq #
plot(bcrank$rank[uniq], bcrank$total[uniq], log = "xy", xlab = "Rank", ylab = "Total UMI count",
cex.lab = 1.2)
abline(h = metadata(bcrank)$inflection, col = "darkgreen", lty = 2)
abline(h = metadata(bcrank)$knee, col = "dodgerblue", lty = 2)
legend("bottomleft", legend = c("Inflection", "Knee"), col = c("darkgreen", "dodgerblue"),
lty = 2, cex = 1.2)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
The emptydrops function can be used to identify good/bad droplets. 2 keys arguments are: - limit: lowest counts per cell - test.ambient: Could be used to estimate ambient RNA contamination
set.seed(100)
<- 100
limit <- emptyDrops(counts(sce), lower = limit, test.ambient = TRUE)
e.out #
e.out
## DataFrame with 100000 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## CGTGTCTTCGCATGAT-1 1 -5.34170 0.70912909 FALSE NA
## TTTATGCGTCGAACAG-1 1 -6.44503 0.50414959 FALSE NA
## CATGACACAAGTCTGT-1 0 NA NA NA NA
## ACATACGCACCACCAG-1 58 -266.20222 0.01839816 FALSE NA
## CGTCCATAGCTGCAAG-1 1601 -2917.18710 0.00009999 TRUE 0.000200166
## ... ... ... ... ... ...
## GTGCTTCGTCACCCAG-1 0 NA NA NA NA
## GGAAAGCCAAGGCTCC-1 1 -9.06371 0.20007999 FALSE NA
## TGCCCTAGTAGCTCCG-1 1 -13.51757 0.00309969 FALSE NA
## TTATGCTGTGTTTGTG-1 0 NA NA NA NA
## GCTGCGACACGTTGGC-1 0 NA NA NA NA
# Testeed by FDR
summary(e.out$FDR <= 0.001)
## Mode FALSE TRUE NA's
## logical 1072 1080 97848
# Concordance by testing with FDR and limited
table(Sig = e.out$FDR <= 0.001, Limited = e.out$Limited)
## Limited
## Sig FALSE TRUE
## FALSE 1072 0
## TRUE 6 1074
We can plot the distribution of significance of non-empty reads
hist(e.out$PValue[e.out$Total <= limit & e.out$Total > 0], xlab = "P-value", main = "",
col = "grey80")
We can filter to just our non-empty droplets using a simple which query on the FDR from emptyDrops(). Here we are using a 0.001 cut-off.
<- sce[, which(e.out$FDR <= 0.001)] sce2
library(scran)
library(scuttle)
library(scater)
<- quickCluster(sce2)
clusters <- computeSumFactors(sce2, cluster = clusters)
sce2 <- logNormCounts(sce2)
sce2 sce2
## class: SingleCellExperiment
## dim: 27998 1080
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(1080): CGTCCATAGCTGCAAG-1 GCTCCTAAGGCACATG-1 ...
## ACTGAGTCACACCGAC-1 GAACGGAAGCTTATCG-1
## colData names(3): Sample Barcode sizeFactor
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
set.seed(1000)
# modeling variables
<- modelGeneVarByPoisson(sce2)
dec.pbmc # calcualte top features
<- getTopHVGs(dec.pbmc, prop = 0.1) top.pbmc
set.seed(1000)
# Evaluate PCs
<- denoisePCA(sce2, subset.row = top.pbmc, technical = dec.pbmc)
sce2 # make TSNE plot
<- runTSNE(sce2, dimred = "PCA")
sce2 # make UMAP plot
<- runUMAP(sce2, dimred = "PCA") sce2
<- buildSNNGraph(sce2, k = 10, use.dimred = "PCA")
g <- igraph::cluster_walktrap(g)$membership
clust colLabels(sce2) <- factor(clust)
#
colData(sce2)
## DataFrame with 1080 rows and 4 columns
## Sample Barcode sizeFactor
## <character> <character> <numeric>
## CGTCCATAGCTGCAAG-1 ~/Desktop/01_scSeq_t.. CGTCCATAGCTGCAAG-1 0.220604
## GCTCCTAAGGCACATG-1 ~/Desktop/01_scSeq_t.. GCTCCTAAGGCACATG-1 1.293722
## CGGACACAGTACGACG-1 ~/Desktop/01_scSeq_t.. CGGACACAGTACGACG-1 0.477037
## GTAACTGTCGGATGGA-1 ~/Desktop/01_scSeq_t.. GTAACTGTCGGATGGA-1 1.411181
## TGACTAGAGTAGGCCA-1 ~/Desktop/01_scSeq_t.. TGACTAGAGTAGGCCA-1 0.394843
## ... ... ... ...
## ACGATACCAATGCCAT-1 ~/Desktop/01_scSeq_t.. ACGATACCAATGCCAT-1 0.392299
## AGGTCATCATGTAAGA-1 ~/Desktop/01_scSeq_t.. AGGTCATCATGTAAGA-1 2.538163
## CCGGGATAGCCACCTG-1 ~/Desktop/01_scSeq_t.. CCGGGATAGCCACCTG-1 0.261304
## ACTGAGTCACACCGAC-1 ~/Desktop/01_scSeq_t.. ACTGAGTCACACCGAC-1 0.363430
## GAACGGAAGCTTATCG-1 ~/Desktop/01_scSeq_t.. GAACGGAAGCTTATCG-1 0.656362
## label
## <factor>
## CGTCCATAGCTGCAAG-1 4
## GCTCCTAAGGCACATG-1 3
## CGGACACAGTACGACG-1 5
## GTAACTGTCGGATGGA-1 3
## TGACTAGAGTAGGCCA-1 5
## ... ...
## ACGATACCAATGCCAT-1 2
## AGGTCATCATGTAAGA-1 3
## CCGGGATAGCCACCTG-1 2
## ACTGAGTCACACCGAC-1 1
## GAACGGAAGCTTATCG-1 3
plotUMAP(sce2, colour_by = "label")
plotTSNE(sce2, colour_by = "label")
# extrat potential ambient RNA and thee estimated score
<- metadata(e.out)$ambient[, 1]
amb head(amb)
## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903
## 5.808442e-07 5.808442e-07 1.067188e-04 6.783426e-05
## ENSMUSG00000033813 ENSMUSG00000002459
## 5.876210e-05 5.808442e-07
library(scater)
<- sce2[names(amb), ]
stripped <- removeAmbience(counts(stripped), ambient = amb, groups = colLabels(stripped)) out
counts(stripped, withDimnames = FALSE) <- out
<- logNormCounts(stripped) stripped
<- rowData(sce2)$ID[rowData(sce2)$Symbol == "Hba-a1"]
ensmbl_id plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before")
plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) +
ggtitle("After")
Hemoglobin A1 (Hba-a1) as example
<- rowData(sce2)$ID[rowData(sce2)$Symbol == "Krt17"]
ensmbl_id plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before")
plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) +
ggtitle("After")
Krt17 as example
<- modelGeneVar(stripped)
dec <- getTopHVGs(dec, n = 1000)
hvgs <- runPCA(stripped, ncomponents = 10, subset_row = hvgs)
stripped <- runUMAP(stripped, dimred = "PCA")
stripped <- buildSNNGraph(stripped, k = 10, use.dimred = "PCA")
g <- igraph::cluster_walktrap(g)$membership
clust colLabels(stripped) <- factor(clust)
plotUMAP(stripped, colour_by = "label")
Again we can save our results for later using a Rds object.
saveRDS(stripped, "data/scSeq_CTRL_sceSub_rmAmbRNA.rds")
The computeDoubletDensity() fucntion can be used on your SingleCellExperiment object to estimate doublets.
<- computeDoubletDensity(stripped, #subset.row=top.mam,
dbl.dens d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0324 0.5011 0.9396 1.0330 1.4796 4.5122
$DoubletScore <- dbl.dens stripped
We can project the doublet scores onto our UMPA to see if doublet correlates with the data distribution.
plotUMAP(stripped, colour_by = "DoubletScore")
Likewise we can look for over representation of doublets in clusters.
plotColData(stripped, x = "label", y = "DoubletScore", colour_by = "label") + geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,
0.95), lty = "dashed", color = "red")
- No clusters have significantly higher doublet scores than other clusters. No clusters would be removed. - Red dash line represented 95% quantile of doublet score. The cells with higher doublet score than this cut-off would be removed.
We can clean our data up based on this 95% quantile cut-off.
<- quantile(stripped$DoubletScore, 0.95)
cut_off $isDoublet <- c("no", "yes")[factor(as.integer(stripped$DoubletScore >= cut_off),
strippedlevels = c(0, 1))]
table(stripped$isDoublet)
##
## no yes
## 1025 55
<- stripped[stripped$isDoublet == "no", ]
sce_clean saveRDS(sce_clean, "data/scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")
library(scater)
<- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol, pattern = "mt-")]
mtGene <- names(sce_clean) %in% mtGene
is.mito <- addPerCellQC(sce_clean, subsets = list(Mito = is.mito)) sce_clean
Post-clean
plotColData(sce_clean, x = "label", y = "sum", colour_by = "label") + ggtitle("read counts")
Post-clean
plotColData(sce_clean, x = "label", y = "detected", colour_by = "label") + ggtitle("gene counts")
Post-clean
plotColData(sce_clean, x = "label", y = "subsets_Mito_percent", colour_by = "label") +
ggtitle("mitocondrial content")
plotColData(sce_clean, x = "sum", y = "subsets_Mito_percent", colour_by = "label") +
ggtitle("is.mito vs read counts")
plotColData(sce_clean, x = "sum", y = "detected", colour_by = "label") + ggtitle("gene counts vs read counts")
<- getVarianceExplained(sce_clean, variables = c("DoubletScore", "label", "sum",
vars "detected", "subsets_Mito_percent"))
plotExplanatoryVariables(vars)
plotExplanatoryVariables(vars)
## Warning: Removed 800 rows containing non-finite values (stat_density).