These exercises are about manipulate single-cell data with Bioconductor packages. Please download the counting matrix from DropBox and loading them into a Seurat object. Or you may also used the rds file data/scSeq_CKO_sceSub.rds.
library(DropletUtils)
library(DropletTestFiles)
fname <- "~path to the 10X counting matrix"
sce <- read10xCounts(fname, col.names=TRUE)
saveRDS(sce,"path to rds file")library(DropletUtils)
# library(DropletTestFiles)
sce <- readRDS("data/scSeq_CKO_sceSub.rds")bcrank <- barcodeRanks(counts(sce))
uniq <- !duplicated(bcrank$rank)
#
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
abline(h=metadata(bcrank)$inflection, col="darkgreen", lty=2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Inflection", "Knee"),
col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)set.seed(100)
limit <- 100
e.out <- emptyDrops(counts(sce),lower=limit, test.ambient=TRUE)
#
e.out## DataFrame with 400000 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## CCACGGAAGCTCTCGG-1 0 NA NA NA NA
## TGTGGTAAGAGTCGGT-1 2 -11.1237 0.7561244 FALSE NA
## CCCTCCTTCGTCTGCT-1 2 -18.2572 0.0727927 FALSE NA
## ACTGAGTGTGTCGCTG-1 0 NA NA NA NA
## CGTAGCGTCTCTGCTG-1 0 NA NA NA NA
## ... ... ... ... ... ...
## AGAGCTTGTCCGACGT-1 0 NA NA NA NA
## GCTGCGACAATAGCAA-1 0 NA NA NA NA
## CGAACATAGTGAAGTT-1 2 -12.3017 0.632437 FALSE NA
## TAGCCGGCATGTCGAT-1 0 NA NA NA NA
## CGAGAAGAGCAGATCG-1 0 NA NA NA NA
#
summary(e.out$FDR <= 0.001)## Mode FALSE TRUE NA's
## logical 3051 3943 393006
# Concordance by testing with FDR and limited
table(Sig=e.out$FDR <= 0.001, Limited=e.out$Limited)## Limited
## Sig FALSE TRUE
## FALSE 3051 0
## TRUE 62 3881
#
sce2 <- sce[,which(e.out$FDR <= 0.001)]library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster=clusters)## Warning in (function (x, sizes, min.mean = NULL, positive = FALSE, scaling =
## NULL) : encountered non-positive size factor estimates
sce2 <- logNormCounts(sce2)
#
set.seed(1000)
# modeling variables
dec.pbmc <- modelGeneVarByPoisson(sce2)
# calcualte top features
top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)
#
set.seed(1000)
# Evaluate PCs
sce2 <- denoisePCA(sce2, subset.row=top.pbmc, technical=dec.pbmc)
# make UMAP plot
sce2 <- runUMAP(sce2, dimred="PCA")
#
g <- buildSNNGraph(sce2, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
#
plotUMAP(sce2,colour_by="label")# evaluate ambinat RNA contamination in the empty droplets
amb <- metadata(e.out)$ambient[,1]
head(amb)## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903
## 1.669575e-07 1.669575e-07 1.407972e-04 5.201547e-05
## ENSMUSG00000104217 ENSMUSG00000033813
## 1.669575e-07 6.783772e-05
#
# remove ambient RNA
library(scater)
stripped <- sce2[names(amb),]
out <- removeAmbience(counts(stripped), ambient=amb,groups = colLabels(stripped))
#
# load correccted counts into scce object
counts(stripped, withDimnames=FALSE) <- out
stripped <- logNormCounts(stripped)
#
ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol=="Hba-a1"]
plotExpression(sce2, x="label", colour_by="label", features=ensmbl_id) +
ggtitle("Before")plotExpression(stripped, x="label", colour_by="label", features=ensmbl_id) +
ggtitle("After")set.seed(1000)
dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec,n=1000)
stripped <- runPCA(stripped, ncomponents=10, subset_row=hvgs)
stripped <- runUMAP(stripped, dimred="PCA")
g <- buildSNNGraph(stripped, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped,colour_by="label")dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam,
d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2445 0.6309 0.8732 1.2381 7.3498
stripped$DoubletScore <- dbl.dens
#
plotUMAP(stripped,colour_by="DoubletScore")#
plotColData(stripped, x="label", y="DoubletScore", colour_by="label")+
geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,0.95),lty="dashed",color="red")#
cut_off <- quantile(stripped$DoubletScore,0.95)
stripped$isDoublet <- c("no","yes")[factor(as.integer(stripped$DoubletScore>=cut_off),levels=c(0,1))]
table(stripped$isDoublet)##
## no yes
## 3744 199
sce_clean <- stripped[stripped$isDoublet=="no",]
# saveRDS(sce_clean,"data/scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")library(scater)
mtGene <- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol,pattern = "mt-")]
is.mito <- names(sce_clean) %in% mtGene
sce_clean <- addPerCellQC(sce_clean, subsets=list(Mito=is.mito))
#
plotColData(sce_clean,x="label", y="sum", colour_by="label")+ggtitle("read counts")plotColData(sce_clean,x="label", y="detected", colour_by="label")+ggtitle("gene counts")plotColData(sce_clean,x="label", y="subsets_Mito_percent", colour_by="label")+ggtitle("mitocondrial content")#
plotColData(sce_clean,x="sum",y="subsets_Mito_percent",colour_by="label")+ggtitle("is.mito vs read counts")plotColData(sce_clean,x="sum",y="detected",colour_by="label")+ggtitle("gene counts vs read counts")vars <- getVarianceExplained(sce_clean,
variables=c("DoubletScore","label","sum","detected","subsets_Mito_percent"))
plotExplanatoryVariables(vars)## Warning: Removed 905 rows containing non-finite values (stat_density).