These exercises are about manipulate single-cell data with Bioconductor packages. Please download the counting matrix from DropBox and loading them into a Seurat object. Or you may also used the rds file data/scSeq_CKO_sceSub.rds.
library(DropletUtils)
library(DropletTestFiles)
<- "~path to the 10X counting matrix"
fname <- read10xCounts(fname, col.names=TRUE)
sce saveRDS(sce,"path to rds file")
library(DropletUtils)
# library(DropletTestFiles)
<- readRDS("data/scSeq_CKO_sceSub.rds") sce
<- barcodeRanks(counts(sce))
bcrank <- !duplicated(bcrank$rank)
uniq #
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
abline(h=metadata(bcrank)$inflection, col="darkgreen", lty=2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Inflection", "Knee"),
col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)
set.seed(100)
<- 100
limit <- emptyDrops(counts(sce),lower=limit, test.ambient=TRUE)
e.out #
e.out
## DataFrame with 400000 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## CCACGGAAGCTCTCGG-1 0 NA NA NA NA
## TGTGGTAAGAGTCGGT-1 2 -11.1237 0.7561244 FALSE NA
## CCCTCCTTCGTCTGCT-1 2 -18.2572 0.0727927 FALSE NA
## ACTGAGTGTGTCGCTG-1 0 NA NA NA NA
## CGTAGCGTCTCTGCTG-1 0 NA NA NA NA
## ... ... ... ... ... ...
## AGAGCTTGTCCGACGT-1 0 NA NA NA NA
## GCTGCGACAATAGCAA-1 0 NA NA NA NA
## CGAACATAGTGAAGTT-1 2 -12.3017 0.632437 FALSE NA
## TAGCCGGCATGTCGAT-1 0 NA NA NA NA
## CGAGAAGAGCAGATCG-1 0 NA NA NA NA
#
summary(e.out$FDR <= 0.001)
## Mode FALSE TRUE NA's
## logical 3051 3943 393006
# Concordance by testing with FDR and limited
table(Sig=e.out$FDR <= 0.001, Limited=e.out$Limited)
## Limited
## Sig FALSE TRUE
## FALSE 3051 0
## TRUE 62 3881
#
<- sce[,which(e.out$FDR <= 0.001)] sce2
library(scran)
library(scuttle)
library(scater)
<- quickCluster(sce2)
clusters <- computeSumFactors(sce2, cluster=clusters) sce2
## Warning in (function (x, sizes, min.mean = NULL, positive = FALSE, scaling =
## NULL) : encountered non-positive size factor estimates
<- logNormCounts(sce2)
sce2 #
set.seed(1000)
# modeling variables
<- modelGeneVarByPoisson(sce2)
dec.pbmc # calcualte top features
<- getTopHVGs(dec.pbmc, prop=0.1)
top.pbmc #
set.seed(1000)
# Evaluate PCs
<- denoisePCA(sce2, subset.row=top.pbmc, technical=dec.pbmc)
sce2 # make UMAP plot
<- runUMAP(sce2, dimred="PCA")
sce2 #
<- buildSNNGraph(sce2, k=10, use.dimred = 'PCA')
g <- igraph::cluster_walktrap(g)$membership
clust colLabels(sce2) <- factor(clust)
#
plotUMAP(sce2,colour_by="label")
# evaluate ambinat RNA contamination in the empty droplets
<- metadata(e.out)$ambient[,1]
amb head(amb)
## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903
## 1.669575e-07 1.669575e-07 1.407972e-04 5.201547e-05
## ENSMUSG00000104217 ENSMUSG00000033813
## 1.669575e-07 6.783772e-05
#
# remove ambient RNA
library(scater)
<- sce2[names(amb),]
stripped <- removeAmbience(counts(stripped), ambient=amb,groups = colLabels(stripped))
out #
# load correccted counts into scce object
counts(stripped, withDimnames=FALSE) <- out
<- logNormCounts(stripped)
stripped #
<- rowData(sce2)$ID[rowData(sce2)$Symbol=="Hba-a1"]
ensmbl_id plotExpression(sce2, x="label", colour_by="label", features=ensmbl_id) +
ggtitle("Before")
plotExpression(stripped, x="label", colour_by="label", features=ensmbl_id) +
ggtitle("After")
set.seed(1000)
<- modelGeneVar(stripped)
dec <- getTopHVGs(dec,n=1000)
hvgs <- runPCA(stripped, ncomponents=10, subset_row=hvgs)
stripped <- runUMAP(stripped, dimred="PCA")
stripped <- buildSNNGraph(stripped, k=10, use.dimred = 'PCA')
g <- igraph::cluster_walktrap(g)$membership
clust colLabels(stripped) <- factor(clust)
plotUMAP(stripped,colour_by="label")
<- computeDoubletDensity(stripped, #subset.row=top.mam,
dbl.dens d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2445 0.6309 0.8732 1.2381 7.3498
$DoubletScore <- dbl.dens
stripped#
plotUMAP(stripped,colour_by="DoubletScore")
#
plotColData(stripped, x="label", y="DoubletScore", colour_by="label")+
geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,0.95),lty="dashed",color="red")
#
<- quantile(stripped$DoubletScore,0.95)
cut_off $isDoublet <- c("no","yes")[factor(as.integer(stripped$DoubletScore>=cut_off),levels=c(0,1))]
strippedtable(stripped$isDoublet)
##
## no yes
## 3744 199
<- stripped[stripped$isDoublet=="no",]
sce_clean # saveRDS(sce_clean,"data/scSeq_CTRL_sceSub_rmAmbRNA_rmDoublet.rds")
library(scater)
<- rowData(sce_clean)$ID[grepl(rowData(sce_clean)$Symbol,pattern = "mt-")]
mtGene <- names(sce_clean) %in% mtGene
is.mito <- addPerCellQC(sce_clean, subsets=list(Mito=is.mito))
sce_clean #
plotColData(sce_clean,x="label", y="sum", colour_by="label")+ggtitle("read counts")
plotColData(sce_clean,x="label", y="detected", colour_by="label")+ggtitle("gene counts")
plotColData(sce_clean,x="label", y="subsets_Mito_percent", colour_by="label")+ggtitle("mitocondrial content")
#
plotColData(sce_clean,x="sum",y="subsets_Mito_percent",colour_by="label")+ggtitle("is.mito vs read counts")
plotColData(sce_clean,x="sum",y="detected",colour_by="label")+ggtitle("gene counts vs read counts")
<- getVarianceExplained(sce_clean,
vars variables=c("DoubletScore","label","sum","detected","subsets_Mito_percent"))
plotExplanatoryVariables(vars)
## Warning: Removed 905 rows containing non-finite values (stat_density).