Â
These exercises are about the simple scRNAseq workflow in session 2.
We will use data from this study on renal cancer from human cancer.
The full dataset is on ArrayExpress. We will use just one sample for now: 4834STDY7002881.
You can also download it from DropBox.
Exercise 1 - Cell Ranger
Exercise 2 - Create a Seurat Object
library(Seurat)
mtx <- Read10X("~/Downloads/4834STDY7002881/GRCh38/")
seu <- CreateSeuratObject(mtx, project="kidney_F6", min.cells=10, min.features=200)
seu[["dset"]] <- "kidney_F6"
seu <- Seurat::RenameCells(seu, add.cell.id="kidney_F6")
seu <- NormalizeData(seu, normalization.method="LogNormalize")
seu <- FindVariableFeatures(seu, select.method="vst", nfeatures=3000)
seu <- ScaleData(seu)
top10 <- head(VariableFeatures(seu), 10)
temp_plot <- VariableFeaturePlot(seu)
plot2 <- LabelPoints(plot = temp_plot, points = top10, repel = TRUE)
plot2
library(dplyr)
seu <- RunPCA(seu, assay = "RNA", npcs = 50)
pct <- seu[["pca"]]@stdev / sum(seu[["pca"]]@stdev) * 100
cumu <- cumsum(pct)
co1 <- which(cumu > 90 & pct < 5)[1]
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
pc <- min(co1, co2)
plot_df <- data.frame(pct = pct, cumu = cumu,rank = 1:length(pct))
my_elbow <- ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pc)) + geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
theme_bw()
my_elbow
seu <- FindNeighbors(seu,dims = 1:pc, reduction = "pca")
seu <- RunUMAP(seu,dims = 1:pc, reduction = "pca")
seu <- FindClusters(seu, resolution = 0.5)
seu[["cluster_byDefault"]] <- seu$seurat_clusters
umap_1 <- DimPlot(seu)
umap_1
Exercise 3 - Basic QC
vln_1 <- VlnPlot(seu,
features = c("nCount_RNA","nFeature_RNA"))
vln_1
feat_1 <- FeaturePlot(seu,
features = c("nCount_RNA","nFeature_RNA"))
feat_1
seu$percent.mt <- PercentageFeatureSet(seu, pattern = "^MT-")
vln_2 <- VlnPlot(seu,"percent.mt", group.by ="dset")
vln_2
feat_2 <- FeaturePlot(seu,"percent.mt") + scale_colour_gradient(low="gray",high="blue")
feat_2
feat_3 <- FeatureScatter(seu, feature1 = "nCount_RNA", feature2 = "percent.mt")
feat_3
tab<-table(seu$percent.mt>10)
tab
##
## FALSE TRUE
## 1028 21
feat_s <- cc.genes$s.genes
feat_g2m <- cc.genes$g2m.genes
seu <- CellCycleScoring(seu, s.features = feat_s, g2m.features = feat_g2m)
umap_2 <- DimPlot(seu, group.by = "Phase")
umap_2
library(scran)
sce <- as.SingleCellExperiment(seu, assay = "RNA")
rowData(sce)$SYMBOL <- rownames(sce)
load("data/ccGene_mouse_human_geneSymbol_ensemblID_20220817.RData")
ccGene_hs <- ccGene_mm_hs$human_symbol
assignments <- cyclone(sce, ccGene_hs, gene.names=rowData(sce)$SYMBOL)
seu[["cyclone_Phase"]] <- assignments$phases
seu[["cyclone_G1Score"]] <- assignments$scores$G1
seu[["cyclone_SScore"]] <- assignments$scores$S
seu[["cyclone_G2MScore"]] <- assignments$scores$G2M
umap_3 <- DimPlot(seu, group.by = "cyclone_Phase")
umap_3
scr <- reticulate::import("scrublet")
mat <- GetAssayData(seu, assay = "RNA", slot = "counts")
mat <- as.matrix(mat)
scrub <- scr$Scrublet(t(mat))
doublet <- scrub$scrub_doublets()
names(doublet) <- c("doublet_score","doublet")
seu[["doublet_score"]] <- doublet$doublet_score
seu[["doublet"]] <- doublet$doublet
vln_3 <- VlnPlot(seu, group.by = "doublet",
features = c("doublet_score", "nCount_RNA","nFeature_RNA"),
pt.size = 0)
vln_3
umap_4 <- DimPlot(seu, group.by = "doublet")
umap_4
Exercise 4 - Subset and Regress
seu_filt <- subset(seu, subset=doublet=="FALSE" &
percent.mt < 10)
# These cells are from a cancer study so there's a good chance there's a high proportion of cycling cells -> Seurat cell cycle assignment seems good.
pot_conf <- c("percent.mt","S.Score","G2M.Score")
seu_filt <- ScaleData(seu_filt, vars.to.regress = pot_conf)
filt_number <- c(ncol(seu),ncol(seu_filt))
filt_number
## [1] 1049 1024
seu_filt <- RunPCA(seu_filt, npcs=30, verbose=FALSE)
seu_filt <- RunUMAP(seu_filt, reduction = "pca", dims = 1:10, verbose=FALSE)
seu_filt <- FindNeighbors(seu_filt, reduction = "pca", dims = 1:10, verbose=FALSE)
seu_filt <- FindClusters(seu_filt, resolution = 0.5, verbose=FALSE)
umap_5 <- DimPlot(seu_filt)
umap_5
umap_6 <- DimPlot(seu_filt, group.by ="Phase")
umap_6
feat_4 <- FeaturePlot(seu_filt,"percent.mt") + scale_colour_gradient(low="gray",high="blue")
feat_4
Exercise 4 - Optimize Clustering and Find Markers
library(clustree)
seu_filt$default_clusters <- seu_filt$seurat_clusters
reso <- c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)
reso_res <- lapply(1:length(reso), function(x,seu_filt,reso){
seu_filt <- FindClusters(seu_filt,resolution = reso[x])
clust <- setNames(seu_filt$seurat_clusters,Cells(seu_filt))
return(clust)}, seu_filt, reso)
names(reso_res) <- paste0("k",1:length(reso))
k_tab <- do.call(cbind,reso_res)
k_dat <- as.data.frame(k_tab)
clust_tree <- clustree(k_dat, prefix = "k", node_colour = "sc3_stability")
clust_tree
seu_filt <- FindClusters(seu_filt, resolution = 0.3)
umap_7 <- DimPlot(seu_filt, group.by = "seurat_clusters",label = TRUE,pt.size = 0.2) + NoLegend() + ggtitle("Optimized Clusters")
umap_7
umap_8 <- DimPlot(seu_filt, group.by = "default_clusters",label = TRUE,pt.size = 0.2) + NoLegend() + ggtitle("Default Clusters")
umap_8
library(magrittr)
markers <- FindAllMarkers(seu_filt, only.pos = TRUE,
min.pct = 0.25, logfc.threshold = 0.25)
top_genes <- markers %>% group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC)
head(top_genes)
heat_1 <- DoHeatmap(seu_filt, features = top_genes$gene) + NoLegend()
heat_1
## # A tibble: 18 x 7
## # Groups: cluster [9]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 2.82e-101 5.16 0.646 0.048 4.62e- 97 0 ALX1
## 2 1.02e- 47 4.57 0.313 0.015 1.67e- 43 0 RP11-193M21.1
## 3 5.26e- 65 3.46 0.632 0.128 8.61e- 61 1 SLC15A1
## 4 7.19e- 32 3.25 0.316 0.049 1.18e- 27 1 ELAVL4
## 5 3.48e- 16 4.31 0.304 0.085 5.69e- 12 2 CORO1A
## 6 1.00e- 12 3.68 0.41 0.171 1.64e- 8 2 CXCR4
## 7 1.00e-125 9.08 0.659 0.009 1.64e-121 3 DDN
## 8 1.66e- 68 8.34 0.318 0 2.73e- 64 3 RFPL1
## 9 2.65e- 44 7.45 0.286 0.008 4.34e- 40 4 ATP6V1B1
## 10 7.29e- 68 7.36 0.351 0.003 1.19e- 63 4 TFCP2L1
## 11 1.54e- 40 6.84 0.395 0.032 2.53e- 36 5 MFAP5
## 12 8.07e- 35 6.51 0.684 0.185 1.32e- 30 5 SFRP2
## 13 1.05e-135 9.06 0.634 0.001 1.72e-131 6 ADH6
## 14 2.58e- 67 8.39 0.366 0.003 4.23e- 63 6 AZGP1
## 15 7.62e-122 9.08 0.576 0.001 1.25e-117 7 PDE2A
## 16 4.67e-115 8.92 0.545 0.001 7.64e-111 7 NOVA2
## 17 2.40e-102 7.45 0.697 0.012 3.93e- 98 8 RERGL
## 18 5.81e- 13 7.40 0.758 0.342 9.51e- 9 8 REN