These exercises are about the simple scRNAseq workflow in session 2.

Description

We will use data from this study on renal cancer from human cancer.

The full dataset is on ArrayExpress. We will use just one sample for now: 4834STDY7002881.

You can also download it from DropBox.

Exercise 1 - Cell Ranger

# cellranger-4.0.0
# 1,049
# Low fraction read in cells
# Low quality reads
#
# Cell Bender may help
#

Exercise 2 - Create a Seurat Object

library(Seurat)

mtx <- Read10X("~/Downloads/4834STDY7002881/GRCh38/")

seu <- CreateSeuratObject(mtx, project="kidney_F6", min.cells=10, min.features=200)
seu[["dset"]] <- "kidney_F6"
seu <- Seurat::RenameCells(seu, add.cell.id="kidney_F6")
seu <- NormalizeData(seu, normalization.method="LogNormalize")
seu <- FindVariableFeatures(seu, select.method="vst", nfeatures=3000)
seu <- ScaleData(seu)

top10 <- head(VariableFeatures(seu), 10)

temp_plot <- VariableFeaturePlot(seu)
plot2 <- LabelPoints(plot = temp_plot, points = top10, repel = TRUE)
plot2

library(dplyr)
seu <- RunPCA(seu, assay = "RNA", npcs = 50)
pct <- seu[["pca"]]@stdev / sum(seu[["pca"]]@stdev) * 100
cumu <- cumsum(pct)
co1 <- which(cumu > 90 & pct < 5)[1]
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
pc <- min(co1, co2)

plot_df <- data.frame(pct = pct, cumu = cumu,rank = 1:length(pct))

my_elbow <- ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pc)) + geom_text() +
  geom_vline(xintercept = 90, color = "grey") +
  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
  theme_bw()
my_elbow 

seu <- FindNeighbors(seu,dims = 1:pc, reduction = "pca")
seu <- RunUMAP(seu,dims = 1:pc, reduction = "pca")

seu <- FindClusters(seu, resolution = 0.5)
seu[["cluster_byDefault"]] <- seu$seurat_clusters

umap_1 <- DimPlot(seu)
umap_1

Exercise 3 - Basic QC

vln_1 <- VlnPlot(seu,
        features = c("nCount_RNA","nFeature_RNA"))
vln_1

feat_1 <- FeaturePlot(seu,
        features = c("nCount_RNA","nFeature_RNA"))
feat_1

seu$percent.mt <- PercentageFeatureSet(seu, pattern = "^MT-")

vln_2 <- VlnPlot(seu,"percent.mt", group.by ="dset")
vln_2

feat_2 <-  FeaturePlot(seu,"percent.mt")  + scale_colour_gradient(low="gray",high="blue")
feat_2 

feat_3 <- FeatureScatter(seu, feature1 = "nCount_RNA", feature2 = "percent.mt")
feat_3

tab<-table(seu$percent.mt>10)
tab

## 
## FALSE  TRUE 
##  1028    21
feat_s <- cc.genes$s.genes
feat_g2m <- cc.genes$g2m.genes

seu <- CellCycleScoring(seu, s.features = feat_s, g2m.features = feat_g2m)

umap_2 <- DimPlot(seu, group.by = "Phase")
umap_2

library(scran)

sce <- as.SingleCellExperiment(seu, assay = "RNA")
rowData(sce)$SYMBOL <- rownames(sce)

load("data/ccGene_mouse_human_geneSymbol_ensemblID_20220817.RData")
ccGene_hs <- ccGene_mm_hs$human_symbol
assignments <- cyclone(sce, ccGene_hs, gene.names=rowData(sce)$SYMBOL)

seu[["cyclone_Phase"]] <- assignments$phases
seu[["cyclone_G1Score"]] <- assignments$scores$G1
seu[["cyclone_SScore"]] <- assignments$scores$S
seu[["cyclone_G2MScore"]] <- assignments$scores$G2M

umap_3 <- DimPlot(seu, group.by = "cyclone_Phase")
umap_3

scr <- reticulate::import("scrublet")
mat <- GetAssayData(seu, assay = "RNA", slot = "counts")
mat <- as.matrix(mat)
scrub <- scr$Scrublet(t(mat))
doublet <- scrub$scrub_doublets()
names(doublet) <- c("doublet_score","doublet")


seu[["doublet_score"]] <- doublet$doublet_score
seu[["doublet"]] <- doublet$doublet


vln_3 <- VlnPlot(seu, group.by = "doublet",
        features = c("doublet_score", "nCount_RNA","nFeature_RNA"),
        pt.size = 0)

vln_3

umap_4 <- DimPlot(seu, group.by = "doublet") 
umap_4

Exercise 4 - Subset and Regress

seu_filt <- subset(seu, subset=doublet=="FALSE" & 
                     percent.mt < 10)

# These cells are from a cancer study so there's a good chance there's a high proportion of cycling cells -> Seurat cell cycle assignment seems good. 
pot_conf <- c("percent.mt","S.Score","G2M.Score")
seu_filt <- ScaleData(seu_filt, vars.to.regress = pot_conf)

filt_number <- c(ncol(seu),ncol(seu_filt))
filt_number 
## [1] 1049 1024
  seu_filt <- RunPCA(seu_filt, npcs=30, verbose=FALSE)
  seu_filt <- RunUMAP(seu_filt, reduction = "pca", dims = 1:10, verbose=FALSE)
  seu_filt <- FindNeighbors(seu_filt, reduction = "pca", dims = 1:10, verbose=FALSE)
  seu_filt <- FindClusters(seu_filt, resolution = 0.5, verbose=FALSE)
  
  umap_5 <- DimPlot(seu_filt)
  umap_5
  umap_6 <- DimPlot(seu_filt, group.by ="Phase")
  umap_6
  feat_4 <- FeaturePlot(seu_filt,"percent.mt")  + scale_colour_gradient(low="gray",high="blue")
  feat_4

Exercise 4 - Optimize Clustering and Find Markers

library(clustree)

seu_filt$default_clusters <- seu_filt$seurat_clusters

reso <- c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)
reso_res <- lapply(1:length(reso), function(x,seu_filt,reso){
  seu_filt <- FindClusters(seu_filt,resolution = reso[x])
  clust <- setNames(seu_filt$seurat_clusters,Cells(seu_filt))
  return(clust)}, seu_filt, reso)
names(reso_res) <- paste0("k",1:length(reso))


k_tab <- do.call(cbind,reso_res)
k_dat <- as.data.frame(k_tab)

clust_tree <- clustree(k_dat, prefix = "k", node_colour = "sc3_stability")
clust_tree


seu_filt <- FindClusters(seu_filt, resolution = 0.3)

umap_7 <- DimPlot(seu_filt, group.by = "seurat_clusters",label = TRUE,pt.size = 0.2) + NoLegend() + ggtitle("Optimized Clusters")
umap_7

umap_8 <- DimPlot(seu_filt, group.by = "default_clusters",label = TRUE,pt.size = 0.2) + NoLegend() + ggtitle("Default Clusters")
umap_8 

library(magrittr)
markers <- FindAllMarkers(seu_filt, only.pos = TRUE,
                          min.pct = 0.25, logfc.threshold = 0.25)

top_genes <- markers %>% group_by(cluster) %>%
  slice_max(n = 2, order_by = avg_log2FC)
head(top_genes)

heat_1 <- DoHeatmap(seu_filt, features = top_genes$gene) + NoLegend()
heat_1
## # A tibble: 18 x 7
## # Groups:   cluster [9]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene         
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>        
##  1 2.82e-101       5.16 0.646 0.048 4.62e- 97 0       ALX1         
##  2 1.02e- 47       4.57 0.313 0.015 1.67e- 43 0       RP11-193M21.1
##  3 5.26e- 65       3.46 0.632 0.128 8.61e- 61 1       SLC15A1      
##  4 7.19e- 32       3.25 0.316 0.049 1.18e- 27 1       ELAVL4       
##  5 3.48e- 16       4.31 0.304 0.085 5.69e- 12 2       CORO1A       
##  6 1.00e- 12       3.68 0.41  0.171 1.64e-  8 2       CXCR4        
##  7 1.00e-125       9.08 0.659 0.009 1.64e-121 3       DDN          
##  8 1.66e- 68       8.34 0.318 0     2.73e- 64 3       RFPL1        
##  9 2.65e- 44       7.45 0.286 0.008 4.34e- 40 4       ATP6V1B1     
## 10 7.29e- 68       7.36 0.351 0.003 1.19e- 63 4       TFCP2L1      
## 11 1.54e- 40       6.84 0.395 0.032 2.53e- 36 5       MFAP5        
## 12 8.07e- 35       6.51 0.684 0.185 1.32e- 30 5       SFRP2        
## 13 1.05e-135       9.06 0.634 0.001 1.72e-131 6       ADH6         
## 14 2.58e- 67       8.39 0.366 0.003 4.23e- 63 6       AZGP1        
## 15 7.62e-122       9.08 0.576 0.001 1.25e-117 7       PDE2A        
## 16 4.67e-115       8.92 0.545 0.001 7.64e-111 7       NOVA2        
## 17 2.40e-102       7.45 0.697 0.012 3.93e- 98 8       RERGL        
## 18 5.81e- 13       7.40 0.758 0.342 9.51e-  9 8       REN