Exercise 2 - Bioconductor

Description

We will use data from this study on renal cancer from human cancer.

The full dataset is on ArrayExpress. We will use just one sample for now: 4834STDY7002881.

You can also download it from DropBox.

Exercise 1 - Cell Ranger

Review the Web Summary Report
What version of Cell Ranger was used to process this data?

# cellranger-4.0.0

How many cells are there?

# 1,049

What looks problematic about this dataset?

# Low fraction read in cells
# Low quality reads
#

What steps could we do to to help things?

# Cell Bender may help
#

Exercise 2 - Create a Seurat Object

Read the matrix file in and create a Seurat object. Ensure it is properly annotated.

library(Seurat)

mtx <- Read10X("~/Downloads/4834STDY7002881/GRCh38/")

seu <- CreateSeuratObject(mtx, project="kidney_F6", min.cells=10, min.features=200)
seu[["dset"]] <- "kidney_F6"
seu <- Seurat::RenameCells(seu, add.cell.id="kidney_F6")

Run normalization (log) and scaling. What are the top variable features?

seu <- NormalizeData(seu, normalization.method="LogNormalize")
seu <- FindVariableFeatures(seu, select.method="vst", nfeatures=3000)
seu <- ScaleData(seu)

top10 <- head(VariableFeatures(seu), 10)

temp_plot <- VariableFeaturePlot(seu)
plot2 <- LabelPoints(plot = temp_plot, points = top10, repel = TRUE)
plot2

Do dimension reduction. Don’t forget to check the elbow for PCA to get a good number of PCs. Than run some basic clusetering. No need to optimize that.

library(dplyr)
seu <- RunPCA(seu, assay = "RNA", npcs = 50)
pct <- seu[["pca"]]@stdev / sum(seu[["pca"]]@stdev) * 100
cumu <- cumsum(pct)
co1 <- which(cumu > 90 & pct < 5)[1]
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
pc <- min(co1, co2)

plot_df <- data.frame(pct = pct, cumu = cumu,rank = 1:length(pct))

my_elbow <- ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pc)) + geom_text() +
  geom_vline(xintercept = 90, color = "grey") +
  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
  theme_bw()
my_elbow 

seu <- FindNeighbors(seu,dims = 1:pc, reduction = "pca")
seu <- RunUMAP(seu,dims = 1:pc, reduction = "pca")

seu <- FindClusters(seu, resolution = 0.5)
seu[["cluster_byDefault"]] <- seu$seurat_clusters

umap_1 <- DimPlot(seu)
umap_1

Exercise 3 - Basic QC

Review the number of counts and features detected. Is there any asymmetry?

vln_1 <- VlnPlot(seu,
        features = c("nCount_RNA","nFeature_RNA"))
vln_1

feat_1 <- FeaturePlot(seu,
        features = c("nCount_RNA","nFeature_RNA"))
feat_1

Lets check the mitochondrial content. Review the results.

seu$percent.mt <- PercentageFeatureSet(seu, pattern = "^MT-")

vln_2 <- VlnPlot(seu,"percent.mt", group.by ="dset")
vln_2

feat_2 <-  FeaturePlot(seu,"percent.mt")  + scale_colour_gradient(low="gray",high="blue")
feat_2 

feat_3 <- FeatureScatter(seu, feature1 = "nCount_RNA", feature2 = "percent.mt")
feat_3

tab<-table(seu$percent.mt>10)
tab

## 
## FALSE  TRUE 
##  1028    21

Lets run cell cycle checks?

feat_s <- cc.genes$s.genes
feat_g2m <- cc.genes$g2m.genes

seu <- CellCycleScoring(seu, s.features = feat_s, g2m.features = feat_g2m)

umap_2 <- DimPlot(seu, group.by = "Phase")
umap_2

library(scran)

sce <- as.SingleCellExperiment(seu, assay = "RNA")
rowData(sce)$SYMBOL <- rownames(sce)

load("data/ccGene_mouse_human_geneSymbol_ensemblID_20220817.RData")
ccGene_hs <- ccGene_mm_hs$human_symbol
assignments <- cyclone(sce, ccGene_hs, gene.names=rowData(sce)$SYMBOL)

seu[["cyclone_Phase"]] <- assignments$phases
seu[["cyclone_G1Score"]] <- assignments$scores$G1
seu[["cyclone_SScore"]] <- assignments$scores$S
seu[["cyclone_G2MScore"]] <- assignments$scores$G2M

umap_3 <- DimPlot(seu, group.by = "cyclone_Phase")
umap_3

Lets run scrublet. Review the doublet annotation. Are doublets an issue for this dataset?

scr <- reticulate::import("scrublet")
mat <- GetAssayData(seu, assay = "RNA", slot = "counts")
mat <- as.matrix(mat)
scrub <- scr$Scrublet(t(mat))
doublet <- scrub$scrub_doublets()
names(doublet) <- c("doublet_score","doublet")


seu[["doublet_score"]] <- doublet$doublet_score
seu[["doublet"]] <- doublet$doublet


vln_3 <- VlnPlot(seu, group.by = "doublet",
        features = c("doublet_score", "nCount_RNA","nFeature_RNA"),
        pt.size = 0)

vln_3

umap_4 <- DimPlot(seu, group.by = "doublet") 
umap_4

Exercise 4 - Subset and Regress

Subset the data and run regression. Review the data and think about what features it is appropriate to regress over. How many cells are left?

seu_filt <- subset(seu, subset=doublet=="FALSE" & 
                     percent.mt < 10)

# These cells are from a cancer study so there's a good chance there's a high proportion of cycling cells -> Seurat cell cycle assignment seems good. 
pot_conf <- c("percent.mt","S.Score","G2M.Score")
seu_filt <- ScaleData(seu_filt, vars.to.regress = pot_conf)

filt_number <- c(ncol(seu),ncol(seu_filt))
filt_number

## [1] 1049 1024

Rerun PCA, UMAP and clustering. Review the results including QC features.

  seu_filt <- RunPCA(seu_filt, npcs=30, verbose=FALSE)
  seu_filt <- RunUMAP(seu_filt, reduction = "pca", dims = 1:10, verbose=FALSE)
  seu_filt <- FindNeighbors(seu_filt, reduction = "pca", dims = 1:10, verbose=FALSE)
  seu_filt <- FindClusters(seu_filt, resolution = 0.5, verbose=FALSE)
  
  umap_5 <- DimPlot(seu_filt)
  umap_5
  umap_6 <- DimPlot(seu_filt, group.by ="Phase")
  umap_6
  feat_4 <- FeaturePlot(seu_filt,"percent.mt")  + scale_colour_gradient(low="gray",high="blue")
  feat_4

Exercise 4 - Optimize Clustering and Find Markers

Optimize Clustering.

library(clustree)

seu_filt$default_clusters <- seu_filt$seurat_clusters

reso <- c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)
reso_res <- lapply(1:length(reso), function(x,seu_filt,reso){
  seu_filt <- FindClusters(seu_filt,resolution = reso[x])
  clust <- setNames(seu_filt$seurat_clusters,Cells(seu_filt))
  return(clust)}, seu_filt, reso)
names(reso_res) <- paste0("k",1:length(reso))


k_tab <- do.call(cbind,reso_res)
k_dat <- as.data.frame(k_tab)

clust_tree <- clustree(k_dat, prefix = "k", node_colour = "sc3_stability")
clust_tree


seu_filt <- FindClusters(seu_filt, resolution = 0.3)

umap_7 <- DimPlot(seu_filt, group.by = "seurat_clusters",label = TRUE,pt.size = 0.2) + NoLegend() + ggtitle("Optimized Clusters")
umap_7

umap_8 <- DimPlot(seu_filt, group.by = "default_clusters",label = TRUE,pt.size = 0.2) + NoLegend() + ggtitle("Default Clusters")
umap_8

Find Markers

library(magrittr)
markers <- FindAllMarkers(seu_filt, only.pos = TRUE,
                          min.pct = 0.25, logfc.threshold = 0.25)

top_genes <- markers %>% group_by(cluster) %>%
  slice_max(n = 2, order_by = avg_log2FC)
head(top_genes)

heat_1 <- DoHeatmap(seu_filt, features = top_genes$gene) + NoLegend()
heat_1

## # A tibble: 18 x 7
## # Groups:   cluster [9]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene         
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>        
##  1 2.82e-101       5.16 0.646 0.048 4.62e- 97 0       ALX1         
##  2 1.02e- 47       4.57 0.313 0.015 1.67e- 43 0       RP11-193M21.1
##  3 5.26e- 65       3.46 0.632 0.128 8.61e- 61 1       SLC15A1      
##  4 7.19e- 32       3.25 0.316 0.049 1.18e- 27 1       ELAVL4       
##  5 3.48e- 16       4.31 0.304 0.085 5.69e- 12 2       CORO1A       
##  6 1.00e- 12       3.68 0.41  0.171 1.64e-  8 2       CXCR4        
##  7 1.00e-125       9.08 0.659 0.009 1.64e-121 3       DDN          
##  8 1.66e- 68       8.34 0.318 0     2.73e- 64 3       RFPL1        
##  9 2.65e- 44       7.45 0.286 0.008 4.34e- 40 4       ATP6V1B1     
## 10 7.29e- 68       7.36 0.351 0.003 1.19e- 63 4       TFCP2L1      
## 11 1.54e- 40       6.84 0.395 0.032 2.53e- 36 5       MFAP5        
## 12 8.07e- 35       6.51 0.684 0.185 1.32e- 30 5       SFRP2        
## 13 1.05e-135       9.06 0.634 0.001 1.72e-131 6       ADH6         
## 14 2.58e- 67       8.39 0.366 0.003 4.23e- 63 6       AZGP1        
## 15 7.62e-122       9.08 0.576 0.001 1.25e-117 7       PDE2A        
## 16 4.67e-115       8.92 0.545 0.001 7.64e-111 7       NOVA2        
## 17 2.40e-102       7.45 0.697 0.012 3.93e- 98 8       RERGL        
## 18 5.81e- 13       7.40 0.758 0.342 9.51e-  9 8       REN

Exercise 2 - Bioconductor

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/scRNA-seq

Description