These exercises are about manipulate single-cell data with Seurat. Please download the counting matrix from DropBox and loading them into a Seurat object. Or you may also used the rds file data/scSeq_CKO_1kCell_ori.rds.

Exercise 1 - Data manipulation with Seurat

Loading data

  1. Create a Seurat object by reading the count data from BOX or by loading the rds file. Then calculate mitochondrial contents of each ell
library(Seurat)
obj <- readRDS("data/scSeq_CKO_1kCell_ori.rds")
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^mt-")
head(obj@meta.data)
##                        orig.ident nCount_RNA nFeature_RNA dset percent.mt
## CKO_AGACGTTCAGCTGGCT-1        CKO       2679         1200  CKO  0.1119821
## CKO_TTAACTCGTAGTACCT-1        CKO        813          476  CKO 14.3911439
## CKO_GAGGTGAGTCTAGTGT-1        CKO       4852         1759  CKO 10.9233306
## CKO_CTCGTACAGCTAAGAT-1        CKO       1750          672  CKO 46.0000000
## CKO_AGGGAGTTCAAACCAC-1        CKO       4632         1585  CKO  5.3108808
## CKO_ATTATCCTCAACGGCC-1        CKO       2498         1177  CKO  3.8030424

Basic QC

  1. Access the read counts (nCount_RNA), gene counts (nFeature_RNA), and mitochondrial content (percent.mt) for each cell and draw a violin plot of each.
VlnPlot(obj,features = c("nCount_RNA","nFeature_RNA","percent.mt"),pt.size = 0.2)

  1. Mmake a dot plot for nCount_RNA vs nFeature_RNA. NOTE: At this step try to keep an eye out potential doublets.
FeatureScatter(obj,feature1="nCount_RNA",feature2="nFeature_RNA")

  1. Make a dot plot for nCount_RNA vs percent.mt. NOTE: At this step try to keep an eye out potential cell debris.
FeatureScatter(obj,feature1="nCount_RNA",feature2="percent.mt")

  1. Remove cells with percent.mt >= 10 for following analysis
mt_cutH <- 10
obj_unfiltered <- obj
obj <- subset(obj,subset = percent.mt < mt_cutH)
obj
## An object of class Seurat 
## 14353 features across 619 samples within 1 assay 
## Active assay: RNA (14353 features, 0 variable features)

Cell cycle

  1. Please estimate cell cycle phase of each cell and make a table to describe how many cells per phase.
convertHumanGeneList <- function(x){
  require("biomaRt")
  human = useMart("ensembl", dataset = "hsapiens_gene_ensembl",host = "http://useast.ensembl.org")
  mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl",host = "http://useast.ensembl.org")
  genesV2 = getLDS(attributes = c("hgnc_symbol"), 
                   filters = "hgnc_symbol", 
                   values = x , 
                   mart = human,
                   attributesL = c("mgi_symbol"), 
                   martL = mouse, uniqueRows=T)
  humanx <- unique(genesV2[, 2])
  return(humanx)}
#
s_gene <- convertHumanGeneList(cc.genes$s.genes)
g2m_gene <- convertHumanGeneList(cc.genes$g2m.genes)
#
obj <- CellCycleScoring(obj, s.features = s_gene,
                              g2m.features = g2m_gene, set.ident = TRUE)
table(obj@meta.data$Phase)
## 
##  G1 G2M   S 
## 260 108 251

Normalization and clustering

  1. Please scale data regressing to mitochondrial content (percent.mt) and cell cycle (S.score, G2M.score, Phase).
obj <- FindVariableFeatures(obj)
obj <- ScaleData(obj,vars.to.regress = c("percent.mt","S.score","G2M.score","Phase"))
## Warning: Requested variables to regress not in object: S.score, G2M.score
## Regressing out percent.mt, Phase
## Centering and scaling data matrix
  1. Please make principle component analysis (PCA), estimate how many PCs would best represent this data, then make clustering and UMAP plot.
  • check for up to 30 PCs.
  • elbow plot can be used to determine PCs.
set.seed(0)
obj <- RunPCA(obj, npcs = 30, verbose = FALSE)
ElbowPlot(obj,ndims = 30)

#
numPC <- 10
maxPC <- numPC
obj <- FindNeighbors(obj, reduction = "pca", dims = 1:maxPC)
## Computing nearest neighbor graph
## Computing SNN
obj <- FindClusters(obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 619
## Number of edges: 17846
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8145
## Number of communities: 6
## Elapsed time: 0 seconds
#
obj <- RunUMAP(obj, reduction = "pca", dims = 1:maxPC)
## 16:31:11 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:31:11 Read 619 rows and found 10 numeric columns
## 16:31:11 Using Annoy for neighbor search, n_neighbors = 30
## 16:31:11 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:31:11 Writing NN index file to temp file /tmp/RtmpMsibtn/file1769ffaf6b2
## 16:31:11 Searching Annoy index using 1 thread, search_k = 3000
## 16:31:12 Annoy recall = 100%
## 16:31:12 Commencing smooth kNN distance calibration using 1 thread
## 16:31:14 Initializing from normalized Laplacian + noise
## 16:31:14 Commencing optimization for 500 epochs, with 23934 positive edges
## 16:31:16 Optimization finished
obj <- RunTSNE(obj, reduction = "pca", dims = 1:maxPC)
#
DimPlot(obj,reduction = "umap")

Marker genes

  1. Try to select marker genes for each cluster and generate a heatmap of the top five marker genes for each cluster.
obj <- SetIdent(obj,value = "seurat_clusters")
clust.markers <- FindAllMarkers(obj, 
                                only.pos = TRUE,
                                min.pct = 0.25, 
                                logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Warning in FindMarkers.default(object = data.use, slot = data.slot, counts =
## counts, : No features pass logfc.threshold threshold; returning empty data.frame
## Calculating cluster 4
## Calculating cluster 5
head(clust.markers)
##                 p_val avg_log2FC pct.1 pct.2    p_val_adj cluster     gene
## Dcn      5.248586e-28  3.6236022 0.832 0.434 7.533296e-24       0      Dcn
## Tnfrsf19 9.893203e-16  0.6278188 0.621 0.271 1.419971e-11       0 Tnfrsf19
## Kremen2  1.517087e-15  1.6636156 0.703 0.377 2.177475e-11       0  Kremen2
## Ifitm3   1.519477e-13  3.6505185 0.871 0.638 2.180906e-09       0   Ifitm3
## Lef1     9.500305e-13  1.7603907 0.513 0.222 1.363579e-08       0     Lef1
## Postn    2.547487e-12  1.6586465 0.599 0.320 3.656408e-08       0    Postn
#
topG <- clust.markers %>% 
  group_by(cluster) %>% 
  top_n(n = 5, wt = avg_log2FC)
#
DoHeatmap(obj, features = topG$gene) + NoLegend()
## Warning in DoHeatmap(obj, features = topG$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: Gpx1,
## Rpl41, Rps13, Rps19, Rps27a, Rps28, Zfos1

Advanced plots

  1. Please display the top marker gene of each cluster in FeaturePlot and RidgePlot
topG <- clust.markers %>% 
  group_by(cluster) %>% 
  top_n(n = 1, wt = avg_log2FC)
top_gene <- topG$gene
#
FeaturePlot(obj,features = top_gene,pt.size = 0.2)

RidgePlot(obj,features = top_gene)
## Picking joint bandwidth of 0.766
## Picking joint bandwidth of 0.927
## Picking joint bandwidth of 2.08
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 0.167