These exercises are about the integration and annotation in session 3.
We will use data from this study on renal cancer from human cancer.
The full dataset is on ArrayExpress. We will now use 4 samples for now: FCAImmP7277561, FCAImmP7277560, FCAImmP7277553, FCAImmP7277552. These are CD45 +/- cells from liver.
You can also download them from DropBox.
Exercise 0 - Preprocess data [You can skip this and go straight to integration if you want]
files <- c("~/Downloads/FCAImmP7277552/GRCh38",
"~/Downloads/FCAImmP7277553/GRCh38",
"~/Downloads/FCAImmP7277560/GRCh38",
"~/Downloads/FCAImmP7277561/GRCh38")
labels <- c("liver_CD45pos_552",
"liver_CD45neg_553",
"liver_CD45pos_560",
"liver_CD45neg_561")
library(Seurat)
seu_list <- lapply(1:4, function(x){
mtx <- Read10X(files[x])
seu <- CreateSeuratObject(mtx, min.features =200, min.cells=10 )
seu[["dset"]] <- labels[x]# Create a category for sample
seu <- Seurat::RenameCells(seu, add.cell.id=labels[x]) # add sample name in front of cell
seu <- NormalizeData(seu, normalization.method="LogNormalize")
seu <- FindVariableFeatures(seu, select.method="vst", nfeatures=3000)
seu <- ScaleData(seu)
seu[["percent.mt"]] <- PercentageFeatureSet(seu, pattern = "^MT-")
feat_s <- cc.genes$s.genes
feat_g2m <- cc.genes$g2m.genes
seu <- CellCycleScoring(seu, s.features = feat_s, g2m.features = feat_g2m)
seu_filt <- subset(seu, percent.mt < 10)
seu_filt <- ScaleData(seu, vars.to.regress = c("percent.mt","Phase"))
DefaultAssay(seu_filt) <- "RNA"
seu_filt <- RunPCA(seu_filt, assay = "RNA", npcs = 50)
seu_filt <- FindNeighbors(seu_filt, reduction = "pca")
seu_filt <- RunUMAP(seu_filt, dims=1:30, reduction = "pca")
return(seu_filt)
})
my_plots <- lapply(seu_list, function(x){
vln <- VlnPlot(x,"percent.mt", group.by ="dset")
feat <- FeaturePlot(x, "percent.mt")
umap <- DimPlot(x,group.by = "Phase")
out <- list(vln, feat, umap)
return(out)
})
a <- my_plots[[1]][[1]]
b <- my_plots[[1]][[2]]
c <- my_plots[[1]][[3]]
a
b
c
Exercise 1 - Integration
seu_list <- readRDS(file="~/Desktop/to_integrate_ex3.rds")
seu_merge <- merge(seu_list[[1]], seu_list[2:4],
add.cell.ids = labels, project = "Merge")
seu_merge <- NormalizeData(seu_merge, normalization.method="LogNormalize", scale.factor=10000)
seu_merge <- FindVariableFeatures(seu_merge, select.method="vst", nfeatures=2000)
seu_merge <- ScaleData(seu_merge, verbose=FALSE)
seu_merge <- RunPCA(seu_merge, npcs=30, verbose=FALSE)
seu_merge <- RunUMAP(seu_merge, reduction = "pca", dims = 1:10, verbose=FALSE)
seu_merge <- FindNeighbors(seu_merge, reduction = "pca", dims = 1:10, verbose=FALSE)
seu_merge <- FindClusters(seu_merge, resolution = 0.5, verbose=FALSE)
dim_1 <- DimPlot(seu_merge, group.by = "dset")
dim_1
dim_2 <- DimPlot(seu_merge, split.by = "dset", ncol = 2)
dim_2
seu_merge_basic <- seu_merge
feats <- SelectIntegrationFeatures(seu_list)
my_seu_list_rpca <- lapply(seu_list, function(seu, feats){
seu <- ScaleData(seu, features=feats, verbose=FALSE)
seu <- RunPCA(seu, features=feats, verbose=FALSE)
return(seu)}, feats)
anchors <- FindIntegrationAnchors(my_seu_list_rpca, anchor.features = feats, reduction = "rpca")
my_seu_merge_rpca <- IntegrateData(anchorset = anchors)
my_seu_merge_rpca <- ScaleData(my_seu_merge_rpca)
my_seu_merge_rpca <- RunPCA(my_seu_merge_rpca, npcs=30, verbose=FALSE)
my_seu_merge_rpca <- RunUMAP(my_seu_merge_rpca, reduction = "pca", dims = 1:10, verbose=FALSE)
my_seu_merge_rpca <- FindNeighbors(my_seu_merge_rpca, reduction = "pca", dims = 1:10, verbose=FALSE)
my_seu_merge_rpca <- FindClusters(my_seu_merge_rpca, resolution = 0.5, verbose=FALSE)
dim_3 <- DimPlot(my_seu_merge_rpca, group.by = "dset")
dim_3
dim_4 <- DimPlot(my_seu_merge_rpca, split.by = "dset", ncol = 2)
dim_4
seu_merge <- merge(seu_list[[1]], seu_list[2:4],
add.cell.ids = labels, project = "Merge")
seu_merge <- NormalizeData(seu_merge, normalization.method="LogNormalize", scale.factor=10000)
seu_merge <- FindVariableFeatures(seu_merge, select.method="vst", nfeatures=2000)
seu_merge <- ScaleData(seu_merge, verbose=FALSE)
seu_merge <- RunPCA(seu_merge, npcs=30, verbose=FALSE)
seu_merge <- RunUMAP(seu_merge, reduction = "pca", dims = 1:10, verbose=FALSE)
library(harmony)
seu_merge_harmony <- RunHarmony(seu_merge, group.by.vars= "dset", assay.use= "RNA")
seu_merge_harmony <- RunUMAP(seu_merge_harmony, reduction = "harmony", dims = 1:10, reduction.name = "umap_harmony")
seu_merge_harmony <- FindNeighbors(seu_merge_harmony, reduction = "harmony")
seu_merge_harmony <- FindClusters(seu_merge_harmony)
dim_5 <- DimPlot(seu_merge_harmony, group.by = "dset", reduction = "umap_harmony")
dim_5
dim_6 <- DimPlot(seu_merge_harmony, split.by = "dset", ncol = 2, reduction = "umap_harmony")
dim_6
d <- FeaturePlot(seu_merge, "A1BG")
e <- FeaturePlot(my_seu_merge_rpca, "A1BG")
f <- FeaturePlot(seu_merge_harmony, "A1BG", reduction = "umap_harmony")
g <- FeaturePlot(seu_merge, "SERPINC1")
h <- FeaturePlot(my_seu_merge_rpca, "SERPINC1")
i <- FeaturePlot(seu_merge_harmony, "SERPINC1", reduction = "umap_harmony")
d
e
f
g
h
i
Exercise 2 - Annotation
[Hint: The layers of a Harmony integrated object are kept seperate in the Seurat Object. You made need to run JoinLayers() before exporting the count matrix]
library(celldex)
library(SingleR)
hpcad <- HumanPrimaryCellAtlasData()
imm <- ImmGenData()
seu_merge_harmony <- JoinLayers(seu_merge_harmony)
seu_merge_harmony_mat <- GetAssayData(seu_merge_harmony)
pred_res <- SingleR(ref = hpcad, test = seu_merge_harmony_mat, labels = hpcad$label.main)
seu_merge_harmony$hpcad_singleR_labels <- pred_res$labels
pred_res <- SingleR(ref = imm, test = seu_merge_harmony_mat, labels = imm$label.main)
seu_merge_harmony$imm_singleR_labels <- pred_res$labels
dim_7 <- DimPlot(seu_merge_harmony, group.by = "hpcad_singleR_labels", reduction = "umap_harmony", label=T)
dim_8 <- DimPlot(seu_merge_harmony, group.by = "imm_singleR_labels", reduction = "umap_harmony", label=T)
dim_7
dim_8
Exercise 3 - Differential expression
Set up: Using the integrated Seurat object from the AD samples from our lecture, we will look at differentially expressed genes between AD and control subjects in cluster #4 from the ‘seurat_clusters’ metadata group. This object is on dropbox as integrated.rds
Generate a list of differentially expressed genes for AD vs control in cluster #4 using a Wilcoxon rank sum test. Note: Confirm you are using the ‘seurat_clusters’ group, not the ‘paper_cluster’ group that also exists in the object.
[Hint: Make sure you have the Seurat object properly set up for differential analysis. Are you using the correct assay? Are the integrated layers joined?]
library(Seurat)
library(tibble)
library(dplyr)
# metadata column for condition
seu$condition <- ifelse(grepl("AD", seu$sample_id), "AD", "CON")
# metadata column for condition + cell type
seu$cluster_condition <- paste(seu$seurat_clusters, seu$condition, sep = "_")
# set default assay to be RNA (currently is 'integrated')
DefaultAssay(seu) <- "RNA"
seu <- JoinLayers(seu)
Idents(seu) <- "cluster_condition"
de_wilcox_c4 <- FindMarkers(seu, ident.1 = "4_AD", ident.2 = "4_CON", logfc.threshold = 0) %>%
tibble::rownames_to_column("geneID")
library(MAST)
seu_c4 <- subset(seu, seurat_clusters == 4)
seu_c4_data <- GetAssayData(seu_c4, assay = "RNA", layer = "data")
# create SingleCellAssay for MAST
sca_c4 <- MAST::FromMatrix(exprsArray = as.matrix(seu_c4_data),
cData = seu_c4@meta.data,
fData = data.frame(gene_id = rownames(seu_c4))
)
cdr_c4 <- colSums(assay(sca_c4)>0)
cdr_scale_c4 <- scale(cdr_c4)
colData(sca_c4)$ngeneson <- cdr_scale_c4
cond <- factor(colData(sca_c4)$condition,
levels = c("AD","CON"))
cond <- relevel(cond,"CON")
colData(sca_c4)$Group <- cond
# Does ngeneson correlate with a PC?
pca_embed <- data.frame(seu_c4[["pca"]]@cell.embeddings)
pca_vars <- seu_c4[["pca"]]@stdev^2 # variance for each pc
total_var <- sum(pca_vars) # total variance
pct_var_explained <- pca_vars/total_var * 100
pct_var_explained <- round(pct_var_explained, 1)
for_plot <- data.frame(colData(sca_c4))
for_plot <- merge(for_plot, pca_embed, by = 0)
library(ggpubr)
p1 <- ggplot(for_plot, aes(x = ngeneson, y = PC_1, color = condition)) + geom_point() + ylab(paste0("PC_1 (", pct_var_explained[1],"%)"))
p2 <- ggplot(for_plot, aes(x = ngeneson, y = PC_2, color = condition)) + geom_point() + ylab(paste0("PC_2 (", pct_var_explained[2],"%)"))
p3 <- ggplot(for_plot, aes(x = ngeneson, y = PC_3, color = condition)) + geom_point() + ylab(paste0("PC_3 (", pct_var_explained[3],"%)"))
ggarrange(p1, p2, p3, nrow = 1, common.legend = TRUE, legend="bottom")
# run MAST
zlmCond_c4 <- zlm(~ngeneson + Group, sca_c4)
sumZLM_c4 <- summary(zlmCond_c4,
doLRT=paste0("Group","AD"))
sumDT_c4 <- sumZLM_c4$datatable
# turn the outpput into a nice DE dataframe
de_mast_c4 <- merge(sumDT_c4[sumDT_c4$component=="H"&sumDT_c4$contrast==paste0("Group","AD"),],
sumDT_c4[sumDT_c4$component=="logFC"&sumDT_c4$contrast==paste0("Group","AD"),],by='primerid')
de_mast_c4 <- dplyr::select(de_mast_c4,primerid,coef.y,z.y,`Pr(>Chisq).x`)
names(de_mast_c4) <- c("geneID","log2Fc","z","pvalue")
de_mast_c4$FDR <- p.adjust(de_mast_c4$pvalue,'fdr')
de_mast_c4 <- de_mast_c4[order(de_mast_c4$FDR),]
de_mast_c4
# should we include Sex as a confounding factor?
# Check for the sex specific XIST gene expression
# not differentiall expressed - good!
de_mast_c4[de_mast_c4$geneID == "XIST", ]
# these plots show that there is not a strong overlap between XIST and a particular group
# bias doesn't seem to be as strong as in the excitatory neuronsm so probably don't need to regress it out
FeaturePlot(seu_c4, features = "XIST")
DimPlot(seu_c4, split.by = "condition")
top_up_AD <- head(de_mast_c4[de_mast_c4$log2Fc > 0, ], 5)
vln_mast <- VlnPlot(seu_c4, features = top_up_AD$geneID, stack = T, flip = T, pt.size = 1) + scale_x_discrete(labels=c('CON', 'AD'))
seu_c4_aggr <- Seurat::AggregateExpression(seu_c4, return.seurat = T, group.by = c("sample_id", "condition"))
# get the raw un-normalized counts
counts_aggr <- as.matrix(seu_c4_aggr[["RNA"]]$counts)
library(DESeq2)
column_meta <- data.frame(
row.names = colnames(counts_aggr),
condition = ifelse(grepl("AD", colnames(counts_aggr)), "AD", "CON")
)
column_meta$condition <- factor(column_meta$condition, levels = c("CON", "AD"))
dds_pseudo_c4 <- DESeqDataSetFromMatrix(countData = counts_aggr, colData = column_meta, design = ~condition)
colData(dds_pseudo_c4)
# filter lowly expressed genes
smallestGroupSize <- 2
keep <- rowSums(counts(dds_pseudo_c4) >= 10) >= smallestGroupSize
table(keep)
dds_pseudo_c4 <- dds_pseudo_c4[keep,]
# run deseq2
dds_pseudo_c4 <- DESeq(dds_pseudo_c4)
resultsNames(dds_pseudo_c4)
res_pseudo_c4 <- results(dds_pseudo_c4, name = "condition_AD_vs_CON")
de_pseudo_c4 <- res_pseudo_c4 %>%
data.frame %>%
arrange(pvalue) %>%
rownames_to_column(var = "geneID")
# Make PCA plot of aggregates count matrix ans color by condition and sex
sample_ids <- rownames(colData(dds_pseudo_c4))
colData(dds_pseudo_c4)$sex <- ifelse(sample_ids == "AD2b_AD", "female",
ifelse(sample_ids == "AD4_AD", "male",
ifelse(sample_ids == "C1_CON", "male",
ifelse(sample_ids == "C3_CON", "female", "none!"))))
p_cond <- plotPCA(rlog(dds_pseudo_c4), intgroup = "condition") + coord_cartesian() + theme_bw() + ggtitle("PCA")
p_sex <- plotPCA(rlog(dds_pseudo_c4), intgroup = "sex") + coord_cartesian() + theme_bw() + ggtitle("PCA")
de_pseudo_c4$sig <- de_pseudo_c4$padj < 0.05
de_mast_c4$sig <- de_mast_c4$FDR < 0.05
mast_pseudo <- merge(de_mast_c4, de_pseudo_c4, by = "geneID", suffixes = c("_mast", "_pseudo"))
# see overlap in significantly changing genes
de_compare <- table(mast_pseudo$sig_mast, mast_pseudo$sig_pseudo)
rownames(de_compare) <- c("mast_false", "mast_true")
colnames(de_compare) <- c("pseudo_false", "pseudo_true")
de_compare
# correlate -log10p
p_logp <- ggplot(mast_pseudo, aes(x = -log10(FDR), y = -log10(padj))) + geom_point() + xlab("-log10(FDR) for MAST") + ylab("-log10(padj) for DESeq2")
p_fc <- ggplot(mast_pseudo, aes(x = log2Fc, y = log2FoldChange)) + geom_point() + xlab("log2FC for MAST") + ylab("log2FC for DESeq2")