These exercises are about the integration and annotation in session 3.

Description

We will use data from this study on renal cancer from human cancer.

The full dataset is on ArrayExpress. We will now use 4 samples for now: FCAImmP7277561, FCAImmP7277560, FCAImmP7277553, FCAImmP7277552. These are CD45 +/- cells from liver.

You can also download them from DropBox.

Exercise 0 - Preprocess data [You can skip this and go straight to integration if you want]

files <- c("~/Downloads/FCAImmP7277552/GRCh38",
  "~/Downloads/FCAImmP7277553/GRCh38",
  "~/Downloads/FCAImmP7277560/GRCh38",
  "~/Downloads/FCAImmP7277561/GRCh38")

labels <- c("liver_CD45pos_552",
            "liver_CD45neg_553",
            "liver_CD45pos_560",
            "liver_CD45neg_561")

library(Seurat)

seu_list <- lapply(1:4, function(x){

mtx <- Read10X(files[x])
seu <- CreateSeuratObject(mtx, min.features =200, min.cells=10 )
seu[["dset"]] <- labels[x]# Create a category for sample
seu <- Seurat::RenameCells(seu, add.cell.id=labels[x]) # add sample name in front of cell 
seu <- NormalizeData(seu, normalization.method="LogNormalize")
seu <- FindVariableFeatures(seu, select.method="vst", nfeatures=3000)
seu <- ScaleData(seu)
seu[["percent.mt"]] <- PercentageFeatureSet(seu, pattern = "^MT-")
feat_s <- cc.genes$s.genes
feat_g2m <- cc.genes$g2m.genes
seu <- CellCycleScoring(seu, s.features = feat_s, g2m.features = feat_g2m)
seu_filt <- subset(seu, percent.mt < 10)
seu_filt <- ScaleData(seu, vars.to.regress = c("percent.mt","Phase"))
DefaultAssay(seu_filt) <- "RNA"
seu_filt <- RunPCA(seu_filt, assay = "RNA", npcs = 50)
seu_filt <- FindNeighbors(seu_filt, reduction = "pca")
seu_filt <- RunUMAP(seu_filt, dims=1:30, reduction = "pca")

return(seu_filt)
})


my_plots <- lapply(seu_list, function(x){
  vln <- VlnPlot(x,"percent.mt", group.by ="dset")
  feat <- FeaturePlot(x, "percent.mt")
  umap <- DimPlot(x,group.by = "Phase")
  out <- list(vln, feat, umap)
  return(out)
})

a <- my_plots[[1]][[1]]
b <- my_plots[[1]][[2]]
c <- my_plots[[1]][[3]]

a
b
c

Exercise 1 - Integration

seu_list <- readRDS(file="~/Desktop/to_integrate_ex3.rds")

seu_merge <- merge(seu_list[[1]], seu_list[2:4],
                            add.cell.ids = labels, project = "Merge")
seu_merge <- NormalizeData(seu_merge, normalization.method="LogNormalize", scale.factor=10000)
seu_merge <- FindVariableFeatures(seu_merge, select.method="vst", nfeatures=2000)
seu_merge <- ScaleData(seu_merge, verbose=FALSE)
seu_merge <- RunPCA(seu_merge, npcs=30, verbose=FALSE)
seu_merge <- RunUMAP(seu_merge, reduction = "pca", dims = 1:10, verbose=FALSE)
seu_merge <- FindNeighbors(seu_merge, reduction = "pca", dims = 1:10, verbose=FALSE)
seu_merge <- FindClusters(seu_merge, resolution = 0.5, verbose=FALSE)

dim_1 <- DimPlot(seu_merge, group.by = "dset")
dim_1

dim_2 <- DimPlot(seu_merge, split.by = "dset", ncol = 2)
dim_2

seu_merge_basic <- seu_merge
feats <- SelectIntegrationFeatures(seu_list)

my_seu_list_rpca <- lapply(seu_list, function(seu, feats){
  seu <- ScaleData(seu, features=feats, verbose=FALSE)
  seu <- RunPCA(seu, features=feats, verbose=FALSE)
  return(seu)}, feats)

anchors <- FindIntegrationAnchors(my_seu_list_rpca, anchor.features = feats, reduction = "rpca")

my_seu_merge_rpca <- IntegrateData(anchorset = anchors)
my_seu_merge_rpca <- ScaleData(my_seu_merge_rpca)
my_seu_merge_rpca <- RunPCA(my_seu_merge_rpca, npcs=30, verbose=FALSE)
my_seu_merge_rpca <- RunUMAP(my_seu_merge_rpca, reduction = "pca", dims = 1:10, verbose=FALSE)
my_seu_merge_rpca <- FindNeighbors(my_seu_merge_rpca, reduction = "pca", dims = 1:10, verbose=FALSE)
my_seu_merge_rpca <- FindClusters(my_seu_merge_rpca, resolution = 0.5, verbose=FALSE)

dim_3 <- DimPlot(my_seu_merge_rpca, group.by = "dset")
dim_3

dim_4 <- DimPlot(my_seu_merge_rpca, split.by = "dset", ncol = 2)
dim_4
seu_merge <- merge(seu_list[[1]], seu_list[2:4],
                            add.cell.ids = labels, project = "Merge")
seu_merge <- NormalizeData(seu_merge, normalization.method="LogNormalize", scale.factor=10000)
seu_merge <- FindVariableFeatures(seu_merge, select.method="vst", nfeatures=2000)
seu_merge <- ScaleData(seu_merge, verbose=FALSE)
seu_merge <- RunPCA(seu_merge, npcs=30, verbose=FALSE)
seu_merge <- RunUMAP(seu_merge, reduction = "pca", dims = 1:10, verbose=FALSE)
library(harmony)
seu_merge_harmony <- RunHarmony(seu_merge, group.by.vars= "dset", assay.use= "RNA")
seu_merge_harmony <- RunUMAP(seu_merge_harmony, reduction = "harmony", dims = 1:10, reduction.name = "umap_harmony")
seu_merge_harmony <- FindNeighbors(seu_merge_harmony, reduction = "harmony")
seu_merge_harmony <- FindClusters(seu_merge_harmony) 

dim_5 <- DimPlot(seu_merge_harmony, group.by = "dset", reduction = "umap_harmony")
dim_5

dim_6 <- DimPlot(seu_merge_harmony, split.by = "dset", ncol = 2, reduction = "umap_harmony")
dim_6
d <- FeaturePlot(seu_merge, "A1BG")
e <- FeaturePlot(my_seu_merge_rpca, "A1BG")
f <- FeaturePlot(seu_merge_harmony, "A1BG", reduction = "umap_harmony")


g <- FeaturePlot(seu_merge, "SERPINC1")
h <- FeaturePlot(my_seu_merge_rpca, "SERPINC1")
i <- FeaturePlot(seu_merge_harmony, "SERPINC1", reduction = "umap_harmony")

d
e
f
g
h
i

Exercise 2 - Annotation

[Hint: The layers of a Harmony integrated object are kept seperate in the Seurat Object. You made need to run JoinLayers() before exporting the count matrix]

library(celldex)
library(SingleR)
hpcad <- HumanPrimaryCellAtlasData()
imm <- ImmGenData()

seu_merge_harmony <- JoinLayers(seu_merge_harmony)
seu_merge_harmony_mat <- GetAssayData(seu_merge_harmony)

pred_res <- SingleR(ref = hpcad, test = seu_merge_harmony_mat, labels = hpcad$label.main)
seu_merge_harmony$hpcad_singleR_labels <- pred_res$labels

pred_res <- SingleR(ref = imm, test = seu_merge_harmony_mat, labels = imm$label.main)
seu_merge_harmony$imm_singleR_labels <- pred_res$labels

dim_7 <- DimPlot(seu_merge_harmony, group.by = "hpcad_singleR_labels", reduction = "umap_harmony", label=T)

dim_8 <- DimPlot(seu_merge_harmony, group.by = "imm_singleR_labels", reduction = "umap_harmony", label=T)

dim_7
dim_8

Exercise 3 - Differential expression

[Hint: Make sure you have the Seurat object properly set up for differential analysis. Are you using the correct assay? Are the integrated layers joined?]

seu <- readRDS("~/Downloads/integrated.rds")
library(Seurat)
library(tibble)
library(dplyr)

# metadata column for condition
seu$condition <- ifelse(grepl("AD", seu$sample_id), "AD", "CON")

# metadata column for condition + cell type
seu$cluster_condition <- paste(seu$seurat_clusters, seu$condition, sep = "_")

# set default assay to be RNA (currently is 'integrated')
DefaultAssay(seu) <- "RNA"

seu <- JoinLayers(seu)

Idents(seu) <- "cluster_condition"
de_wilcox_c4 <- FindMarkers(seu, ident.1 = "4_AD", ident.2 = "4_CON", logfc.threshold = 0)  %>% 
  tibble::rownames_to_column("geneID")
library(MAST)
seu_c4 <- subset(seu, seurat_clusters == 4)
seu_c4_data <- GetAssayData(seu_c4, assay = "RNA", layer = "data")
# create SingleCellAssay for MAST
sca_c4 <- MAST::FromMatrix(exprsArray = as.matrix(seu_c4_data),
                           cData = seu_c4@meta.data,
                           fData = data.frame(gene_id = rownames(seu_c4))
)

cdr_c4 <- colSums(assay(sca_c4)>0)
cdr_scale_c4 <- scale(cdr_c4)

colData(sca_c4)$ngeneson <- cdr_scale_c4
cond <- factor(colData(sca_c4)$condition,
               levels = c("AD","CON"))
cond <- relevel(cond,"CON")
colData(sca_c4)$Group <- cond

# Does ngeneson correlate with a PC?

pca_embed <- data.frame(seu_c4[["pca"]]@cell.embeddings)

pca_vars <-  seu_c4[["pca"]]@stdev^2 # variance for each pc
total_var <- sum(pca_vars) # total variance
pct_var_explained <- pca_vars/total_var * 100 
pct_var_explained <- round(pct_var_explained, 1)

for_plot <- data.frame(colData(sca_c4))
for_plot <- merge(for_plot, pca_embed, by = 0)

library(ggpubr)
p1 <- ggplot(for_plot, aes(x = ngeneson, y = PC_1, color = condition)) + geom_point() + ylab(paste0("PC_1 (", pct_var_explained[1],"%)")) 
p2 <- ggplot(for_plot, aes(x = ngeneson, y = PC_2, color = condition)) + geom_point() + ylab(paste0("PC_2 (", pct_var_explained[2],"%)")) 
p3 <- ggplot(for_plot, aes(x = ngeneson, y = PC_3, color = condition)) + geom_point() + ylab(paste0("PC_3 (", pct_var_explained[3],"%)")) 
ggarrange(p1, p2, p3, nrow = 1, common.legend = TRUE, legend="bottom")

# run MAST
zlmCond_c4 <- zlm(~ngeneson + Group, sca_c4)
sumZLM_c4 <- summary(zlmCond_c4,
                  doLRT=paste0("Group","AD"))
sumDT_c4 <- sumZLM_c4$datatable

# turn the outpput into a nice DE dataframe
de_mast_c4 <- merge(sumDT_c4[sumDT_c4$component=="H"&sumDT_c4$contrast==paste0("Group","AD"),],
               sumDT_c4[sumDT_c4$component=="logFC"&sumDT_c4$contrast==paste0("Group","AD"),],by='primerid')
de_mast_c4 <- dplyr::select(de_mast_c4,primerid,coef.y,z.y,`Pr(>Chisq).x`)
names(de_mast_c4) <- c("geneID","log2Fc","z","pvalue")
de_mast_c4$FDR <- p.adjust(de_mast_c4$pvalue,'fdr')
de_mast_c4 <- de_mast_c4[order(de_mast_c4$FDR),]
de_mast_c4

# should we include Sex as a confounding factor?
# Check for the sex specific XIST gene expression
# not differentiall expressed - good!
de_mast_c4[de_mast_c4$geneID == "XIST", ]

# these plots show that there is not a strong overlap between XIST and a particular group
# bias doesn't seem to be as strong as in the excitatory neuronsm so probably don't need to regress it out
FeaturePlot(seu_c4, features = "XIST")
DimPlot(seu_c4, split.by = "condition")
top_up_AD <- head(de_mast_c4[de_mast_c4$log2Fc > 0, ], 5)
vln_mast <- VlnPlot(seu_c4, features = top_up_AD$geneID, stack = T, flip = T, pt.size = 1)  + scale_x_discrete(labels=c('CON', 'AD'))
seu_c4_aggr <- Seurat::AggregateExpression(seu_c4, return.seurat = T, group.by = c("sample_id", "condition"))
# get the raw un-normalized counts
counts_aggr <- as.matrix(seu_c4_aggr[["RNA"]]$counts)

library(DESeq2)
column_meta <- data.frame(
  row.names = colnames(counts_aggr),
  condition = ifelse(grepl("AD", colnames(counts_aggr)), "AD", "CON")
)
column_meta$condition <- factor(column_meta$condition, levels = c("CON", "AD"))


dds_pseudo_c4 <- DESeqDataSetFromMatrix(countData = counts_aggr, colData = column_meta, design = ~condition)
colData(dds_pseudo_c4)

# filter lowly expressed genes
smallestGroupSize <- 2
keep <- rowSums(counts(dds_pseudo_c4) >= 10) >= smallestGroupSize
table(keep)
dds_pseudo_c4 <- dds_pseudo_c4[keep,]

# run deseq2
dds_pseudo_c4 <- DESeq(dds_pseudo_c4)
resultsNames(dds_pseudo_c4)
res_pseudo_c4 <- results(dds_pseudo_c4, name = "condition_AD_vs_CON")

de_pseudo_c4 <- res_pseudo_c4 %>%
  data.frame %>%
  arrange(pvalue) %>%
  rownames_to_column(var = "geneID") 

# Make PCA plot of aggregates count matrix ans color by condition and sex

sample_ids <- rownames(colData(dds_pseudo_c4))
colData(dds_pseudo_c4)$sex <- ifelse(sample_ids == "AD2b_AD", "female", 
                                      ifelse(sample_ids == "AD4_AD", "male",
                                             ifelse(sample_ids == "C1_CON", "male",
                                                    ifelse(sample_ids == "C3_CON", "female", "none!"))))

p_cond <- plotPCA(rlog(dds_pseudo_c4), intgroup = "condition")  + coord_cartesian() + theme_bw() + ggtitle("PCA")
p_sex <- plotPCA(rlog(dds_pseudo_c4), intgroup = "sex")  + coord_cartesian() + theme_bw() + ggtitle("PCA")
de_pseudo_c4$sig <- de_pseudo_c4$padj < 0.05
de_mast_c4$sig <- de_mast_c4$FDR < 0.05

mast_pseudo <- merge(de_mast_c4, de_pseudo_c4, by = "geneID", suffixes = c("_mast", "_pseudo"))

# see overlap in significantly changing genes
de_compare <- table(mast_pseudo$sig_mast, mast_pseudo$sig_pseudo)
rownames(de_compare) <- c("mast_false", "mast_true")
colnames(de_compare) <- c("pseudo_false", "pseudo_true")
de_compare


# correlate -log10p
p_logp <- ggplot(mast_pseudo, aes(x = -log10(FDR), y = -log10(padj))) + geom_point() + xlab("-log10(FDR) for MAST")  + ylab("-log10(padj) for DESeq2")

p_fc <- ggplot(mast_pseudo, aes(x = log2Fc, y = log2FoldChange)) + geom_point() + xlab("log2FC for MAST")  + ylab("log2FC for DESeq2")