ATACseq Peak annotation

In these exercises we will review some of the functionality for genomic and functional annotation of peaks.

We will be using output from the differential peak analysis of ATACseq Week 6 vs Week 0. The starting table can be found in the ‘data’ folder of the course.

 

Exercises

1. Peak annotation with ChIPseeker

Read in the differential output table ‘ATAC_W6MinusD0.xlsx’, make a GRanges and do the following:

  • map all of the consensus peak regions to genomic features (e.g. promoter, intron, exon, intergenic, etc.)
  • subset the peaks to those that increase in the W6 time point, and map these peaks to genomic features
  • make plots (or a single plot) to visualize these two distributions
    • Hint: by providing the plotAnnoBar function with a named list, multiple distributions can be visualized at once.
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)

W6MinusD0 <- rio::import("data/ATAC_W6MinusD0.xlsx")
W6MinusD0_gr <- GRanges(seqnames = W6MinusD0$seqnames,
                        IRanges(start = W6MinusD0$start, end = W6MinusD0$end),
                        log2FoldChange = W6MinusD0$log2FoldChange,
                        padj = W6MinusD0$padj)
peakAnno_all <- annotatePeak(W6MinusD0_gr, tssRegion=c(-1000, 1000), 
                         TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                         annoDb="org.Mm.eg.db")

W6MinusD0_gr_main <- W6MinusD0_gr[as.vector(seqnames(W6MinusD0_gr)) %in% paste0("chr", c(1:19, "X", "Y", "M"))]
W6MinusD0_gr_up <- W6MinusD0_gr_main[W6MinusD0_gr_main$log2FoldChange > 1 & W6MinusD0_gr_main$padj < 0.05, ]

peakAnno_up <- annotatePeak(W6MinusD0_gr_up, tssRegion=c(-1000, 1000), 
                         TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                         annoDb="org.Mm.eg.db")

plotAnnoBar(list(all_peaks = peakAnno_all, SigUp_peaks = peakAnno_up),
            title = "ATACseq peak distribution")

2. Functional enrichment of promoter peaks

Perform functional enrichment using the genes that have significant increases in ATACseq signal within peaks located in their promoters.

  • Use the Biological Processes gene sets from the GO consortium as well as the Reactome gene sets available through the MsigDB R package.
  • make a network plot to visualize each result
  • try out other gene sets that interest you from MSigDB.
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(ggplot2)

# get universe of genes
allGeneGR <- genes(TxDb.Mmusculus.UCSC.mm10.knownGene)
allGeneGR[1:2,]
allGeneIDs <- allGeneGR$gene_id


up_atac_GR <- as.GRanges(peakAnno_up)
up_atac_GR_prom <- up_atac_GR[grepl("Promoter", up_atac_GR$annotation)]
up_promGenes_uniq <- unique(up_atac_GR_prom$geneId)

GO_result_prom <- enrichGO(gene = up_promGenes_uniq, 
                           universe = allGeneIDs,
                           OrgDb = org.Mm.eg.db,
                           ont = "BP")

library(msigdbr)
msig_t2g_reac <- msigdbr(species = "Mus musculus", 
                    category = "C2", 
                    subcategory = "REACTOME")
msig_t2g_reac <- msig_t2g_reac[ , colnames(msig_t2g_reac) %in% c("gs_name", "entrez_gene")]

reactome_result_prom <- enricher(gene = up_promGenes_uniq, 
                                 universe = allGeneIDs,
                                 TERM2GENE = msig_t2g_reac)

GO BP output

data.frame(GO_result_prom, row.names = NULL)[1:3, 1:10]
##           ID                              Description GeneRatio   BgRatio
## 1 GO:0008154 actin polymerization or depolymerization    19/664 199/22195
## 2 GO:0007015              actin filament organization    32/664 474/22195
## 3 GO:0032535    regulation of cellular component size    29/664 413/22195
##   RichFactor FoldEnrichment   zScore       pvalue   p.adjust     qvalue
## 1 0.09547739       3.191447 5.453249 9.087002e-06 0.03324488 0.03085578
## 2 0.06751055       2.256621 4.856501 1.802453e-05 0.03324488 0.03085578
## 3 0.07021792       2.347118 4.852904 2.156424e-05 0.03324488 0.03085578
GO_result_plot <- pairwise_termsim(GO_result_prom)
emapplot(GO_result_plot, showCategory = 20, cex.params = list(category_label = 0.6))

Reactome output:

  • no enriched terms!
data.frame(reactome_result_prom, row.names = NULL)[1:3, 1:10]
##        ID Description GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## NA   <NA>        <NA>      <NA>    <NA>         NA             NA     NA     NA
## NA.1 <NA>        <NA>      <NA>    <NA>         NA             NA     NA     NA
## NA.2 <NA>        <NA>      <NA>    <NA>         NA             NA     NA     NA
##      p.adjust qvalue
## NA         NA     NA
## NA.1       NA     NA
## NA.2       NA     NA
# no enriched terms!

3. Functional enrichment of all peaks with GREAT

  • using the bionomial test from GREAT, use the entire set of peaks that go up at W6 vs W0 and test enrichment for GO-Biological processes and Reactome gene sets.
  • make a bar plot showing the fold enrichment for the top 10 categories and fill the color of the bars based on the observed hits column.
  • try other gene sets from MSigDB.
library(rGREAT)

great_gobp <- great(W6MinusD0_gr_up, 
                    gene_sets = "GO:BP", 
                    tss_source = "TxDb.Mmusculus.UCSC.mm10.knownGene")
great_gobp_tab <- getEnrichmentTables(great_gobp)
great_gobp_tab10 <- great_gobp_tab[1:10, ]
great_gobp_tab10$description <- ordered(great_gobp_tab10$description, levels = rev(great_gobp_tab10$description))


reac_gene_sets <- split(msig_t2g_reac$entrez_gene, msig_t2g_reac$gs_name)
reac_gene_sets <- lapply(reac_gene_sets, as.character)
great_reac <- great(W6MinusD0_gr_up, gene_sets = reac_gene_sets, tss_source = "TxDb.Mmusculus.UCSC.mm10.knownGene", mode = "oneClosest")

great_reac_tab <- getEnrichmentTable(great_reac)
great_reac_tab10 <- great_reac_tab[1:10, ]
great_reac_tab10 <- great_reac_tab10[order(great_reac_tab10$fold_enrichment, decreasing = TRUE), ]
great_reac_tab10$name_for_plot <- gsub("REACTOME_" ,"", great_reac_tab10$id)
great_reac_tab10$name_for_plot <- substr(great_reac_tab10$name_for_plot, 1, 50)
great_reac_tab10$name_for_plot <- ordered(great_reac_tab10$name_for_plot, levels = rev(great_reac_tab10$name_for_plot))

GO BP output:

great_gobp_tab10[1:3, ]
##           id
## 1 GO:0043378
## 2 GO:0014883
## 3 GO:0030854
##                                                              description
## 1 positive regulation of CD8-positive, alpha-beta T cell differentiation
## 2                                 transition between fast and slow fiber
## 3                     positive regulation of granulocyte differentiation
##   genome_fraction observed_region_hits fold_enrichment p_value p_adjust
## 1    0.0008624976                   48        4.621135       0        0
## 2    0.0009041473                   50        4.591939       0        0
## 3    0.0011238427                   59        4.359250       0        0
##   mean_tss_dist observed_gene_hits gene_set_size fold_enrichment_hyper
## 1        245757                  4             6             1.8848901
## 2        149844                  5            10             1.4136676
## 3        241655                  4            12             0.9424451
##   p_value_hyper p_adjust_hyper
## 1     0.1214419      0.2303484
## 2     0.2566108      0.3881452
## 3     0.6633171      0.7644486
ggplot(great_gobp_tab10, aes(x = description, y = fold_enrichment, fill = observed_region_hits)) + geom_col() + coord_flip()

Reactome output:

great_reac_tab10[1:3, ]
##                                             id genome_fraction
## 1             REACTOME_RUNX3_REGULATES_P14_ARF    0.0006337473
## 2 REACTOME_TRANSCRIPTIONAL_REGULATION_BY_RUNX3    0.0047532724
## 3                   REACTOME_RND3_GTPASE_CYCLE    0.0025275757
##   observed_region_hits fold_enrichment p_value p_adjust mean_tss_dist
## 1                   42        5.502987       0        0        109650
## 2                  178        3.109515       0        0        143895
## 3                   87        2.858120       0        0        129096
##   observed_gene_hits gene_set_size fold_enrichment_hyper p_value_hyper
## 1                  9            10              3.669711  2.488930e-05
## 2                 45            96              1.911308  1.552911e-06
## 3                 20            42              1.941646  9.581214e-04
##   p_adjust_hyper                       name_for_plot
## 1   0.0010169056             RUNX3_REGULATES_P14_ARF
## 2   0.0001057458 TRANSCRIPTIONAL_REGULATION_BY_RUNX3
## 3   0.0126862373                   RND3_GTPASE_CYCLE
ggplot(great_reac_tab10, aes(x = name_for_plot, y = fold_enrichment, fill = observed_region_hits)) + geom_col() + coord_flip()