ChIPseq functional annotation and motif identification.

In this practical we will investigate the motifs and pathways associated with the Ebf1 transcription factor in proB-cells and B-cells.

  1. Load the ebf1 peaks for proB and B cells into R.
suppressPackageStartupMessages(library(rtracklayer))

library(rtracklayer)
ebf1_proB <- import.bed("data/ebf1_proB.bed")
ebf1_B <- import.bed("data/ebf1_B.bed")
  1. Find peaks in B cells which we also present in proB cells.
ebf1_BCommon <- ebf1_B[ebf1_B %over% ebf1_proB]
  1. Extract the sequence 100bp (+/-50bp) around the geometric centre of these peaks, write to FASTA and submit to Meme-ChIP.
library(BSgenome.Mmusculus.UCSC.mm10)
toMotif <- resize(ebf1_BCommon,100,fix="center")
peaksSequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10,
                         toMotif)
names(peaksSequences) <- paste0(seqnames(toMotif),":",
                                         start(toMotif),
                                         "-",
                                         end(toMotif))
writeXStringSet(peaksSequences,file="commonEBFinB.fa")

Results files from Meme-ChIP can be found here

  1. Annotate common B cell peaks to genes with TSS set as +/- 500bp
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
peakAnno_Ebf1B <- annotatePeak(ebf1_BCommon, tssRegion=c(-500, 500), 
                         TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                         annoDb="org.Mm.eg.db")
## >> preparing features information...      2025-06-05 20:12:42 
## >> identifying nearest features...        2025-06-05 20:12:49 
## >> calculating distance from peak to TSS...   2025-06-05 20:12:50 
## >> assigning genomic annotation...        2025-06-05 20:12:50 
## >> adding gene annotation...          2025-06-05 20:13:03
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths           2025-06-05 20:13:03 
## >> done...                    2025-06-05 20:13:03
  1. Create a barplot of the occurrence of differing annotations (“Promoter”, “Distal Intergenic” and “other”).
myAnnotation <- as.data.frame(peakAnno_Ebf1B)
simpleAnnotation <- as.vector(myAnnotation$annotation)
simpleAnnotation[!simpleAnnotation %in% c("Promoter", "Distal Intergenic")] <- "other"
forPlot <- data.frame(Annotation=simpleAnnotation)
library(ggplot2)
ggplot(forPlot,aes(x=Annotation,fill=Annotation))+geom_bar()+coord_flip()+theme_bw()+ggtitle("Simple distribution of B-cell Ebf1 binding sites")

  1. Perform a geneset enrichment test of genes with peaks in promoter against BP GO terms using clusterProfiler. Make a dotplot and an enrichment map of the results. Finally export the result.
library(clusterProfiler)
library(org.Mm.eg.db)

myAnnotation_TSS <- myAnnotation[
  myAnnotation$annotation == "Promoter",]
genesWithPeakInTSS <- unique(myAnnotation_TSS$geneId)

allGeneGR <- genes(TxDb.Mmusculus.UCSC.mm10.knownGene)
##   66 genes were dropped because they have exons located on both strands of the same
##   reference sequence or on more than one reference sequence, so cannot be
##   represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a GRangesList
##   object, or use suppressMessages() to suppress this message.
allGeneIDs <- allGeneGR$gene_id

GO_result <- enrichGO(gene = genesWithPeakInTSS, 
                      universe = allGeneIDs,
                      OrgDb = org.Mm.eg.db,
                      ont = "BP")

dotplot(GO_result)

library(enrichplot)
GO_result_plot <- pairwise_termsim(GO_result)
emapplot(GO_result_plot, showCategory = 20, cex_label_category=0.5 )+ theme(text = element_text(size = 10)) 
## Warning in emapplot.enrichResult(x, showCategory = showCategory, ...): Use 'cex.params = list(category_label = your_value)' instead of 'cex_label_category'.
##  The cex_label_category parameter will be removed in the next version.

write.csv(as.data.frame(GO_result), "ebf1_B_vs_ebf1_proB_diff_promoter_peaks_GOterm.csv")
  1. Identify Ebf1 peaks only in Bcells and Ebf1 peaks only in proB cells.
ebf1_Bunique<- ebf1_B[!ebf1_B %over% ebf1_proB]
ebf1_proBunique<- ebf1_proB[!ebf1_proB %over% ebf1_B]

8 Using rGREAT identify the MSigDB Pathways enriched in Ebf1 peaks only in Bcells and Ebf1 peaks only in proB cells.

library(rGREAT)
great_Job <- submitGreatJob(ebf1_Bunique,species="mm10",request_interval=1,version = "3.0.0")
availableCategories(great_Job)
## [1] "GO"                               "Phenotype Data and Human Disease"
## [3] "Pathway Data"                     "Gene Expression"                 
## [5] "Regulatory Motifs"                "Gene Families"
great_ResultTable = getEnrichmentTables(great_Job,
                                        category="Pathway Data")
## The default enrichment table does not contain informatin of associated genes for
## each input region. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the following
## link:
## 
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport
## 
## Except the additional gene-region association column if taking 'tsv' as the
## source of result, all other columns are the same if you choose 'json' (the
## default) as the source. Or you can try the local GREAT analysis with the function
## `great()`.
names(great_ResultTable)
## [1] "PANTHER Pathway" "BioCyc Pathway"  "MSigDB Pathway"
msigPath_Bcells <- great_ResultTable[["MSigDB Pathway"]]

msigPath_Bcells[1:10,]
##                                                             ID
## 1                       KEGG_B_CELL_RECEPTOR_SIGNALING_PATHWAY
## 2                                             PID_BCR_5PATHWAY
## 3                     REACTOME_INTERFERON_ALPHA_BETA_SIGNALING
## 4  REACTOME_DOWNSTREAM_SIGNALING_EVENTS_OF_B_CELL_RECEPTOR_BCR
## 5                                   REACTOME_SIGNALLING_BY_NGF
## 6                REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR
## 7                                REACTOME_SIGNALING_BY_SCF_KIT
## 8                                         PID_RAC1_REG_PATHWAY
## 9                                       REACTOME_IMMUNE_SYSTEM
## 10                       REACTOME_PIP3_ACTIVATES_AKT_SIGNALING
##                                                                      name
## 1                                       B cell receptor signaling pathway
## 2                                                   BCR signaling pathway
## 3                       Genes involved in Interferon alpha/beta signaling
## 4  Genes involved in Downstream Signaling Events Of B Cell Receptor (BCR)
## 5                                     Genes involved in Signalling by NGF
## 6                Genes involved in Signaling by the B Cell Receptor (BCR)
## 7                                  Genes involved in Signaling by SCF-KIT
## 8                                             Regulation of RAC1 activity
## 9                                         Genes involved in Immune System
## 10                         Genes involved in PIP3 activates AKT signaling
##    Binom_Genome_Fraction Binom_Expected Binom_Observed_Region_Hits Binom_Fold_Enrichment
## 1            0.005806883      12.531250                         37              2.952618
## 2            0.006121690      13.210610                         38              2.876477
## 3            0.001747780       3.771710                         19              5.037503
## 4            0.006124732      13.217170                         37              2.799389
## 5            0.020280090      43.764440                         83              1.896517
## 6            0.009296625      20.062120                         48              2.392569
## 7            0.006275704      13.542970                         37              2.732045
## 8            0.004501446       9.714120                         30              3.088288
## 9            0.056859840     122.703500                        182              1.483250
## 10           0.002103194       4.538692                         19              4.186228
##    Binom_Region_Set_Coverage Binom_Raw_PValue Binom_Adjp_BH Hyper_Total_Genes
## 1                0.017145510     1.434094e-08  8.743757e-06                74
## 2                0.017608900     1.799178e-08  8.743757e-06                63
## 3                0.008804449     1.988724e-08  8.743757e-06                48
## 4                0.017145510     5.366261e-08  1.536739e-05                90
## 5                0.038461540     5.825394e-08  1.536739e-05               209
## 6                0.022242820     7.137937e-08  1.569156e-05               119
## 7                0.017145510     9.693786e-08  1.826586e-05                73
## 8                0.013901760     1.258219e-07  2.074489e-05                38
## 9                0.084337350     1.415786e-07  2.074913e-05               792
## 10               0.008804449     3.277231e-07  4.075300e-05                27
##    Hyper_Expected Hyper_Observed_Gene_Hits Hyper_Fold_Enrichment Hyper_Gene_Set_Coverage
## 1        9.909689                       25              2.522784             0.008523696
## 2        8.436627                       22              2.607677             0.007500852
## 3        6.427906                       11              1.711288             0.003750426
## 4       12.052320                       27              2.240232             0.009205592
## 5       27.988170                       57              2.036574             0.019434030
## 6       15.935850                       35              2.196306             0.011933170
## 7        9.775774                       25              2.557342             0.008523696
## 8        5.088759                       13              2.554650             0.004432322
## 9      106.060500                      141              1.329431             0.048073640
## 10       3.615697                       14              3.872006             0.004773270
##    Hyper_Term_Gene_Coverage Hyper_Raw_PValue Hyper_Adjp_BH
## 1                 0.3378378     6.065365e-06  3.200087e-04
## 2                 0.3492063     1.184083e-05  4.338349e-04
## 3                 0.2291667     4.924903e-02  1.840212e-01
## 4                 0.3000000     3.089307e-05  8.489158e-04
## 5                 0.2727273     7.317380e-08  2.244281e-05
## 6                 0.2941176     3.668240e-06  2.556315e-04
## 7                 0.3424658     4.597072e-06  3.031769e-04
## 8                 0.3421053     8.932663e-04  1.132902e-02
## 9                 0.1780303     2.077171e-04  4.005688e-03
## 10                0.5185185     2.083329e-06  1.962794e-04
great_Job <- submitGreatJob(ebf1_proBunique,species="mm10",request_interval=1,version = "3.0.0")
availableCategories(great_Job)
## [1] "GO"                               "Phenotype Data and Human Disease"
## [3] "Pathway Data"                     "Gene Expression"                 
## [5] "Regulatory Motifs"                "Gene Families"
great_ResultTable = getEnrichmentTables(great_Job,
                                        category="Pathway Data")
## The default enrichment table does not contain informatin of associated genes for
## each input region. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the following
## link:
## 
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport
## 
## Except the additional gene-region association column if taking 'tsv' as the
## source of result, all other columns are the same if you choose 'json' (the
## default) as the source. Or you can try the local GREAT analysis with the function
## `great()`.
names(great_ResultTable)
## [1] "PANTHER Pathway" "BioCyc Pathway"  "MSigDB Pathway"
msigPath_proBcells <- great_ResultTable[["MSigDB Pathway"]]


msigPath_proBcells[1:10,]
##                                                                           ID
## 1                                          REACTOME_METABOLISM_OF_PORPHYRINS
## 2                                                          PID_PDGFRBPATHWAY
## 3                             KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY
## 4                                                        REACTOME_HEMOSTASIS
## 5                                                     REACTOME_IMMUNE_SYSTEM
## 6                                                          PID_PI3KCIPATHWAY
## 7                                                           PID_BCR_5PATHWAY
## 8                                                    KEGG_TASTE_TRANSDUCTION
## 9                                      KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS
## 10 REACTOME_DEPOSITION_OF_NEW_CENPA_CONTAINING_NUCLEOSOMES_AT_THE_CENTROMERE
##                                                                                  name
## 1                                          Genes involved in Metabolism of porphyrins
## 2                                                        PDGFR-beta signaling pathway
## 3                                           Natural killer cell mediated cytotoxicity
## 4                                                        Genes involved in Hemostasis
## 5                                                     Genes involved in Immune System
## 6                                                       Class I PI3K signaling events
## 7                                                               BCR signaling pathway
## 8                                                                  Taste transduction
## 9                                                    Fc gamma R-mediated phagocytosis
## 10 Genes involved in Deposition of New CENPA-containing Nucleosomes at the Centromere
##    Binom_Genome_Fraction Binom_Expected Binom_Observed_Region_Hits Binom_Fold_Enrichment
## 1           0.0003517876       1.589376                         28             17.616970
## 2           0.0120526400      54.453810                        144              2.644443
## 3           0.0066675330      30.123920                         91              3.020856
## 4           0.0357002700     161.293800                        284              1.760762
## 5           0.0568598400     256.892800                        399              1.553177
## 6           0.0037282650      16.844300                         62              3.680770
## 7           0.0061216900      27.657790                         82              2.964806
## 8           0.0021354530       9.647976                         45              4.664191
## 9           0.0072730230      32.859520                         88              2.678067
## 10          0.0014112860       6.376189                         35              5.489173
##    Binom_Region_Set_Coverage Binom_Raw_PValue Binom_Adjp_BH Hyper_Total_Genes
## 1                0.006197432     2.831283e-25  3.734462e-22                14
## 2                0.031872510     2.554682e-24  1.684813e-21               127
## 3                0.020141660     2.290614e-19  1.007107e-16               101
## 4                0.062859670     3.066842e-19  1.011291e-16               424
## 5                0.088313410     1.308965e-17  3.453050e-15               792
## 6                0.013722890     1.835537e-17  4.035122e-15                48
## 7                0.018149620     3.742359e-17  7.051674e-15                63
## 8                0.009960159     1.186513e-16  1.956263e-14                26
## 9                0.019477640     9.529404e-16  1.396587e-13                92
## 10               0.007746791     2.642269e-15  3.485153e-13                51
##    Hyper_Expected Hyper_Observed_Gene_Hits Hyper_Fold_Enrichment Hyper_Gene_Set_Coverage
## 1        3.011962                        8             2.6560760             0.001697793
## 2       27.322800                       67             2.4521640             0.014219020
## 3       21.729160                       36             1.6567600             0.007640068
## 4       91.219430                      157             1.7211250             0.033319190
## 5      170.391000                      250             1.4672140             0.053056030
## 6       10.326730                       26             2.5177380             0.005517827
## 7       13.553830                       40             2.9511950             0.008488964
## 8        5.593644                        8             1.4301950             0.001697793
## 9       19.792900                       43             2.1724970             0.009125637
## 10      10.972150                        5             0.4556993             0.001061121
##    Hyper_Term_Gene_Coverage Hyper_Raw_PValue Hyper_Adjp_BH
## 1                0.57142860     3.888215e-03  2.182364e-02
## 2                0.52755910     1.008313e-14  1.329965e-11
## 3                0.35643560     7.711774e-04  6.826731e-03
## 4                0.37028300     1.334306e-13  8.799748e-11
## 5                0.31565660     1.298131e-11  4.280587e-09
## 6                0.54166670     7.359506e-07  4.044662e-05
## 7                0.63492060     7.825095e-13  3.440433e-10
## 8                0.30769230     1.788232e-01  3.428311e-01
## 9                0.46739130     6.382067e-08  6.475343e-06
## 10               0.09803922     9.916507e-01  1.000000e+00