ChIPseq in Bioconductor exercises (part 2)

ChIPseq with replicates

In this practical we will investigate some CTCF ChIPseq for the Mel and Ch12 cellines.

Peak calls can be found in Data/CTCFpeaks/

I have also precounted CTCF signal high confidence peaks and save the results as CTCFcounts in an RData object called CTCFcounts.RData.

Information and raw/processed files can be found for Ch12 cell line here and Myc cell line here

Load the CTCF peaks into a GRangesList object.

library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(tracktables)
library(limma)

peakFiles <- dir("data/CTCFpeaks/",pattern="*.peaks",
                 full.names = TRUE)
peakFiles

## [1] "data/CTCFpeaks//CTCF_Ch12_1_peaks.xls" "data/CTCFpeaks//CTCF_Ch12_2_peaks.xls"
## [3] "data/CTCFpeaks//CTCF_MEL_1_peaks.xls"  "data/CTCFpeaks//CTCF_MEL_2_peaks.xls"

macsPeaks_DF <- list()
for(i in 1:length(peakFiles)){
  macsPeaks_DF[[i]] <- read.delim(peakFiles[i],
                                  comment.char="#")
}

macsPeaks_GR <- list()
for(i in 1:length(macsPeaks_DF)){
     peakDFtemp <- macsPeaks_DF[[i]]
     macsPeaks_GR[[i]] <- GRanges(
     seqnames=peakDFtemp[,"chr"],
     IRanges(peakDFtemp[,"start"],
             peakDFtemp[,"end"]
     )
  )
}
fileNames <- basename(peakFiles)
sampleNames <- gsub("_peaks.xls","",fileNames)
macsPeaks_GRL <- GRangesList(macsPeaks_GR)
names(macsPeaks_GRL) <- sampleNames

Create a bar chart of number of peaks in each sample.

peakNum <- lengths(macsPeaks_GRL)
toPlot <- data.frame(Samples=names(peakNum),Total=peakNum,Cellline=c("Ch12","Ch12","Mel","Mel"))
library(ggplot2)
ggplot(toPlot,aes(x=Samples,y=Total,fill=Cellline))+geom_bar(stat="identity")+coord_flip()+theme_minimal()

Extract the peaks common to all replicates and cell-lines

allPeaksSet_Overlapping <- unlist(macsPeaks_GRL)
allPeaksSet_nR <- reduce(allPeaksSet_Overlapping)
overlap <- list()
for(i in 1:length(macsPeaks_GRL)){
  overlap[[i]] <- allPeaksSet_nR %over% macsPeaks_GRL[[i]]
}
overlapMatrix <- do.call(cbind,overlap)
colnames(overlapMatrix) <- names(macsPeaks_GRL)
mcols(allPeaksSet_nR) <- overlapMatrix
HC_CommonPeaks <- allPeaksSet_nR[
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 &
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2  
  ]

Annotate common peaks to mm10 genes (TSS +/- 500) using ChIPseeker and create and upset plot on annotation.

library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
peakAnno_CTCF <- annotatePeak(HC_CommonPeaks, tssRegion=c(-500, 500), 
                         TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                         annoDb="org.Mm.eg.db")

## >> preparing features information...      2025-06-05 20:14:21 
## >> identifying nearest features...        2025-06-05 20:14:21 
## >> calculating distance from peak to TSS...   2025-06-05 20:14:21 
## >> assigning genomic annotation...        2025-06-05 20:14:21 
## >> adding gene annotation...          2025-06-05 20:14:23

## 'select()' returned 1:many mapping between keys and columns

## >> assigning chromosome lengths           2025-06-05 20:14:24 
## >> done...                    2025-06-05 20:14:24

upsetplot(peakAnno_CTCF)

## Warning: Removed 121 rows containing non-finite outside the scale range (`stat_count()`).

5 Center common peaks to 100bp around geometric centre, extract sequence under region to FASTA and submit to Meme-ChIP. To save time, randomly sample 1000 sequences to submit to Meme-ChIP.

library(BSgenome.Mmusculus.UCSC.mm10)
toMotif <- resize(HC_CommonPeaks,100,fix="center")
peaksSequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10,
                         toMotif)
names(peaksSequences) <- paste0(seqnames(toMotif),":",
                                         start(toMotif),
                                         "-",
                                         end(toMotif))
writeXStringSet(peaksSequences[sample(1:length(peaksSequences),1000)],file="commonCTCF.fa")

6 Load in the CTCF counts, identify peaks with higher signal in Ch12 cell line (padj <0.05, log2FoldChange > 3) and create an HTML report with tracktables.

## For example ##
HC_Peaks <- allPeaksSet_nR[
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 |
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2  
  ]

library(Rsamtools)


load("data/CTCFcounts.RData")

metaDataFrame <- data.frame(CellLine=c("Ch12","Ch12","Mel","Mel"))
rownames(metaDataFrame) <- colnames(CTCFcounts)
library(DESeq2)
deseqCTCF <- DESeqDataSetFromMatrix(countData = assay(CTCFcounts),
                              colData = metaDataFrame,
                              design = ~ CellLine,
                              rowRanges=HC_Peaks)

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula
## are characters, converting to factors

deseqCTCF <- DESeq(deseqCTCF)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

Ch12MinusMel <- results(deseqCTCF,
                        contrast = c("CellLine","Ch12","Mel"),
                        format="GRanges")

Ch12MinusMelFilt <- Ch12MinusMel[!is.na(Ch12MinusMel$pvalue) | !is.na(Ch12MinusMel$padj)]
UpinCh12 <-  Ch12MinusMelFilt[Ch12MinusMelFilt$padj < 0.05 & Ch12MinusMelFilt$log2FoldChange > 0]
library(tracktables)
makebedtable(UpinCh12,"CTCF_UpinCh12.html",getwd())

## [1] "/Users/thomascarroll/Github/RU_ChIPseq/extdata/CTCF_UpinCh12.html"

7 Using rGreat, identify MSigDB pathways enriched for targets genes of our CTCF peaks which are significantly higher in Ch12.

library(rGREAT)
great_Job <- submitGreatJob(UpinCh12,species="mm10",request_interval=1,version = "3.0.0")

## Warning in submitGreatJob(UpinCh12, species = "mm10", request_interval = 1, : GREAT gives a warning:
## Your set hits a large fraction of the genes in the genome, which often does not
## work well with the GREAT Significant by Both view due to a saturation of the
## gene-based hypergeometric test.
## 
## See our tips for handling large datasets or try the Significant By Region-based
## Binomial view.

great_ResultTable = getEnrichmentTables(great_Job,
                                        category="Pathway Data")

## The default enrichment table does not contain informatin of associated genes for
## each input region. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the following
## link:
## 
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport
## 
## Except the additional gene-region association column if taking 'tsv' as the
## source of result, all other columns are the same if you choose 'json' (the
## default) as the source. Or you can try the local GREAT analysis with the function
## `great()`.

names(great_ResultTable)

## [1] "PANTHER Pathway" "BioCyc Pathway"  "MSigDB Pathway"

msigPath_UpinCh12 <- great_ResultTable[["MSigDB Pathway"]]

ChIPseq in Bioconductor exercises (part 2)

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/RU_ChIPseq/

ChIPseq with replicates