ChIPseq with replicates

In this practical we will investigate some CTCF ChIPseq for the Mel and Ch12 cellines.

Peak calls can be found in Data/CTCFpeaks/

I have also precounted CTCF signal high confidence peaks and save the results as CTCFcounts in an RData object called CTCFcounts.RData.

Information and raw/processed files can be found for Ch12 cell line here and Myc cell line here

  1. Load the CTCF peaks into a GRangesList object.
library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(tracktables)
library(limma)

peakFiles <- dir("data/CTCFpeaks/",pattern="*.peaks",
                 full.names = TRUE)
peakFiles
## [1] "data/CTCFpeaks//CTCF_Ch12_1_peaks.xls"
## [2] "data/CTCFpeaks//CTCF_Ch12_2_peaks.xls"
## [3] "data/CTCFpeaks//CTCF_MEL_1_peaks.xls" 
## [4] "data/CTCFpeaks//CTCF_MEL_2_peaks.xls"
macsPeaks_DF <- list()
for(i in 1:length(peakFiles)){
  macsPeaks_DF[[i]] <- read.delim(peakFiles[i],
                                  comment.char="#")
}

macsPeaks_GR <- list()
for(i in 1:length(macsPeaks_DF)){
     peakDFtemp <- macsPeaks_DF[[i]]
     macsPeaks_GR[[i]] <- GRanges(
     seqnames=peakDFtemp[,"chr"],
     IRanges(peakDFtemp[,"start"],
             peakDFtemp[,"end"]
     )
  )
}
fileNames <- basename(peakFiles)
sampleNames <- gsub("_peaks.xls","",fileNames)
macsPeaks_GRL <- GRangesList(macsPeaks_GR)
names(macsPeaks_GRL) <- sampleNames
  1. Create a bar chart of number of peaks in each sample.
peakNum <- lengths(macsPeaks_GRL)
toPlot <- data.frame(Samples=names(peakNum),Total=peakNum,Cellline=c("Ch12","Ch12","Mel","Mel"))
library(ggplot2)
ggplot(toPlot,aes(x=Samples,y=Total,fill=Cellline))+geom_bar(stat="identity")+coord_flip()+theme_minimal()

  1. Extract the peaks common to all replicates and cell-lines
allPeaksSet_Overlapping <- unlist(macsPeaks_GRL)
allPeaksSet_nR <- reduce(allPeaksSet_Overlapping)
overlap <- list()
for(i in 1:length(macsPeaks_GRL)){
  overlap[[i]] <- allPeaksSet_nR %over% macsPeaks_GRL[[i]]
}
overlapMatrix <- do.call(cbind,overlap)
colnames(overlapMatrix) <- names(macsPeaks_GRL)
mcols(allPeaksSet_nR) <- overlapMatrix
HC_CommonPeaks <- allPeaksSet_nR[
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 &
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2  
  ]
  1. Annotate common peaks to mm10 genes (TSS +/- 500) using ChIPseeker and create and upset plot on annotation.
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
peakAnno_CTCF <- annotatePeak(HC_CommonPeaks, tssRegion=c(-500, 500), 
                         TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                         annoDb="org.Mm.eg.db")
## >> preparing features information...      2021-08-02 03:27:42 PM 
## >> identifying nearest features...        2021-08-02 03:27:42 PM 
## >> calculating distance from peak to TSS...   2021-08-02 03:27:43 PM 
## >> assigning genomic annotation...        2021-08-02 03:27:43 PM 
## >> adding gene annotation...          2021-08-02 03:27:46 PM
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths           2021-08-02 03:27:46 PM 
## >> done...                    2021-08-02 03:27:46 PM
upsetplot(peakAnno_CTCF)
## Warning: Removed 121 rows containing non-finite values (stat_count).

5 Center common peaks to 100bp around geometric centre, extract sequence under region to FASTA and submit to Meme-ChIP. To save time, randomly sample 1000 sequences to submit to Meme-ChIP.

library(BSgenome.Mmusculus.UCSC.mm10)
toMotif <- resize(HC_CommonPeaks,100,fix="center")
peaksSequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10,
                         toMotif)
names(peaksSequences) <- paste0(seqnames(toMotif),":",
                                         start(toMotif),
                                         "-",
                                         end(toMotif))
writeXStringSet(peaksSequences[sample(1:length(peaksSequences),1000)],file="commonCTCF.fa")

6 Load in the CTCF counts, identify peaks with higher signal in Ch12 cell line (padj <0.05, log2FoldChange > 3) and create an HTML report with tracktables.

## For example ##
HC_Peaks <- allPeaksSet_nR[
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 |
  rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2  
  ]

library(Rsamtools)


load("data/CTCFcounts.RData")

metaDataFrame <- data.frame(CellLine=c("Ch12","Ch12","Mel","Mel"))
rownames(metaDataFrame) <- colnames(CTCFcounts)
library(DESeq2)
deseqCTCF <- DESeqDataSetFromMatrix(countData = assay(CTCFcounts),
                              colData = metaDataFrame,
                              design = ~ CellLine,
                              rowRanges=HC_Peaks)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
deseqCTCF <- DESeq(deseqCTCF)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Ch12MinusMel <- results(deseqCTCF,
                        contrast = c("CellLine","Ch12","Mel"),
                        format="GRanges")

Ch12MinusMelFilt <- Ch12MinusMel[!is.na(Ch12MinusMel$pvalue) | !is.na(Ch12MinusMel$padj)]
UpinCh12 <-  Ch12MinusMelFilt[Ch12MinusMelFilt$padj < 0.05 & Ch12MinusMelFilt$log2FoldChange > 0]
library(tracktables)
makebedtable(UpinCh12,"CTCF_UpinCh12.html",getwd())
## [1] "/__w/RU_ChIPseq/RU_ChIPseq/extdata/CTCF_UpinCh12.html"

7 Using rGreat, identify MSigDB pathways enriched for targets genes of our CTCF peaks which are significantly higher in Ch12.

library(rGREAT)
great_Job <- submitGreatJob(UpinCh12,species="mm10",request_interval=1,version = "3.0.0")
## Warning in submitGreatJob(UpinCh12, species = "mm10", request_interval = 1, : GREAT gives a warning:
## Your set hits a large fraction of the genes in the genome, which often
## does not work well with the GREAT Significant by Both view due to a
## saturation of the gene-based hypergeometric test.
## 
## See our tips for handling large datasets or try the Significant By
## Region-based Binomial view.
great_ResultTable = getEnrichmentTables(great_Job,
                                        category="Pathway Data")
## The default enrichment tables contain no associated genes for the input
## regions. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the
## following link:
## 
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport
names(great_ResultTable)
## [1] "PANTHER Pathway" "BioCyc Pathway"  "MSigDB Pathway"
msigPath_UpinCh12 <- great_ResultTable[["MSigDB Pathway"]]