In this practical we will investigate some CTCF ChIPseq for the Mel and Ch12 cellines.
Peak calls can be found in Data/CTCFpeaks/
I have also precounted CTCF signal high confidence peaks and save the results as CTCFcounts in an RData object called CTCFcounts.RData.
Information and raw/processed files can be found for Ch12 cell line here and Myc cell line here
library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(tracktables)
library(limma)
peakFiles <- dir("data/CTCFpeaks/",pattern="*.peaks",
full.names = TRUE)
peakFiles
## [1] "data/CTCFpeaks//CTCF_Ch12_1_peaks.xls" "data/CTCFpeaks//CTCF_Ch12_2_peaks.xls"
## [3] "data/CTCFpeaks//CTCF_MEL_1_peaks.xls" "data/CTCFpeaks//CTCF_MEL_2_peaks.xls"
macsPeaks_DF <- list()
for(i in 1:length(peakFiles)){
macsPeaks_DF[[i]] <- read.delim(peakFiles[i],
comment.char="#")
}
macsPeaks_GR <- list()
for(i in 1:length(macsPeaks_DF)){
peakDFtemp <- macsPeaks_DF[[i]]
macsPeaks_GR[[i]] <- GRanges(
seqnames=peakDFtemp[,"chr"],
IRanges(peakDFtemp[,"start"],
peakDFtemp[,"end"]
)
)
}
fileNames <- basename(peakFiles)
sampleNames <- gsub("_peaks.xls","",fileNames)
macsPeaks_GRL <- GRangesList(macsPeaks_GR)
names(macsPeaks_GRL) <- sampleNames
peakNum <- lengths(macsPeaks_GRL)
toPlot <- data.frame(Samples=names(peakNum),Total=peakNum,Cellline=c("Ch12","Ch12","Mel","Mel"))
library(ggplot2)
ggplot(toPlot,aes(x=Samples,y=Total,fill=Cellline))+geom_bar(stat="identity")+coord_flip()+theme_minimal()
allPeaksSet_Overlapping <- unlist(macsPeaks_GRL)
allPeaksSet_nR <- reduce(allPeaksSet_Overlapping)
overlap <- list()
for(i in 1:length(macsPeaks_GRL)){
overlap[[i]] <- allPeaksSet_nR %over% macsPeaks_GRL[[i]]
}
overlapMatrix <- do.call(cbind,overlap)
colnames(overlapMatrix) <- names(macsPeaks_GRL)
mcols(allPeaksSet_nR) <- overlapMatrix
HC_CommonPeaks <- allPeaksSet_nR[
rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 &
rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2
]
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
peakAnno_CTCF <- annotatePeak(HC_CommonPeaks, tssRegion=c(-500, 500),
TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene,
annoDb="org.Mm.eg.db")
## >> preparing features information... 2025-06-05 20:14:21
## >> identifying nearest features... 2025-06-05 20:14:21
## >> calculating distance from peak to TSS... 2025-06-05 20:14:21
## >> assigning genomic annotation... 2025-06-05 20:14:21
## >> adding gene annotation... 2025-06-05 20:14:23
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths 2025-06-05 20:14:24
## >> done... 2025-06-05 20:14:24
## Warning: Removed 121 rows containing non-finite outside the scale range (`stat_count()`).
5 Center common peaks to 100bp around geometric centre, extract sequence under region to FASTA and submit to Meme-ChIP. To save time, randomly sample 1000 sequences to submit to Meme-ChIP.
library(BSgenome.Mmusculus.UCSC.mm10)
toMotif <- resize(HC_CommonPeaks,100,fix="center")
peaksSequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10,
toMotif)
names(peaksSequences) <- paste0(seqnames(toMotif),":",
start(toMotif),
"-",
end(toMotif))
writeXStringSet(peaksSequences[sample(1:length(peaksSequences),1000)],file="commonCTCF.fa")
6 Load in the CTCF counts, identify peaks with higher signal in Ch12 cell line (padj <0.05, log2FoldChange > 3) and create an HTML report with tracktables.
## For example ##
HC_Peaks <- allPeaksSet_nR[
rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 |
rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2
]
library(Rsamtools)
load("data/CTCFcounts.RData")
metaDataFrame <- data.frame(CellLine=c("Ch12","Ch12","Mel","Mel"))
rownames(metaDataFrame) <- colnames(CTCFcounts)
library(DESeq2)
deseqCTCF <- DESeqDataSetFromMatrix(countData = assay(CTCFcounts),
colData = metaDataFrame,
design = ~ CellLine,
rowRanges=HC_Peaks)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula
## are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Ch12MinusMel <- results(deseqCTCF,
contrast = c("CellLine","Ch12","Mel"),
format="GRanges")
Ch12MinusMelFilt <- Ch12MinusMel[!is.na(Ch12MinusMel$pvalue) | !is.na(Ch12MinusMel$padj)]
UpinCh12 <- Ch12MinusMelFilt[Ch12MinusMelFilt$padj < 0.05 & Ch12MinusMelFilt$log2FoldChange > 0]
library(tracktables)
makebedtable(UpinCh12,"CTCF_UpinCh12.html",getwd())
## [1] "/Users/thomascarroll/Github/RU_ChIPseq/extdata/CTCF_UpinCh12.html"
7 Using rGreat, identify MSigDB pathways enriched for targets genes of our CTCF peaks which are significantly higher in Ch12.
library(rGREAT)
great_Job <- submitGreatJob(UpinCh12,species="mm10",request_interval=1,version = "3.0.0")
## Warning in submitGreatJob(UpinCh12, species = "mm10", request_interval = 1, : GREAT gives a warning:
## Your set hits a large fraction of the genes in the genome, which often does not
## work well with the GREAT Significant by Both view due to a saturation of the
## gene-based hypergeometric test.
##
## See our tips for handling large datasets or try the Significant By Region-based
## Binomial view.
## The default enrichment table does not contain informatin of associated genes for
## each input region. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the following
## link:
##
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport
##
## Except the additional gene-region association column if taking 'tsv' as the
## source of result, all other columns are the same if you choose 'json' (the
## default) as the source. Or you can try the local GREAT analysis with the function
## `great()`.
## [1] "PANTHER Pathway" "BioCyc Pathway" "MSigDB Pathway"