In this practical we will investigate some CTCF ChIPseq for the Mel and Ch12 cellines.
Peak calls can be found in Data/CTCFpeaks/
I have also precounted CTCF signal high confidence peaks and save the results as CTCFcounts in an RData object called CTCFcounts.RData.
Information and raw/processed files can be found for Ch12 cell line here and Myc cell line here
library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(tracktables)
library(limma)
<- dir("data/CTCFpeaks/",pattern="*.peaks",
peakFiles full.names = TRUE)
peakFiles
## [1] "data/CTCFpeaks//CTCF_Ch12_1_peaks.xls"
## [2] "data/CTCFpeaks//CTCF_Ch12_2_peaks.xls"
## [3] "data/CTCFpeaks//CTCF_MEL_1_peaks.xls"
## [4] "data/CTCFpeaks//CTCF_MEL_2_peaks.xls"
<- list()
macsPeaks_DF for(i in 1:length(peakFiles)){
<- read.delim(peakFiles[i],
macsPeaks_DF[[i]] comment.char="#")
}
<- list()
macsPeaks_GR for(i in 1:length(macsPeaks_DF)){
<- macsPeaks_DF[[i]]
peakDFtemp <- GRanges(
macsPeaks_GR[[i]] seqnames=peakDFtemp[,"chr"],
IRanges(peakDFtemp[,"start"],
"end"]
peakDFtemp[,
)
)
}<- basename(peakFiles)
fileNames <- gsub("_peaks.xls","",fileNames)
sampleNames <- GRangesList(macsPeaks_GR)
macsPeaks_GRL names(macsPeaks_GRL) <- sampleNames
<- lengths(macsPeaks_GRL)
peakNum <- data.frame(Samples=names(peakNum),Total=peakNum,Cellline=c("Ch12","Ch12","Mel","Mel"))
toPlot library(ggplot2)
ggplot(toPlot,aes(x=Samples,y=Total,fill=Cellline))+geom_bar(stat="identity")+coord_flip()+theme_minimal()
<- unlist(macsPeaks_GRL)
allPeaksSet_Overlapping <- reduce(allPeaksSet_Overlapping)
allPeaksSet_nR <- list()
overlap for(i in 1:length(macsPeaks_GRL)){
<- allPeaksSet_nR %over% macsPeaks_GRL[[i]]
overlap[[i]]
}<- do.call(cbind,overlap)
overlapMatrix colnames(overlapMatrix) <- names(macsPeaks_GRL)
mcols(allPeaksSet_nR) <- overlapMatrix
<- allPeaksSet_nR[
HC_CommonPeaks rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 &
rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2
]
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
<- annotatePeak(HC_CommonPeaks, tssRegion=c(-500, 500),
peakAnno_CTCF TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene,
annoDb="org.Mm.eg.db")
## >> preparing features information... 2021-08-02 03:27:42 PM
## >> identifying nearest features... 2021-08-02 03:27:42 PM
## >> calculating distance from peak to TSS... 2021-08-02 03:27:43 PM
## >> assigning genomic annotation... 2021-08-02 03:27:43 PM
## >> adding gene annotation... 2021-08-02 03:27:46 PM
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths 2021-08-02 03:27:46 PM
## >> done... 2021-08-02 03:27:46 PM
upsetplot(peakAnno_CTCF)
## Warning: Removed 121 rows containing non-finite values (stat_count).
5 Center common peaks to 100bp around geometric centre, extract sequence under region to FASTA and submit to Meme-ChIP. To save time, randomly sample 1000 sequences to submit to Meme-ChIP.
library(BSgenome.Mmusculus.UCSC.mm10)
<- resize(HC_CommonPeaks,100,fix="center")
toMotif <- getSeq(BSgenome.Mmusculus.UCSC.mm10,
peaksSequences
toMotif)names(peaksSequences) <- paste0(seqnames(toMotif),":",
start(toMotif),
"-",
end(toMotif))
writeXStringSet(peaksSequences[sample(1:length(peaksSequences),1000)],file="commonCTCF.fa")
6 Load in the CTCF counts, identify peaks with higher signal in Ch12 cell line (padj <0.05, log2FoldChange > 3) and create an HTML report with tracktables.
## For example ##
<- allPeaksSet_nR[
HC_Peaks rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_Ch12_1","CTCF_Ch12_2")])) >= 2 |
rowSums(as.data.frame(mcols(allPeaksSet_nR)[,c("CTCF_MEL_1","CTCF_MEL_2")])) >= 2
]
library(Rsamtools)
load("data/CTCFcounts.RData")
<- data.frame(CellLine=c("Ch12","Ch12","Mel","Mel"))
metaDataFrame rownames(metaDataFrame) <- colnames(CTCFcounts)
library(DESeq2)
<- DESeqDataSetFromMatrix(countData = assay(CTCFcounts),
deseqCTCF colData = metaDataFrame,
design = ~ CellLine,
rowRanges=HC_Peaks)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
<- DESeq(deseqCTCF) deseqCTCF
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
<- results(deseqCTCF,
Ch12MinusMel contrast = c("CellLine","Ch12","Mel"),
format="GRanges")
<- Ch12MinusMel[!is.na(Ch12MinusMel$pvalue) | !is.na(Ch12MinusMel$padj)]
Ch12MinusMelFilt <- Ch12MinusMelFilt[Ch12MinusMelFilt$padj < 0.05 & Ch12MinusMelFilt$log2FoldChange > 0]
UpinCh12 library(tracktables)
makebedtable(UpinCh12,"CTCF_UpinCh12.html",getwd())
## [1] "/__w/RU_ChIPseq/RU_ChIPseq/extdata/CTCF_UpinCh12.html"
7 Using rGreat, identify MSigDB pathways enriched for targets genes of our CTCF peaks which are significantly higher in Ch12.
library(rGREAT)
<- submitGreatJob(UpinCh12,species="mm10",request_interval=1,version = "3.0.0") great_Job
## Warning in submitGreatJob(UpinCh12, species = "mm10", request_interval = 1, : GREAT gives a warning:
## Your set hits a large fraction of the genes in the genome, which often
## does not work well with the GREAT Significant by Both view due to a
## saturation of the gene-based hypergeometric test.
##
## See our tips for handling large datasets or try the Significant By
## Region-based Binomial view.
= getEnrichmentTables(great_Job,
great_ResultTable category="Pathway Data")
## The default enrichment tables contain no associated genes for the input
## regions. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the
## following link:
##
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport
names(great_ResultTable)
## [1] "PANTHER Pathway" "BioCyc Pathway" "MSigDB Pathway"
<- great_ResultTable[["MSigDB Pathway"]] msigPath_UpinCh12