ChIPseq data processing

In these exercises we will review some of the functionality of ChIPQC, reading in peaks and annotating peaks to genes.

We will be using data directly downloaded from the ENCODE consortium.

Precomputed ChIPQC results as a list for 2 Encode CTCF samples and their input can be found in the data directory.

data/CTCFQC.RData

We will also perform some of our own QC on some human data of Pancreas CTCF data. We should download thew BAM file here

We can also retrieve the relevant peak calls from here

  1. Load the CTCF QC .RData into R
library(ChIPQC)
load("data/CTCFQC.RData")
  1. Produce a cross coverage plot from these samples using ChIPQC. Add metadata for antibody
myMeta <- data.frame(Sample=names(myQClist),Antibody=c("CTCF","CTCF","Input"))

plotCC(myQClist,addMetaData = myMeta,facetBy="Sample",colourBy="Antibody")+
  ggtitle("CTCF in Ch12 CC-plot")+
  theme_minimal()

  1. Create a barplot of the percentage of duplicates in samples.
library(ggplot2)
myFlags <- flagtagcounts(myQClist)
dupRates <- myFlags["DuplicateByChIPQC",]/myFlags["Mapped",]
toPlot <- data.frame(Sample=names(dupRates),DupPercent=dupRates*100)
toPlot <- merge(myMeta,toPlot,by=1)
ggplot(toPlot,aes(x=Sample,y=DupPercent,fill=Antibody))+
  geom_bar(stat="identity")+coord_flip()+theme_bw()

  1. Plot the enrichment of reads over regions as seen below.
plotRegi(myQClist)+scale_fill_gradient2(low="white",high="red",
    mid="yellow")
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

  1. Plot the SSD of the CTCF samples as seen below.
plotSSD(myQClist)+xlim(0,10)+theme_minimal()
## Using Sample as id variables
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

  1. With the downloaded peak calls in ENCFF627LTJ.bed.gz, import into R, filter to chromosomes chr1 to chr21 and plot the number of peaks these chromosomes.
require(rtracklayer)
download.file("https://www.encodeproject.org/files/ENCFF627LTJ/@@download/ENCFF627LTJ.bed.gz","ENCFF627LTJ.bed.gz")
myPeaks <- read.delim("ENCFF627LTJ.bed.gz",header = FALSE)
myPeakGR <- GRanges(myPeaks[,1],IRanges(myPeaks[,2],myPeaks[,3]))
myPeakGR <- myPeakGR[as.vector(seqnames(myPeakGR)) %in% paste0("chr",1:21)]
freqs <- table(as.vector(seqnames(myPeakGR)))
toPlot <- data.frame(Chromosome=names(freqs),Total=as.vector(freqs))
ggplot(toPlot,aes(x=Chromosome,y=Total))+
  geom_bar(stat="identity")+coord_flip()+theme_minimal()

  1. Annotate these peaks to genes using the ChIPseeker package.
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
peakAnno <- annotatePeak(myPeakGR, tssRegion=c(-500, 500), 
                         TxDb=TxDb.Hsapiens.UCSC.hg38.knownGene, 
                         annoDb="org.Hs.eg.db")
## >> preparing features information...      2021-08-02 03:25:45 PM 
## >> identifying nearest features...        2021-08-02 03:25:46 PM 
## >> calculating distance from peak to TSS...   2021-08-02 03:25:50 PM 
## >> assigning genomic annotation...        2021-08-02 03:25:50 PM 
## >> adding gene annotation...          2021-08-02 03:26:26 PM
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths           2021-08-02 03:26:27 PM 
## >> done...                    2021-08-02 03:26:27 PM
  1. Create a pie chart of annotation of peaks by ChIPseeker
plotAnnoPie(peakAnno)

  1. Export annotated peaks to a tab separated file.
toWrite <- as.data.frame(peakAnno)
write.table(toWrite,file="annotatedPeaks",row.names = FALSE,sep="\t")
  1. Download the blacklist for hg38 and QC our newly downloaded BAM file in ChIPQC. To save time only run ChIPQC on chromosomes (chr10, chr11,chr12). Create cross-coverage plot using ChIPQC.
library(rtracklayer)
library(Rsamtools)
# Always check or index 
indexBam("ENCFF789ZHS.bam")
myQCCTCF <- ChIPQCsample("ENCFF789ZHS.bam",blacklist = blkList, annotation = "hg38",
       chromosomes = c("chr10","chr11","chr12"), verboseT = FALSE)
QCmetrics(myQCCTCF)
##      Reads       Map%      Filt%       Dup%      ReadL      FragL      RelCC 
## 4984428.00     100.00       5.56       0.00      76.00     264.00       2.34 
##        SSD       RiP%      RiBL% 
##       0.93         NA       2.61
plotCC(myQCCTCF)