Genomic Scores in Bioconductor exercises

Working with Genomic Scores

In this exercise we will work with some ChIP-seq data for the Encode experiment ENCSR000ERN.

Due to bigwig libraries only being available on unix machines, the exercises below will only work on Mac and Linux operating systems.

Download the signal pvalue bigWig file for replicate 1 for this experiment.

This data can be found at the Encode experiment here or at the direct link here.

suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(GenomicRanges))

Read in the bigWig signal for the region (chr1:133040000-133149400) as an RleList.

library(rtracklayer)
regionToImport <- GRanges(seqnames="chr1",
                          ranges=IRanges(133040000,133149400))
mySelection <- BigWigSelection(regionToImport)
bigWigRegion <- import.bw("ENCFF940MBK.bigWig",
                          selection=mySelection,
                          as="RleList")

Find the maximum and minimum of value of bigWig signal in the region.

myMax <- max(bigWigRegion$chr1[133040000:133149400])
myMax <- max(bigWigRegion)["chr1"]

Produce an area plot (using geom_area) of the signal over the transcriptional start site (+/- 500bp of gene start) for the Ppp1r15b gene using ggplot2.

TSSpos <- seq(133131166-500,133131166+500)
tssDF <- as.data.frame(bigWigRegion$chr1[TSSpos])
tssDF$position <- TSSpos
library(ggplot2)
ggplot(tssDF,aes(y=value,x=position))+geom_area()+theme_minimal()+xlab("Genome")+ylab("Score")+theme(axis.text.x = element_text(angle = 30, hjust = 1),axis.title.y = element_text(angle = 0))

Apply a log2 transformation to the genomic scores and save as a new variable.

newBigWigRegion <- bigWigRegion
newBigWigRegion <- log2(newBigWigRegion)

Export this as bigWig and visualise signal over Ppp1r15b gene in IGV.

export.bw(newBigWigRegion,"log2.bw")

Read in the file data/Myc_Ch12_1_withInput_Input_Ch12_peaks.xls and create a GRanges object of all peaks on chromosome 1.

library(rtracklayer)
library(GenomicRanges)
MYCpeaks <- read.delim("data/Myc_Ch12_1_withInput_Input_Ch12_peaks.xls",sep="\t",comment.char = "#")
MYCgranges <- GRanges(MYCpeaks$chr,IRanges(MYCpeaks$start,MYCpeaks$end),
                       FE=MYCpeaks$fold_enrichment,
                       minuslog10Pval=MYCpeaks$X.log10.pvalue.)
MYCgrangesChr1 <- MYCgranges[seqnames(MYCgranges) %in% "chr1"]

Using this GRanges of Myc peaks, import in the Myc ChIPseq bigwig signal over the peaks on chromosome 1.

library(rtracklayer)
library(GenomicRanges)

mySelection <- BigWigSelection(MYCgrangesChr1)
bigWigPeaks <- import.bw("ENCFF940MBK.bigWig",
                          selection=mySelection,
                          as="RleList")

Import the gene positions from mm10_GenePosForIGV.txt as a GRanges. Filter to genes on chromosome 1 and identify peaks overlapping the gene’s promoter (here defined as a region ranging between 2000bp upstream from a gene’s transcriptional start site to the transcriptional start site) and those not overlapping a promoter.

IGVGenePositions <- read.delim("data/mm10_GenePosForIGV.txt",sep="\t")

genePos <- GRanges(IGVGenePositions$seqnames,
                   IRanges(IGVGenePositions$start,IGVGenePositions$end),
                   strand = IGVGenePositions$strand)

names(genePos) <- IGVGenePositions$gene_id
tssPos <- resize(genePos,width = 1,fix = "start")
promoterPos <- resize(tssPos,width = 2000,fix = "end")

promoterPeaks <- MYCgrangesChr1[MYCgrangesChr1 %over% promoterPos]
nonPromoterPeaks <- MYCgrangesChr1[!MYCgrangesChr1 %over% promoterPos]

Create a violin plot of sum of bigWig signal within peaks at promoters and those not at promoters using ggplot2.

PeakCoveragePromoter <- bigWigPeaks[promoterPeaks]
PeakMaxs_Promoter <- sum(PeakCoveragePromoter)

PeakCoverageNonPromoter <- bigWigPeaks[nonPromoterPeaks]
PeakMaxs_NonPromoter <- sum(PeakCoverageNonPromoter)

df_promoter <- data.frame(Sum=PeakMaxs_Promoter,PromoterOrNot="Promoter")
df_notPromoter <- data.frame(Sum=PeakMaxs_NonPromoter,PromoterOrNot="NotPromoter")
toPlot_SumSignal <- rbind(df_promoter,df_notPromoter)

ggplot(toPlot_SumSignal,aes(x=PromoterOrNot,y=Sum,fill=PromoterOrNot))+geom_violin()+scale_y_log10()+scale_fill_brewer(palette = "Pastel2")+theme_minimal()

Export a bed file of TSS peaks whose sum of ChIPseq signal is 2 fold greater than the median of all peaks’ summed signals.

toExport <- bigWigPeaks
peakSumMedian <- median(c(sum(PeakCoveragePromoter),sum(PeakCoverageNonPromoter)))
filteredPeaks <- promoterPeaks[PeakMaxs_Promoter > peakSumMedian*2]
export.bed(filteredPeaks,"filteredPeaks.bed")

Genomic Scores in Bioconductor exercises

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/Bioconductor_Introduction/

Working with Genomic Scores