ATACseq in Bioconductor exercises (part 1)

In todays session we will work with some of the ATAC-seq data of T-regulatory cells from Christina Leslie’s lab.

ATAC-seq preprocessing.

Read in a random 10000 reads from read 1 and read 2 and produce two boxplots of quality scores over cycles.

library(GenomicRanges)
library(Rsamtools)
library(rtracklayer)
library(GenomicAlignments)
library(ShortRead)
library(ggplot2)

f1 <- FastqSampler("~/Downloads/ENCFF175VOD.fastq.gz", n=10000)
f2 <- FastqSampler("~/Downloads/ENCFF447BGX.fastq.gz", n=10000)

set.seed(123456)
p1 <- yield(f1)
set.seed(123456)
p2 <- yield(f2)

allQuals <- quality(p1)
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read1")

allQuals <- quality(p2)
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox,main="Read2")

Create a line plot of Base frequencies across cycles for read1 and read2.

alpByCyle <- alphabetByCycle(sread(p1))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p1))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p1))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p1))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p1))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p1))))
myFrameRead1 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead1$Read <- "Read1"
alpByCyle <- alphabetByCycle(sread(p2))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:max(width(sread(p2))))
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:max(width(sread(p2))))
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:max(width(sread(p2))))
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:max(width(sread(p2))))
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:max(width(sread(p2))))
myFrameRead2 <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
myFrameRead2$Read <- "Read2"
myFrame <- rbind(myFrameRead1,myFrameRead2)

ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()+facet_grid(~Read)

Optional Align the reads in hg38 genome and create sorted, indexed BAM file.

library(BSgenome.Hsapiens.UCSC.hg38)
mainChromosomes <- paste0("chr",c(1:21,"X","Y","M"))
mainChrSeq <- lapply(mainChromosomes,
                     function(x)BSgenome.Hsapiens.UCSC.hg38[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,
                "BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa")
library(Rsubread)
buildindex("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
           "BSgenome.Hsapiens.UCSC.hg38.mainChrs.fa",
           indexSplit = TRUE,
           memory = 5000)

read1 <- "~/Downloads/ENCFF175VOD.fastq.gz"
read2 <- "~/Downloads/ENCFF447BGX.fastq.gz"
outBAM <- "TcellReg_ATAC.bam"

align("BSgenome.Hsapiens.UCSC.hg38.mainChrs",
      readfile1=read1,readfile2=read2,
      output_file = outBAM,
      nthreads=2,type=1,
      unique=TRUE,maxFragLength = 2000)
library(Rsamtools)
sortBam(outBAM,"Sorted_TcellReg_ATAC")
indexBam("Sorted_TcellReg_ATAC.bam")

Using the BAM file retrieved from ENCODE, plot the fragment length distribution for reads chromosome 10 using ggplot2.

library(Rsamtools)
library(ggplot2)
indexBam("~/Downloads/ENCFF053CGD.bam")
library(GenomicAlignments)
toReviewLenOfChrom <- idxstatsBam("~/Downloads/ENCFF053CGD.bam")
myParam=ScanBamParam(flag=scanBamFlag(isProperPair =TRUE),
                   what=c("qname","mapq","isize"),
                   which=GRanges("chr10", IRanges(1,130694993)))

myPairs <- readGAlignmentPairs("~/Downloads/ENCFF053CGD.bam",param = myParam)

IS <- abs(mcols(second(myPairs))$isize)
tableOfIS <- table(IS)
toPlot <- data.frame(InsertSize=as.numeric(names(tableOfIS)),
                            Count=as.numeric(tableOfIS))
fragLenPlot <- ggplot(toPlot,aes(x=InsertSize,y=Count))+geom_line()
fragLenPlot + scale_y_continuous(trans='log2')+theme_minimal()

Create a barplot of number of fragments within Greenleafs’ defined ranges – nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437)

atacReads_Open <- myPairs[IS < 100,]
atacReads_MonoNuc <- myPairs[IS > 180 &
                                 IS < 247,]
atacReads_diNuc <- myPairs[IS > 315 &
                               IS < 437,]
toPlot <- data.frame(Fraction=c("NucleosomeFree","MonoNucleosome","DiNucleosome"),Total=c(length(atacReads_Open),
                     length(atacReads_MonoNuc),
                     length(atacReads_diNuc)))

ggplot(toPlot,aes(x=Fraction,y=Total,fill=Fraction))+geom_bar(stat="identity")+theme_bw()

6 Create a bigwig for the nucleosome free (< 100bp), mono-nucleosome (180bp-247bp) and di-nucleosome (315-437) fractions and visualise in IGV.

atacFragments_Open <- granges(atacReads_Open)
myCoverage_Open <- coverage(atacFragments_Open)
export.bw(myCoverage_Open,"NucleosomeFree.bw")

atacFragments_MonoNuc <- granges(atacReads_MonoNuc)
myCoverage_MonoNuc <- coverage(atacFragments_MonoNuc)
export.bw(myCoverage_MonoNuc,"MonoNucleosome.bw")

atacFragments_diNuc <- granges(atacReads_diNuc)
myCoverage_diNuc <- coverage(atacFragments_diNuc)
export.bw(myCoverage_diNuc,"DiNucleosome.bw")

ATACseq in Bioconductor exercises (part 1)

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/RU_ATACseq/

ATAC-seq preprocessing.