ChIPseq in Bioconductor exercises (part 1)

Exercises

1. Read data for QC

Read in a random sample of 10,000 reads from ENCFF001NQQ.fastq.gz into R.

library(ShortRead)
fastqFile <- "~/Downloads/ENCFF001NQQ.fastq.gz"
fqSample <- FastqSampler(fastqFile,n=10000)
fastq <- yield(fqSample)

2. QC - Quality

Produce a boxplot of quality scores across cycles.

boxplot(as(quality(fastq),"matrix"))

3. QC - Nucleotide occurence

Create a barplot of A,C,G,T,N occurrence in reads.

library(ggplot2)
readSequences <- sread(fastq)
readSequences_AlpFreq <- alphabetFrequency(readSequences)
summed__AlpFreq  <- colSums(readSequences_AlpFreq)
toPlot <- data.frame(Base=c("A","C","G","T","N"),Total=summed__AlpFreq[c("A","C","G","T","N")])
ggplot(toPlot,aes(x=Base,y=Total,fill=Base))+geom_bar(stat="identity")+theme_minimal()+coord_flip()

4. QC - Quality

Create a histogram of read scores.

readQuals <- alphabetScore(quality(fastq))
toPlot <- data.frame(ReadQ=readQuals)
ggplot(toPlot,aes(x=ReadQ))+geom_histogram()+theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5. QC - Filter

Create a new FASTQ from file, filter reads with sum quality score less than 250 and N content greater than 50%.

fqStreamer <- FastqStreamer("~/Downloads/ENCFF001NQQ.fastq.gz", 1000000)

while (length(fq <- yield(fqStreamer))>0) {
    TotalReads <- TotalReads+length(fq)
    filt1 <- fq[alphabetScore(fq) > 250 ]
    filt2 <- filt1[alphabetFrequency(sread(filt1))[,"N"] < width(filt1)/2]
    TotalReadsFilt <- TotalReadsFilt+length(filt2)
    writeFastq(filt2,"filtered_ENCFF001NQQ.fastq.gz",mode="a")
}
TotalReads
TotalReadsFilt

6. Align

Align FASTQ file to mm10 genome (only chromosomes chr1 to chr19, X,Y and M) to produce a sorted, indexed BAM file.

library(BSgenome.Mmusculus.UCSC.mm10)
mainChromosomes <- paste0("chr",c(1:19,"X","Y","M"))
mainChrSeq <- lapply(mainChromosomes,function(x)BSgenome.Mmusculus.UCSC.mm10[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,
                "BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa")

# Here with Rsubread

library(Rsubread)
buildindex("mm10_mainchrs","BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa", 
           memory=8000,
           indexSplit=TRUE)

myMapped <- align("mm10_mainchrs",
                    "filtered_ENCFF001NQQ.fastq.gz",
                    output_format = "BAM",
                    output_file = "Myc_Mel_2.bam",
                    type='dna',
                    phredOffset = 64,
                    nthreads = 4)

# Sort and index.
library(Rsamtools)

sortBam("Myc_Mel_2.bam","Sorted_Myc_Mel_2")
indexBam("Sorted_Myc_Mel_2.bam")

7. Export bigwigs

Produce a bigWig of coverage and another of coverage normalised to total reads (as Reads Per Million).

mappedReads <- idxstatsBam("Sorted_Myc_Mel_2.bam")
TotalMapped <- sum(mappedReads[,"mapped"])
forBigWig <- coverage("SR_Myc_Mel_rep2.bam")
export.bw(forBigWig,con="SR_Myc_Mel_rep2.bw")

forBigWigNorm <- coverage("SR_Myc_Mel_rep2.bam",weight = (10^6)/TotalMapped)
export.bw(forBigWigNorm,con="SR_Myc_Mel_rep2_weighted.bw")

# or 
forBigWig <- coverage("SR_Myc_Mel_rep2.bam")
export.bw(forBigWig,con="SR_Myc_Mel_rep2.bw")
forBigWigNorm <- forBigWig*((10^6)/TotalMapped)
export.bw(forBigWigNorm,con="SR_Myc_Mel_rep2_weighted.bw")