In these exercises we will review some of the functionality for summarizing counts and signal across genomes and within regions.
We will be using data directly downloaded from the Encode consortium.
Download the FASTQ for the other Myc MEL replicate from sample ENCSR000EUA. Direct link is here.
The resulting FQ file is ENCFF001NQQ.fastq.gz.
Read in a random sample of 10,000 reads from ENCFF001NQQ.fastq.gz into R.
library(ShortRead)
<- "~/Downloads/ENCFF001NQQ.fastq.gz"
fastqFile <- FastqSampler(fastqFile,n=10000)
fqSample <- yield(fqSample) fastq
Produce a boxplot of quality scores across cycles.
boxplot(as(quality(fastq),"matrix"))
Create a barplot of A,C,G,T,N occurrence in reads.
library(ggplot2)
<- sread(fastq)
readSequences <- alphabetFrequency(readSequences)
readSequences_AlpFreq <- colSums(readSequences_AlpFreq)
summed__AlpFreq <- data.frame(Base=c("A","C","G","T","N"),Total=summed__AlpFreq[c("A","C","G","T","N")])
toPlot ggplot(toPlot,aes(x=Base,y=Total,fill=Base))+geom_bar(stat="identity")+theme_minimal()+coord_flip()
Create a histogram of read scores.
<- alphabetScore(quality(fastq))
readQuals <- data.frame(ReadQ=readQuals)
toPlot ggplot(toPlot,aes(x=ReadQ))+geom_histogram()+theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Create a new FASTQ from file, filter reads with sum quality score less than 250 and N content greater than 50%.
<- FastqStreamer("~/Downloads/ENCFF001NQQ.fastq.gz", 1000000)
fqStreamer
while (length(fq <- yield(fqStreamer))>0) {
<- TotalReads+length(fq)
TotalReads <- fq[alphabetScore(fq) > 250 ]
filt1 <- filt1[alphabetFrequency(sread(filt1))[,"N"] < width(filt1)/2]
filt2 <- TotalReadsFilt+length(filt2)
TotalReadsFilt writeFastq(filt2,"filtered_ENCFF001NQQ.fastq.gz",mode="a")
}
TotalReads TotalReadsFilt
Align FASTQ file to mm10 genome (only chromosomes chr1 to chr19, X,Y and M) to produce a sorted, indexed BAM file.
library(BSgenome.Mmusculus.UCSC.mm10)
<- paste0("chr",c(1:19,"X","Y","M"))
mainChromosomes <- lapply(mainChromosomes,function(x)BSgenome.Mmusculus.UCSC.mm10[[x]])
mainChrSeq names(mainChrSeq) <- mainChromosomes
<- DNAStringSet(mainChrSeq)
mainChrSeqSet writeXStringSet(mainChrSeqSet,
"BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa")
# Here with Rsubread
library(Rsubread)
buildindex("mm10_mainchrs","BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa",
memory=8000,
indexSplit=TRUE)
<- align("mm10_mainchrs",
myMapped "filtered_ENCFF001NQQ.fastq.gz",
output_format = "BAM",
output_file = "Myc_Mel_2.bam",
type='dna',
phredOffset = 64,
nthreads = 4)
# Sort and index.
library(Rsamtools)
sortBam("Myc_Mel_2.bam","Sorted_Myc_Mel_2")
indexBam("Sorted_Myc_Mel_2.bam")
Produce a bigWig of coverage and another of coverage normalised to total reads (as Reads Per Million).
<- idxstatsBam("Sorted_Myc_Mel_2.bam")
mappedReads <- sum(mappedReads[,"mapped"])
TotalMapped <- coverage("SR_Myc_Mel_rep2.bam")
forBigWig export.bw(forBigWig,con="SR_Myc_Mel_rep2.bw")
<- coverage("SR_Myc_Mel_rep2.bam",weight = (10^6)/TotalMapped)
forBigWigNorm export.bw(forBigWigNorm,con="SR_Myc_Mel_rep2_weighted.bw")
# or
<- coverage("SR_Myc_Mel_rep2.bam")
forBigWig export.bw(forBigWig,con="SR_Myc_Mel_rep2.bw")
<- forBigWig*((10^6)/TotalMapped)
forBigWigNorm export.bw(forBigWigNorm,con="SR_Myc_Mel_rep2_weighted.bw")