These are answers for exercises that cover the FastQ sections of Introduction to Bioconductor.
In these exercises we will review some of the ShortRead packages functionality with handling fastQ files.
The example data can be found in the data directory.
data/heart.bodyMap.fq
library(ShortRead)
FileName <- "data/heart.bodyMap.fq"
my_fastq <- readFastq(FileName)
fastqRead <- my_fastq[as.character(id(my_fastq)) == "HWI-BRUNOP16X_0001:7:66:3246:157000#0/1"]
fastqReadSeq <- sread(fastqRead)
alpFreq <- alphabetFrequency(fastqReadSeq)
alpFreq5 <- alpFreq[,c("A","C","G","T","N")]
toPlot <- data.frame(bases=names(alpFreq5),Freq=alpFreq5)
library(ggplot2)
ggplot(toPlot,aes(x=bases,y=Freq,fill=bases))+geom_bar(stat="identity") + theme_bw()allQuals <- quality(my_fastq[1:10000,])
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox)alpByCyle <- alphabetByCycle(sread(my_fastq))
alpByCyleFilt <- alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:75)
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:75)
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:75)
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:75)
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:75)
myFrame <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()length(my_fastq)## [1] 132604
mySampledFile <- FastqSampler("data/heart.bodyMap.fq",n=10000)
my_fastq_10000 <- yield(mySampledFile)
my_fastq_10000## class: ShortReadQ
## length: 10000 reads; width: 50 75 cycles
dupLogical <- srduplicated(my_fastq_10000)
numberOfDups <- table(dupLogical)
numberOfDups## dupLogical
## FALSE TRUE
## 4010 5990
nonDups <- my_fastq_10000[dupLogical]trimmedNonDups <- trimTails(nonDups,10,"5")hist(width(trimmedNonDups),xlab="Trimmed Length",main="Histogram of trimmed lengths")8.Filter out reads less than 32 bases and write to file
filteredFastq <- trimmedNonDups[width(trimmedNonDups) >= 32]
writeFastq(filteredFastq,file="filteredFastq.fq.gz")unlink("filteredFastq.fq.gz")