These are answers for exercises that cover the FastQ sections of Introduction to Bioconductor.

FastQs and ShortRead

In these exercises we will review some of the ShortRead packages functionality with handling fastQ files.

The example data can be found in the data directory.

data/heart.bodyMap.fq

  1. Plot the occurrence of A, G, T, C and N in the read with ID “HWI-BRUNOP16X_0001:7:66:3246:157000#0/1” using ggplot2
library(ShortRead)
FileName <- "data/heart.bodyMap.fq"
my_fastq <- readFastq(FileName)
fastqRead <- my_fastq[as.character(id(my_fastq)) == "HWI-BRUNOP16X_0001:7:66:3246:157000#0/1"]
fastqReadSeq <- sread(fastqRead)
alpFreq <- alphabetFrequency(fastqReadSeq)
alpFreq5 <- alpFreq[,c("A","C","G","T","N")]
toPlot <- data.frame(bases=names(alpFreq5),Freq=alpFreq5)
library(ggplot2)
ggplot(toPlot,aes(x=bases,y=Freq,fill=bases))+geom_bar(stat="identity") + theme_bw()

  1. Create a boxplot of quality scores over cycles for first 10,000 reads using base graphics.
allQuals <- quality(my_fastq[1:10000,])
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox)

  1. Plot the frequency of A, G, C, T and N bases over cycles using ggplot2.
alpByCyle <- alphabetByCycle(sread(my_fastq))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:75)
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:75)
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:75)
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:75)
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:75)
myFrame <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()

  1. How many reads are in the original fastq? Read in a random 10000 bases from the original fastq.
length(my_fastq)
## [1] 132604
mySampledFile <- FastqSampler("data/heart.bodyMap.fq",n=10000)
my_fastq_10000 <- yield(mySampledFile)
my_fastq_10000
## class: ShortReadQ
## length: 10000 reads; width: 50 75 cycles
  1. Count the number of duplicate sequences and filter file to only the non-duplicated sequences
dupLogical <- srduplicated(my_fastq_10000)
numberOfDups <- table(dupLogical)
numberOfDups
## dupLogical
## FALSE  TRUE 
##  4010  5990
nonDups <- my_fastq_10000[dupLogical]
  1. Trim the resulting non-duplicated reads to remove reads when a succession of 10 bases fall below quality of 5.
trimmedNonDups <- trimTails(nonDups,10,"5")
  1. Plot a histogram of the resulting read lengths using base graphics
hist(width(trimmedNonDups),xlab="Trimmed Length",main="Histogram of trimmed lengths")

8.Filter out reads less than 32 bases and write to file

filteredFastq <- trimmedNonDups[width(trimmedNonDups) >= 32]
writeFastq(filteredFastq,file="filteredFastq.fq.gz")
unlink("filteredFastq.fq.gz")