FastQ exercises

FastQs and ShortRead

In these exercises we will review some of the ShortRead packages functionality with handling fastQ files.

The example data can be found in the data directory.

data/heart.bodyMap.fq

Plot the occurrence of A, G, T, C and N in the read with ID “HWI-BRUNOP16X_0001:7:66:3246:157000#0/1” using ggplot2

library(ShortRead)
FileName <- "data/heart.bodyMap.fq"
my_fastq <- readFastq(FileName)
fastqRead <- my_fastq[as.character(id(my_fastq)) == "HWI-BRUNOP16X_0001:7:66:3246:157000#0/1"]
fastqReadSeq <- sread(fastqRead)
alpFreq <- alphabetFrequency(fastqReadSeq)
alpFreq5 <- alpFreq[,c("A","C","G","T","N")]
toPlot <- data.frame(bases=names(alpFreq5),Freq=alpFreq5)
library(ggplot2)
ggplot(toPlot,aes(x=bases,y=Freq,fill=bases))+geom_bar(stat="identity") + theme_bw()

Create a boxplot of quality scores over cycles for first 10,000 reads using base graphics.

allQuals <- quality(my_fastq[1:10000,])
forBox <- as(allQuals,"matrix")
colnames(forBox) <- paste0("Cycle",1:ncol(forBox))
boxplot(forBox)

Plot the frequency of A, G, C, T and N bases over cycles using ggplot2.

alpByCyle <- alphabetByCycle(sread(my_fastq))
alpByCyleFilt <-  alpByCyle[c("A","G","C","T","N"),]
AbyCycFrame <- data.frame(Base="A",Freq=alpByCyleFilt["A",],Cycle=1:75)
CbyCycFrame <- data.frame(Base="C",Freq=alpByCyleFilt["C",],Cycle=1:75)
TbyCycFrame <- data.frame(Base="T",Freq=alpByCyleFilt["T",],Cycle=1:75)
GbyCycFrame <- data.frame(Base="G",Freq=alpByCyleFilt["G",],Cycle=1:75)
NbyCycFrame <- data.frame(Base="N",Freq=alpByCyleFilt["N",],Cycle=1:75)
myFrame <- rbind(AbyCycFrame,CbyCycFrame,TbyCycFrame,GbyCycFrame,NbyCycFrame)
ggplot(myFrame,aes(x=Cycle,y=Freq,colour=Base))+geom_line()+theme_bw()

How many reads are in the original fastq? Read in a random 10000 bases from the original fastq.

length(my_fastq)

## [1] 132604

mySampledFile <- FastqSampler("data/heart.bodyMap.fq",n=10000)
my_fastq_10000 <- yield(mySampledFile)
my_fastq_10000

## class: ShortReadQ
## length: 10000 reads; width: 50 75 cycles

Count the number of duplicate sequences and filter file to only the non-duplicated sequences

dupLogical <- srduplicated(my_fastq_10000)
numberOfDups <- table(dupLogical)
numberOfDups

## dupLogical
## FALSE  TRUE 
##  3973  6027

nonDups <- my_fastq_10000[dupLogical]

Trim the resulting non-duplicated reads to remove reads when a succession of 10 bases fall below quality of 5.

trimmedNonDups <- trimTails(nonDups,10,"5")

Plot a histogram of the resulting read lengths using base graphics

hist(width(trimmedNonDups),xlab="Trimmed Length",main="Histogram of trimmed lengths")

8.Filter out reads less than 32 bases and write to file

filteredFastq <- trimmedNonDups[width(trimmedNonDups) >= 32]
writeFastq(filteredFastq,file="filteredFastq.fq.gz")

unlink("filteredFastq.fq.gz")

FastQ exercises

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/Bioconductor_Introduction/

FastQs and ShortRead