These exercises are about fastq quality control.

Exercise 1 - Reading FastQs with ShortRead

library(ShortRead)

fqSample <- FastqSampler("~/Downloads/SRR20110417_1.fastq.gz",n=10^6)
fastq <- yield(fqSample)
fastq
## class: ShortReadQ
## length: 100000 reads; width: 40 cycles
sread(fastq)[1:10]
## DNAStringSet object of length 10:
##      width seq
##  [1]    40 TGTGAGAAAGCAAAGCTAGGGTGTACATCTGGTGTCTTTT
##  [2]    40 AGGTACCGGGTGCTTGTACAGGTCCGTGCAGATGAGATGC
##  [3]    40 CCTTCAGCCAATGTGTCTCAAAGGCAGGAGCAGCTCAAAG
##  [4]    40 ACACTAACTAATGGCTTCTCACTCTTACATACAGTGCTTT
##  [5]    40 AGTACATTGATATTTTATGTACAGTTTATTATAAATGTTT
##  [6]    40 ATTGGCCTTTCTGTTGTGGACTTGAAATAAGCTAATTGCT
##  [7]    40 GAGTCACGTGCGGGTAGGGTTTGGCGAGATCGGAAGAGCA
##  [8]    40 TATAAAAAGCTATAAACAAGGTGTGTTTGCCAAAATGTTT
##  [9]    40 TGTTCCAAAGACTATTGACTCTGAGAGCTACATGTCCTTA
## [10]    40 TCACTTCTATAAGTCTACATTTCCTCTTTTGTAAACTGGG
quality(fastq)[1:10]
## class: FastqQuality
## quality:
## BStringSet object of length 10:
##      width seq
##  [1]    40 ????????????????????????????????????????
##  [2]    40 ????????????????????????????????????????
##  [3]    40 ????????????????????????????????????????
##  [4]    40 ????????????????????????????????????????
##  [5]    40 ????????????????????????????????????????
##  [6]    40 ????????????????????????????????????????
##  [7]    40 ????????????????????????????????????????
##  [8]    40 ????????????????????????????????????????
##  [9]    40 ????????????????????????????????????????
## [10]    40 ????????????????????????????????????????
id(fastq)[1:10]
## BStringSet object of length 10:
##      width seq
##  [1]    31 SRR20110417.18783294 18783294/1
##  [2]    31 SRR20110417.23058003 23058003/1
##  [3]    27 SRR20110417.370785 370785/1
##  [4]    31 SRR20110417.38338769 38338769/1
##  [5]    31 SRR20110417.30559613 30559613/1
##  [6]    29 SRR20110417.3870902 3870902/1
##  [7]    31 SRR20110417.35673597 35673597/1
##  [8]    31 SRR20110417.38568471 38568471/1
##  [9]    31 SRR20110417.16498017 16498017/1
## [10]    31 SRR20110417.48650389 48650389/1
library(ggplot2)
readSequences <- sread(fastq)
readSequences_AlpFreq <- alphabetFrequency(readSequences)
summed__AlpFreq  <- colSums(readSequences_AlpFreq)
summed__AlpFreq[c("A","C","G","T","N")]
##       A       C       G       T       N 
## 1188117  840505  820889 1150389     100
readSequences_AlpbyCycle <- alphabetByCycle(readSequences)
AFreq <- readSequences_AlpbyCycle["A",]
CFreq <- readSequences_AlpbyCycle["C",]
GFreq <- readSequences_AlpbyCycle["G",]
TFreq <- readSequences_AlpbyCycle["T",]
toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq),
                     Cycle=rep(1:40,4),
                     Base=rep(c("A","C","G","T"),each=40))


ggplot(toPlot, aes(y=Count,x=Cycle,colour=Base)) + geom_line() +
  theme_bw()

library(ShortRead)

fqSample <- FastqSampler("~/Downloads/SRR20110417_2.fastq.gz",n=10^6)
fastq2 <- yield(fqSample)
fastq2
## class: ShortReadQ
## length: 1000000 reads; width: 40 cycles
readSequences <- sread(fastq2)
readSequences_AlpFreq <- alphabetFrequency(readSequences)
summed__AlpFreq  <- colSums(readSequences_AlpFreq)
summed__AlpFreq[c("A","C","G","T","N")]
##        A        C        G        T        N 
## 11580549  8126464  8888673 11403102     1212
readSequences_AlpbyCycle <- alphabetByCycle(readSequences)
AFreq <- readSequences_AlpbyCycle["A",]
CFreq <- readSequences_AlpbyCycle["C",]
GFreq <- readSequences_AlpbyCycle["G",]
TFreq <- readSequences_AlpbyCycle["T",]
toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq),
                     Cycle=rep(1:40,4),
                     Base=rep(c("A","C","G","T"),each=40))


ggplot(toPlot, aes(y=Count,x=Cycle,colour=Base)) + geom_line() +
  theme_bw()

Exercise 2 - Run Rfastp

library(Rfastp)

rfastp(read1="/Users/mattpaul/Downloads/SRR20110417_1.fastq.gz", read2 = "/Users/mattpaul/Downloads/SRR20110417_2.fastq.gz",outputFastq = "SOX9CNR_D0_rep2_filtered")

The html is here

You could try: * trimming from the front of the read * adding in over representation analysis (this will take a little longer)

rfastp(read1="/Users/mattpaul/Downloads/SRR20110417_1.fastq.gz", read2 = "/Users/mattpaul/Downloads/SRR20110417_2.fastq.gz",outputFastq = "SOX9CNR_D0_rep2_filtered2",
       trimFrontRead1=10,
       trimFrontRead2=10,
       overrepresentationAnalysis=T)

The html is here