My Exercise 1

These exercises are about fastq quality control.

Exercise 1 - Reading FastQs with ShortRead

Download fastq files for the second replicate of the Cut and Run from ENA. There are direct links for Read1 and Read2.
Read in a subset of Read 1 (lets just try 1000000 to keep things snappy).

library(ShortRead)

fqSample <- FastqSampler("~/Downloads/SRR20110417_1.fastq.gz",n=10^6)
fastq <- yield(fqSample)
fastq

## class: ShortReadQ
## length: 100000 reads; width: 40 cycles

Lets review the first 10 reads. Lets look at ids, quality and sequence for the first ten reads.

sread(fastq)[1:10]

## DNAStringSet object of length 10:
##      width seq
##  [1]    40 TGTGAGAAAGCAAAGCTAGGGTGTACATCTGGTGTCTTTT
##  [2]    40 AGGTACCGGGTGCTTGTACAGGTCCGTGCAGATGAGATGC
##  [3]    40 CCTTCAGCCAATGTGTCTCAAAGGCAGGAGCAGCTCAAAG
##  [4]    40 ACACTAACTAATGGCTTCTCACTCTTACATACAGTGCTTT
##  [5]    40 AGTACATTGATATTTTATGTACAGTTTATTATAAATGTTT
##  [6]    40 ATTGGCCTTTCTGTTGTGGACTTGAAATAAGCTAATTGCT
##  [7]    40 GAGTCACGTGCGGGTAGGGTTTGGCGAGATCGGAAGAGCA
##  [8]    40 TATAAAAAGCTATAAACAAGGTGTGTTTGCCAAAATGTTT
##  [9]    40 TGTTCCAAAGACTATTGACTCTGAGAGCTACATGTCCTTA
## [10]    40 TCACTTCTATAAGTCTACATTTCCTCTTTTGTAAACTGGG

quality(fastq)[1:10]

## class: FastqQuality
## quality:
## BStringSet object of length 10:
##      width seq
##  [1]    40 ????????????????????????????????????????
##  [2]    40 ????????????????????????????????????????
##  [3]    40 ????????????????????????????????????????
##  [4]    40 ????????????????????????????????????????
##  [5]    40 ????????????????????????????????????????
##  [6]    40 ????????????????????????????????????????
##  [7]    40 ????????????????????????????????????????
##  [8]    40 ????????????????????????????????????????
##  [9]    40 ????????????????????????????????????????
## [10]    40 ????????????????????????????????????????

id(fastq)[1:10]

## BStringSet object of length 10:
##      width seq
##  [1]    31 SRR20110417.18783294 18783294/1
##  [2]    31 SRR20110417.23058003 23058003/1
##  [3]    27 SRR20110417.370785 370785/1
##  [4]    31 SRR20110417.38338769 38338769/1
##  [5]    31 SRR20110417.30559613 30559613/1
##  [6]    29 SRR20110417.3870902 3870902/1
##  [7]    31 SRR20110417.35673597 35673597/1
##  [8]    31 SRR20110417.38568471 38568471/1
##  [9]    31 SRR20110417.16498017 16498017/1
## [10]    31 SRR20110417.48650389 48650389/1

Lets review the nucleotide frequency for each base in our read.

library(ggplot2)
readSequences <- sread(fastq)
readSequences_AlpFreq <- alphabetFrequency(readSequences)
summed__AlpFreq  <- colSums(readSequences_AlpFreq)
summed__AlpFreq[c("A","C","G","T","N")]

##       A       C       G       T       N 
## 1188117  840505  820889 1150389     100

readSequences_AlpbyCycle <- alphabetByCycle(readSequences)
AFreq <- readSequences_AlpbyCycle["A",]
CFreq <- readSequences_AlpbyCycle["C",]
GFreq <- readSequences_AlpbyCycle["G",]
TFreq <- readSequences_AlpbyCycle["T",]
toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq),
                     Cycle=rep(1:40,4),
                     Base=rep(c("A","C","G","T"),each=40))


ggplot(toPlot, aes(y=Count,x=Cycle,colour=Base)) + geom_line() +
  theme_bw()

Compare this to read 2.

library(ShortRead)

fqSample <- FastqSampler("~/Downloads/SRR20110417_2.fastq.gz",n=10^6)
fastq2 <- yield(fqSample)
fastq2

## class: ShortReadQ
## length: 1000000 reads; width: 40 cycles

readSequences <- sread(fastq2)
readSequences_AlpFreq <- alphabetFrequency(readSequences)
summed__AlpFreq  <- colSums(readSequences_AlpFreq)
summed__AlpFreq[c("A","C","G","T","N")]

##        A        C        G        T        N 
## 11580549  8126464  8888673 11403102     1212

readSequences_AlpbyCycle <- alphabetByCycle(readSequences)
AFreq <- readSequences_AlpbyCycle["A",]
CFreq <- readSequences_AlpbyCycle["C",]
GFreq <- readSequences_AlpbyCycle["G",]
TFreq <- readSequences_AlpbyCycle["T",]
toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq),
                     Cycle=rep(1:40,4),
                     Base=rep(c("A","C","G","T"),each=40))


ggplot(toPlot, aes(y=Count,x=Cycle,colour=Base)) + geom_line() +
  theme_bw()

Exercise 2 - Run Rfastp

Run Rfastp on our dataset with default settings.

library(Rfastp)

rfastp(read1="/Users/mattpaul/Downloads/SRR20110417_1.fastq.gz", read2 = "/Users/mattpaul/Downloads/SRR20110417_2.fastq.gz",outputFastq = "SOX9CNR_D0_rep2_filtered")

The html is here

Try running Rfastp on our dataset with some extra parameters. Review the help page with ?rfastp.

You could try: * trimming from the front of the read * adding in over representation analysis (this will take a little longer)

rfastp(read1="/Users/mattpaul/Downloads/SRR20110417_1.fastq.gz", read2 = "/Users/mattpaul/Downloads/SRR20110417_2.fastq.gz",outputFastq = "SOX9CNR_D0_rep2_filtered2",
       trimFrontRead1=10,
       trimFrontRead2=10,
       overrepresentationAnalysis=T)

The html is here

My Exercise 1

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/RU_course_template/