Â
These exercises are about fastq quality control.
Exercise 1 - Reading FastQs with ShortRead
Download fastq files for the second replicate of the Cut and Run from ENA. There are direct links for Read1 and Read2.
Read in a subset of Read 1 (lets just try 1000000 to keep things snappy).
library(ShortRead)
fqSample <- FastqSampler("~/Downloads/SRR20110417_1.fastq.gz",n=10^6)
fastq <- yield(fqSample)
fastq
## class: ShortReadQ
## length: 100000 reads; width: 40 cycles
## DNAStringSet object of length 10:
## width seq
## [1] 40 TGTGAGAAAGCAAAGCTAGGGTGTACATCTGGTGTCTTTT
## [2] 40 AGGTACCGGGTGCTTGTACAGGTCCGTGCAGATGAGATGC
## [3] 40 CCTTCAGCCAATGTGTCTCAAAGGCAGGAGCAGCTCAAAG
## [4] 40 ACACTAACTAATGGCTTCTCACTCTTACATACAGTGCTTT
## [5] 40 AGTACATTGATATTTTATGTACAGTTTATTATAAATGTTT
## [6] 40 ATTGGCCTTTCTGTTGTGGACTTGAAATAAGCTAATTGCT
## [7] 40 GAGTCACGTGCGGGTAGGGTTTGGCGAGATCGGAAGAGCA
## [8] 40 TATAAAAAGCTATAAACAAGGTGTGTTTGCCAAAATGTTT
## [9] 40 TGTTCCAAAGACTATTGACTCTGAGAGCTACATGTCCTTA
## [10] 40 TCACTTCTATAAGTCTACATTTCCTCTTTTGTAAACTGGG
## class: FastqQuality
## quality:
## BStringSet object of length 10:
## width seq
## [1] 40 ????????????????????????????????????????
## [2] 40 ????????????????????????????????????????
## [3] 40 ????????????????????????????????????????
## [4] 40 ????????????????????????????????????????
## [5] 40 ????????????????????????????????????????
## [6] 40 ????????????????????????????????????????
## [7] 40 ????????????????????????????????????????
## [8] 40 ????????????????????????????????????????
## [9] 40 ????????????????????????????????????????
## [10] 40 ????????????????????????????????????????
## BStringSet object of length 10:
## width seq
## [1] 31 SRR20110417.18783294 18783294/1
## [2] 31 SRR20110417.23058003 23058003/1
## [3] 27 SRR20110417.370785 370785/1
## [4] 31 SRR20110417.38338769 38338769/1
## [5] 31 SRR20110417.30559613 30559613/1
## [6] 29 SRR20110417.3870902 3870902/1
## [7] 31 SRR20110417.35673597 35673597/1
## [8] 31 SRR20110417.38568471 38568471/1
## [9] 31 SRR20110417.16498017 16498017/1
## [10] 31 SRR20110417.48650389 48650389/1
library(ggplot2)
readSequences <- sread(fastq)
readSequences_AlpFreq <- alphabetFrequency(readSequences)
summed__AlpFreq <- colSums(readSequences_AlpFreq)
summed__AlpFreq[c("A","C","G","T","N")]
## A C G T N
## 1188117 840505 820889 1150389 100
readSequences_AlpbyCycle <- alphabetByCycle(readSequences)
AFreq <- readSequences_AlpbyCycle["A",]
CFreq <- readSequences_AlpbyCycle["C",]
GFreq <- readSequences_AlpbyCycle["G",]
TFreq <- readSequences_AlpbyCycle["T",]
toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq),
Cycle=rep(1:40,4),
Base=rep(c("A","C","G","T"),each=40))
ggplot(toPlot, aes(y=Count,x=Cycle,colour=Base)) + geom_line() +
theme_bw()
library(ShortRead)
fqSample <- FastqSampler("~/Downloads/SRR20110417_2.fastq.gz",n=10^6)
fastq2 <- yield(fqSample)
fastq2
## class: ShortReadQ
## length: 1000000 reads; width: 40 cycles
readSequences <- sread(fastq2)
readSequences_AlpFreq <- alphabetFrequency(readSequences)
summed__AlpFreq <- colSums(readSequences_AlpFreq)
summed__AlpFreq[c("A","C","G","T","N")]
## A C G T N
## 11580549 8126464 8888673 11403102 1212
readSequences_AlpbyCycle <- alphabetByCycle(readSequences)
AFreq <- readSequences_AlpbyCycle["A",]
CFreq <- readSequences_AlpbyCycle["C",]
GFreq <- readSequences_AlpbyCycle["G",]
TFreq <- readSequences_AlpbyCycle["T",]
toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq),
Cycle=rep(1:40,4),
Base=rep(c("A","C","G","T"),each=40))
ggplot(toPlot, aes(y=Count,x=Cycle,colour=Base)) + geom_line() +
theme_bw()
Exercise 2 - Run Rfastp
library(Rfastp)
rfastp(read1="/Users/mattpaul/Downloads/SRR20110417_1.fastq.gz", read2 = "/Users/mattpaul/Downloads/SRR20110417_2.fastq.gz",outputFastq = "SOX9CNR_D0_rep2_filtered")
The html is here
You could try: * trimming from the front of the read * adding in over representation analysis (this will take a little longer)
rfastp(read1="/Users/mattpaul/Downloads/SRR20110417_1.fastq.gz", read2 = "/Users/mattpaul/Downloads/SRR20110417_2.fastq.gz",outputFastq = "SOX9CNR_D0_rep2_filtered2",
trimFrontRead1=10,
trimFrontRead2=10,
overrepresentationAnalysis=T)
The html is here