params <- list(isSlides = "no") ## ----include=FALSE------------------------------------------------------------ suppressPackageStartupMessages(require(knitr)) knitr::opts_chunk$set(echo = TRUE, tidy = T) # delete cache before any merging ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides != "yes"){ cat("# ChIPseq (part 1) --- " ) } ## ----setup2, include=FALSE,eval=FALSE,echo=FALSE------------------------------ ## library(ShortRead) ## ## fqSample <- FastqSampler("~/Downloads/ENCFF001NQP.fastq.gz",n=10^6) ## temp <- yield(fqSample) ## ## ## writeFastq(fastqSample,file = "~/Projects/Software/Github/RUBioconductor_Introduction/r_course/Data/sampled_ENCFF000CXH.fastq.gz",mode = "w") ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Working with raw ChIPseq data

--- " ) }else{ cat("# Working with raw ChIPseq data --- " ) } ## ----shortreada,include=FALSE------------------------------------------------- library(ShortRead) ## ----shortread, warning=F, message=F------------------------------------------ library(ShortRead) ## ---- echo=F,eval=F----------------------------------------------------------- ## fqSample <- FastqSampler("~/Downloads/ENCFF001NQP.fastq.gz",n=10^6) ## fastq <- yield(fqSample) ## ## writeFastq(fastq,file = "~/Documents/Box Sync/RU/Teaching/RU_side/RU_ChIPseq/chipseq/inst/extdata/data/sampled_ENCFF001NQP.fastq.gz",mode = "w") ## ## ----eval=T, echo=F----------------------------------------------------------- fastq <- readFastq(dirPath = "data/sampled_ENCFF001NQP.fastq.gz") ## ----mycRep1Reads,echo=T,eval=F----------------------------------------------- ## fqSample <- FastqSampler("~/Downloads/ENCFF001NQP.fastq.gz",n=10^6) ## fastq <- yield(fqSample) ## ----mycRep1ReadsShortReadQ,cache=TRUE,dependson="mycRep1Reads"--------------- fastq ## ----mycRep1ReadsAccessor,cache=TRUE,dependson="mycRep1Reads"----------------- readSequences <- sread(fastq) readQuality <- quality(fastq) readIDs <- id(fastq) readSequences ## ----mycRep1ReadsQScores,cache=TRUE,dependson="mycRep1Reads"------------------ readQuality <- quality(fastq) readQualities <- alphabetScore(readQuality) readQualities[1:10] ## ----mycRep1ReadsQScoresPlot,cache=TRUE,dependson="mycRep1ReadsQScores",fig.height=3,fig.width=8---- library(ggplot2) toPlot <- data.frame(ReadQ=readQualities) ggplot(toPlot,aes(x=ReadQ))+geom_histogram()+theme_minimal() ## ----mycRep1ReadsAlpFreq,cache=TRUE,dependson="mycRep1Reads"------------------ readSequences <- sread(fastq) readSequences_AlpFreq <- alphabetFrequency(readSequences) readSequences_AlpFreq[1:3,] ## ----mycRep1ReadsAlpFreqSum,cache=TRUE,dependson="mycRep1ReadsAlpFreq"-------- summed__AlpFreq <- colSums(readSequences_AlpFreq) summed__AlpFreq[c("A","C","G","T","N")] ## ----mycRep1ReadsAlpByCycle,cache=TRUE,dependson="mycRep1ReadsAlpFreq"-------- readSequences_AlpbyCycle <- alphabetByCycle(readSequences) readSequences_AlpbyCycle[1:4,1:10] ## ----mycRep1ReadsAlpByCyclePlot,cache=TRUE,dependson="mycRep1ReadsAlpFreq"---- AFreq <- readSequences_AlpbyCycle["A",] CFreq <- readSequences_AlpbyCycle["C",] GFreq <- readSequences_AlpbyCycle["G",] TFreq <- readSequences_AlpbyCycle["T",] toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq), Cycle=rep(1:36,4), Base=rep(c("A","C","G","T"),each=36)) ## ----mycRep1ReadsAlpByCyclePlot2,cache=TRUE,eval=FALSE,dependson="mycRep1ReadsAlpByCyclePlot",fig.height=4,fig.width=8---- ## ## ggplot(toPlot,aes(y=Count,x=Cycle,colour=Base))+geom_line()+ ## theme_bw() ## ----mycRep1ReadsAlpByCyclePlot3,cache=TRUE,echo=FALSE,dependson="mycRep1ReadsAlpByCyclePlot",fig.height=4,fig.width=8---- ggplot(toPlot,aes(y=Count,x=Cycle,colour=Base))+geom_line()+ylim(150000,400000)+ theme_bw() ## ----mycRep1ReadsQByCycle,cache=TRUE,dependson="mycRep1ReadsAlpFreq"---------- qualAsMatrix <- as(readQuality,"matrix") qualAsMatrix[1:2,] ## ----mycRep1ReadsQByCyclePlot,cache=TRUE,dependson="mycRep1ReadsQByCycle",fig.width=8,fig.height=4---- boxplot(qualAsMatrix[1:1000,]) ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Filtering data

--- " ) }else{ cat("# Filtering data --- " ) } ## ----out,eval=FALSE----------------------------------------------------------- ## fqStreamer <- FastqStreamer("~/Downloads/ENCFF001NQP.fastq.gz", ## n=100000) ## ----out1,eval=FALSE---------------------------------------------------------- ## TotalReads <- 0 ## TotalReadsFilt <- 0 ## while (length(fq <- yield(fqStreamer))>0) { ## TotalReads <- TotalReads+length(fq) ## filt1 <- fq[alphabetScore(fq) > 300 ] ## filt2 <- filt1[alphabetFrequency(sread(filt1))[,"N"] < 10] ## TotalReadsFilt <- TotalReadsFilt+length(filt2) ## writeFastq(filt2,"filtered_ENCFF001NQP.fastq.gz",mode="a") ## } ## ----echo=F,eval=T------------------------------------------------------------ TotalReads<-25555179 TotalReadsFilt<-22864597 ## ----out11,eval=T,echo=T------------------------------------------------------ TotalReads TotalReadsFilt ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Aligning data

--- " ) }else{ cat("# Aligning data --- " ) } ## ----fa1q, include=FALSE------------------------------------------------------ library(BSgenome.Mmusculus.UCSC.mm10) ## ----fa1, echo=TRUE----------------------------------------------------------- library(BSgenome.Mmusculus.UCSC.mm10) BSgenome.Mmusculus.UCSC.mm10 ## ----fa2,cache=FALSE,echo=TRUE------------------------------------------------ mainChromosomes <- paste0("chr",c(1:19,"X","Y","M")) mainChrSeq <- lapply(mainChromosomes, function(x)BSgenome.Mmusculus.UCSC.mm10[[x]]) names(mainChrSeq) <- mainChromosomes mainChrSeqSet <- DNAStringSet(mainChrSeq) mainChrSeqSet ## ----fa3, echo=TRUE,eval=FALSE------------------------------------------------ ## writeXStringSet(mainChrSeqSet, ## "BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa") ## ---- echo=TRUE,eval=FALSE---------------------------------------------------- ## library(Rsubread) ## buildindex("mm10_mainchrs","BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa", ## memory=8000, ## indexSplit=TRUE) ## ## ---- echo=F, eval=F---------------------------------------------------------- ## myMapped <- align("~/Documents/Box Sync/RU/Teaching/Compilation/Genomes_And_Datasets/mm10/mm10_mainchrs", ## "filtered_ENCFF001NQP.fastq.gz", ## output_format = "BAM", ## output_file = "Myc_Mel_1.bam", ## nthreads = 4) ## ## ---- echo=TRUE,eval=FALSE---------------------------------------------------- ## ## myMapped <- align("mm10_mainchrs", ## "filtered_ENCFF001NQP.fastq.gz", ## output_format = "BAM", ## output_file = "Myc_Mel_1.bam", ## type='dna', ## phredOffset = 64, ## nthreads = 4) ## ## ----sampleTabless1, echo=TRUE,eval=FALSE------------------------------------- ## library(Rbowtie2) ## ----bsgecdnoaame, echo=TRUE,eval=FALSE--------------------------------------- ## bowtie2_build(references="BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa", ## bt2Index=file.path("BSgenome.Mmusculus.UCSC.mm10.mainChrs")) ## ----bsgcdcenoaame, echo=TRUE,eval=FALSE-------------------------------------- ## library(R.utils) ## gunzip("filtered_ENCFF001NQP.fastq.gz", ## remove=FALSE) ## ## bowtie2(bt2Index = "BSgenome.Mmusculus.UCSC.mm10.mainChrs", ## samOutput = "ENCFF001NQP.sam", ## seq1 = "filtered_ENCFF001NQP.fastq") ## ----bsgenoaaxssme, echo=TRUE,eval=FALSE-------------------------------------- ## bowtieBam <- asBam("ENCFF001NQP.sam") ## ----bsgxxxnoaaxssme, echo=TRUE,eval=FALSE------------------------------------ ## unlink("ENCFF001NQP.sam") ## ## ----sortindex, echo=TRUE,eval=FALSE------------------------------------------ ## library(Rsamtools) ## sortBam("Myc_Mel_1.bam","SR_Myc_Mel_rep1") ## indexBam("SR_Myc_Mel_rep1.bam") ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Mapped data

--- " ) }else{ cat("# Mapped data --- " ) } ## ---- eval=F, echo=F---------------------------------------------------------- ## mappedReads <- idxstatsBam("SR_Myc_Mel_1.bam") ## save(mappedReads, file="data/idxstatsBam_MycMel.RData") ## ----mapped1, echo=TRUE,eval=FALSE-------------------------------------------- ## mappedReads <- idxstatsBam("SR_Myc_Mel_rep1.bam") ## TotalMapped <- sum(mappedReads[,"mapped"]) ## ggplot(mappedReads,aes(x=seqnames,y=mapped))+ ## geom_bar(stat="identity")+coord_flip() ## ----mapped, echo=FALSE,eval=TRUE,fig.width=4,fig.height=4-------------------- load("data/idxstatsBam_MycMel.RData") TotalMapped <- sum(mappedReads[,"mapped"]) suppressPackageStartupMessages(library(ggplot2)) ggplot(mappedReads,aes(x=seqnames,y=mapped))+geom_bar(stat="identity")+coord_flip() ## ----coverage, echo=TRUE,eval=FALSE------------------------------------------- ## forBigWig <- coverage("SR_Myc_Mel_rep1.bam") ## forBigWig ## ----bw, echo=TRUE,eval=FALSE------------------------------------------------- ## library(rtracklayer) ## export.bw(forBigWig,con="SR_Myc_Mel_rep1.bw") ## ----weightedCover, echo=TRUE,eval=FALSE-------------------------------------- ## forBigWig <- coverage("SR_Myc_Mel_rep1.bam", ## weight = (10^6)/TotalMapped) ## forBigWig ## export.bw(forBigWig,con="SR_Myc_Mel_rep1_weighted.bw")