params <- list(isSlides = "no") ## ----include=FALSE------------------------------------------------------------ require(ShortRead) library(ggplot2) suppressPackageStartupMessages(require(knitr)) knitr::opts_chunk$set(echo = TRUE, tidy = T) ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides != "yes"){ cat("# ATACseq (part 1) --- " ) } ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Alignment for ATACseq

--- " ) }else{ cat("# Alignment for ATACseq --- " ) } ## ----processData_BuildIndex, echo=TRUE,eval=FALSE,cache=FALSE,tidy=FALSE------ ## library(BSgenome.Hsapiens.UCSC.hg19) ## mainChromosomes <- paste0("chr",c(1:21,"X","Y","M")) ## mainChrSeq <- lapply(mainChromosomes, ## function(x)BSgenome.Hsapiens.UCSC.hg19[[x]]) ## names(mainChrSeq) <- mainChromosomes ## mainChrSeqSet <- DNAStringSet(mainChrSeq) ## writeXStringSet(mainChrSeqSet, ## "BSgenome.Hsapiens.UCSC.hg19.mainChrs.fa") ## ----index, echo=TRUE,eval=FALSE,tidy=FALSE----------------------------------- ## library(Rsubread) ## buildindex("BSgenome.Hsapiens.UCSC.hg19.mainChrs", ## "BSgenome.Hsapiens.UCSC.hg19.mainChrs.fa", ## indexSplit = TRUE, ## memory = 1000) ## ----------------------------------------------------------------------------- read1 <- "ATAC_Data/ATAC_FQs/SRR891269_1.fastq.gz" read2 <- "ATAC_Data/ATAC_FQs/SRR891269_2.fastq.gz" ## ----eval=FALSE,include=FALSE------------------------------------------------- ## require(ShortRead) ## read1 <- readFastq("~/Downloads/ENCFF175VOD.fastq.gz") ## read2 <- readFastq("~/Downloads/ENCFF447BGX.fastq.gz") ## writeFastq(read1[1:1000,],"ATACSample_r1.fastq.gz") ## writeFastq(read2[1:1000,],"ATACSample_r2.fastq.gz") ## # id(read2[1:1000,]) ## # myRes <- bamQC("~/Downloads/Sorted_ATAC_50K_2.bam") ## ----eval=TRUE---------------------------------------------------------------- require(ShortRead) read1 <- readFastq("data/ATACSample_r1.fastq.gz") read2 <- readFastq("data/ATACSample_r2.fastq.gz") id(read1)[1:2] id(read2)[1:2] ## ----processData_acdc, include=FALSE, eval=F---------------------------------- ## setwd("~/Projects/Results/chipseq/testRunforTalk/") ## ----processData_align, echo=TRUE,eval=FALSE,cache=FALSE,tidy=FALSE----------- ## ## align("BSgenome.Hsapiens.UCSC.hg19.mainChrs", ## readfile1=read1,readfile2=read2, ## output_file = "ATAC_50K_2.bam", ## nthreads=2,type=1, ## unique=TRUE,maxFragLength = 2000) ## ## ----indedsx, echo=TRUE,eval=FALSE,tidy=FALSE--------------------------------- ## library(Rbowtie2) ## bowtie2_build(references="BSgenome.Hsapiens.UCSC.hg19.mainChrs.fa", ## bt2Index="BSgenome.Hsapiens.UCSC.hg19.mainChrs_bowtie2") ## ----indeaax, echo=TRUE, eval=FALSE------------------------------------------- ## gunzip("ATAC_Data/ATAC_FQs/SRR891269_1.fastq.gz") ## gunzip("ATAC_Data/ATAC_FQs/SRR891269_2.fastq.gz") ## ----indexas, echo=TRUE,eval=FALSE,tidy=FALSE--------------------------------- ## library(Rsamtools) ## bowtie2(bt2Index = "BSgenome.Hsapiens.UCSC.hg19.mainChrs_bowtie2", ## samOutput = "ATAC_50K_2_bowtie2.sam", ## seq1 = "ATAC_Data/ATAC_FQs/SRR891269_1.fastq", ## seq1 = "ATAC_Data/ATAC_FQs/SRR891269_2.fastq" ## ) ## asBam("ATAC_50K_2_bowtie2.sam") ## ----processData_indexAndSort, echo=TRUE,eval=FALSE,cache=FALSE,tidy=FALSE---- ## library(Rsamtools) ## sortedBAM <- file.path(dirname(outBAM), ## paste0("Sorted_",basename(outBAM)) ## ) ## ## sortBam(outBAM,gsub("\\.bam","",basename(sortedBAM))) ## indexBam(sortedBAM) ## ---- eval=F, echo=F---------------------------------------------------------- ## mappedReads <- idxstatsBam(sortedBAM) ## save(mappedReads,file="data/pres1_idxstats.RData") ## ---- eval=T, echo=F,cache=TRUE,dependson="processData_setBAM"---------------- load("data/pres1_idxstats.RData") ## ----quickMappingStatsPerChromosomea, echo=TRUE,eval=F------------------------ ## library(Rsamtools) ## mappedReads <- idxstatsBam(sortedBAM) ## ## ----quickMappingStatsPerChromosomes, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_setBAM",fig.height=4,fig.width=6---- library(ggplot2) ggplot(mappedReads,aes(seqnames,mapped,fill=seqnames))+ geom_bar(stat="identity")+coord_flip() ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Post-alignment processing

--- " ) }else{ cat("# Post-alignment processing --- " ) } ## ----processData_readingInDatad, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_setBAM"---- library(GenomicAlignments) flags=scanBamFlag(isProperPair = TRUE) ## ----processData_readingInDatas, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_setBAM"---- myParam=ScanBamParam(flag=flags, what=c("qname","mapq","isize"), which=GRanges("chr20", IRanges(1,63025520))) myParam ## ---- eval=F, echo=F---------------------------------------------------------- ## atacReads <- readGAlignmentPairs("~/Downloads/ATAC_Workshop/Sorted_ATAC_50K_2.bam", ## param=myParam) ## save(atacReads, file="data/pres1_atacreads.RData") ## ----processData_readingInDataa, echo=TRUE,eval=F----------------------------- ## atacReads <- readGAlignmentPairs(sortedBAM, ## param=myParam) ## class(atacReads) ## ## ---- eval=T, echo=F---------------------------------------------------------- load("data/pres1_atacreads.RData") class(atacReads) ## ----processData_readingInData, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_setBAM"---- atacReads[1:2,] ## ----processData_readingInData2, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_setBAM"---- read1 <- first(atacReads) read2 <- second(atacReads) read2[1,] ## ----processData_readingInData3, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_readingInData2"---- read1MapQ <- mcols(read1)$mapq read2MapQ <- mcols(read2)$mapq read1MapQ[1:2] ## ----processData_readingInData4, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_readingInData3"---- read1MapQFreqs <- table(read1MapQ) read2MapQFreqs <- table(read2MapQ) read1MapQFreqs read2MapQFreqs ## ----processData_readingInData5,fig.width=9,fig.height=4, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_readingInData4"---- library(ggplot2) toPlot <- data.frame(MapQ=c(names(read1MapQFreqs),names(read2MapQFreqs)), Frequency=c(read1MapQFreqs,read2MapQFreqs), Read=c(rep("Read1",length(read1MapQFreqs)),rep("Read2",length(read2MapQFreqs)))) toPlot$MapQ <- factor(toPlot$MapQ,levels = unique(sort(as.numeric(toPlot$MapQ)))) ggplot(toPlot,aes(x=MapQ,y=Frequency,fill=MapQ))+geom_bar(stat="identity")+facet_grid(~Read) ## ----processData_extractingReadsss1, echo=TRUE,eval=FALSE,cache=TRUE,dependson="processData_readingInData"---- ## atacReads_read1 <- first(atacReads) ## insertSizes <- abs(elementMetadata(atacReads_read1)$isize) ## head(insertSizes) ## ----processData_extractingRead1, echo=FALSE,eval=TRUE,cache=TRUE,dependson="processData_readingInData"---- atacReads_read1 <- GenomicAlignments::first(atacReads) insertSizes <- abs(elementMetadata(atacReads_read1)$isize) head(insertSizes) ## ----processData_plottingFrffagmentLengths, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_extractingRead1"---- fragLenSizes <- table(insertSizes) fragLenSizes[1:5] ## ----processData_plottingFrdagmentLengths, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_extractingRead1"---- library(ggplot2) toPlot <- data.frame(InsertSize=as.numeric(names(fragLenSizes)), Count=as.numeric(fragLenSizes)) fragLenPlot <- ggplot(toPlot,aes(x=InsertSize,y=Count))+geom_line() ## ----processData_plottfingFragmentLengths2, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_extractingRead1"---- fragLenPlot+theme_bw() ## ----processData_plottingFragmentLengths3, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_extractingRead1"---- fragLenPlot + scale_y_continuous(trans='log2')+theme_bw() ## ----processData_plottingFragmentLengths24, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_plottingFragmentLengths",fig.width=6,fig.height=4---- fragLenPlot+ scale_y_continuous(trans='log2')+ geom_vline(xintercept = c(180,247),colour="red")+ geom_vline(xintercept = c(315,437),colour="darkblue")+ geom_vline(xintercept = c(100),colour="darkgreen")+theme_bw() ## ----processData_createOpenRegionBAM, echo=TRUE,eval=TRUE,cache=TRUE,dependson=c("processData_extractingRead1","processData_readingInData")---- atacReads_NucFree <- atacReads[insertSizes < 100,] atacReads_MonoNuc <- atacReads[insertSizes > 180 & insertSizes < 240,] atacReads_diNuc <- atacReads[insertSizes > 315 & insertSizes < 437,] ## ----processData_createOpenRegionBAM_2, echo=TRUE,eval=FALSE,cache=TRUE,dependson="processData_createOpenRegionBAM"---- ## nucFreeRegionBam <- gsub("\\.bam","_nucFreeRegions\\.bam",sortedBAM) ## monoNucBam <- gsub("\\.bam","_monoNuc\\.bam",sortedBAM) ## diNucBam <- gsub("\\.bam","_diNuc\\.bam",sortedBAM) ## ## library(rtracklayer) ## export(atacReads_NucFree,nucFreeRegionBam,format = "bam") ## export(atacReads_MonoNuc,monoNucBam,format = "bam") ## export(atacReads_diNuc,diNucBam,format = "bam") ## ----processData_createOpenRegionBigWig_2, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_createOpenRegionBAM"---- atacReads[1,] atacFragments<- granges(atacReads) atacFragments[1,] ## ----processData_createOpenRegionBigWig_5, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_createOpenRegionBigWig_2"---- duplicatedFragments <- sum(duplicated(atacFragments)) totalFragments <- length(atacFragments) duplicateRate <- duplicatedFragments/totalFragments nonRedundantFraction <- 1-duplicateRate nonRedundantFraction ## ----processData_createOpenRegionBigWig_21, echo=TRUE,eval=FALSE,cache=TRUE,dependson="processData_createOpenRegionBigWig_2"---- ## openRegionRPMBigWig <- gsub("\\.bam","_openRegionRPM\\.bw",sortedBAM) ## myCoverage <- coverage(atacFragments, ## weight = (10^6/length(atacFragments))) ## export.bw(myCoverage,openRegionRPMBigWig) ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # ATACseqQC

--- " ) }else{ cat("# ATACseqQC --- " ) } ## ----eval=FALSE--------------------------------------------------------------- ## BiocManager::install("ATACseqQC") ## ----eval=FALSE--------------------------------------------------------------- ## library(ATACseqQC) ## ATACQC <- bamQC("~/Downloads/Sorted_ATAC_50K_2_ch17.bam") ## ----eval=TRUE,echo=FALSE,include=FALSE--------------------------------------- load("data/ATACQC.RData") ## ----------------------------------------------------------------------------- names(ATACQC) ## ----------------------------------------------------------------------------- ATACQC$PCRbottleneckCoefficient_1 ## ----------------------------------------------------------------------------- ATACQC$PCRbottleneckCoefficient_2