params <- list(isSlides = "no") ## ----include=FALSE------------------------------------------------------------ suppressPackageStartupMessages(require(knitr)) knitr::opts_chunk$set(echo = TRUE, tidy = T) # condaSalmon <- CondaSysReqs::install_CondaTools("salmon","salmon") # pathToSalmon <- file.path(dirname(dirname(condaSalmon$pathToConda)),"envs",condaSalmon$environment,"bin","salmon") ## ---- echo=F, eval=F---------------------------------------------------------- ## if(!file.exists("~/ENCFF332KDA.fastq.gz")){ ## download.file("https://www.encodeproject.org/files/ENCFF332KDA/@@download/ENCFF332KDA.fastq.gz","~/ENCFF332KDA.fastq.gz") ## } ## # if(!file.exists("~/ENCFF070QMF.fastq.gz")){ ## # download.file("https://www.encodeproject.org/files/ENCFF070QMF/@@download/ENCFF070QMF.fastq.gz","~/ENCFF070QMF.fastq.gz") ## # } ## ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides != "yes"){ cat("# RNAseq (part 1) --- " ) } ## ---- eval=F------------------------------------------------------------------ ## setwd("Path/to/Download/RU_RNAseq-master") ## ## ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Working with raw RNAseq data

--- " ) }else{ cat("# Working with raw RNAseq data --- " ) } ## ----shortreada,include=FALSE------------------------------------------------- library(ShortRead) library(Rfastp) library(ggplot2) library(TxDb.Mmusculus.UCSC.mm10.knownGene) ## ----tregFQ1, eval=F, echo=T-------------------------------------------------- ## ## fq<-"https://www.encodeproject.org/files/ENCFF332KDA/@@download/ENCFF332KDA.fastq.gz" ## download.file(fq, "ENCFF332KDA.fastq.gz") ## ## ----------------------------------------------------------------------------- json_report <- rfastp(read1 = "data/ENCFF332KDA_sampled.fastq.gz", outputFastq = "ENCFF332KDA_rfastp") ## ----tregShow,eval=T, echo=F-------------------------------------------------- dir(pattern = "ENCFF332KDA_rfastp") ## ----------------------------------------------------------------------------- qcSummary(json_report) ## ----------------------------------------------------------------------------- curvePlot(json_report) ## ----------------------------------------------------------------------------- curvePlot(json_report, curve="content_curves") ## ----------------------------------------------------------------------------- ?rfastp ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Aligning RNAseq data

--- " ) }else{ cat("# Aligning RNAseq data --- " ) } ## ----fa1q, include=FALSE------------------------------------------------------ library(BSgenome.Mmusculus.UCSC.mm10) library(TxDb.Mmusculus.UCSC.mm10.knownGene) ## ----fa1, echo=TRUE----------------------------------------------------------- library(BSgenome.Mmusculus.UCSC.mm10) BSgenome.Mmusculus.UCSC.mm10 ## ----fa2,cache=FALSE,echo=TRUE------------------------------------------------ mainChromosomes <- paste0("chr",c(1:19,"X","Y","M")) mainChrSeq <- lapply(mainChromosomes, function(x) BSgenome.Mmusculus.UCSC.mm10[[x]]) names(mainChrSeq) <- mainChromosomes mainChrSeqSet <- DNAStringSet(mainChrSeq) mainChrSeqSet ## ----fa3, echo=TRUE,eval=FALSE------------------------------------------------ ## ## writeXStringSet(mainChrSeqSet, ## "BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa") ## ---- echo=TRUE,eval=FALSE---------------------------------------------------- ## library(Rsubread) ## buildindex("mm10_mainchrs", ## "BSgenome.Mmusculus.UCSC.mm10.mainChrs.fa", ## memory=8000, ## indexSplit=TRUE) ## ## ---- echo=TRUE, eval=TRUE---------------------------------------------------- library(TxDb.Mmusculus.UCSC.mm10.knownGene) myExons <- exons(TxDb.Mmusculus.UCSC.mm10.knownGene,columns=c("tx_id","gene_id")) myExons <- myExons[lengths(myExons$gene_id) == 1] myExons ## ---- echo=TRUE, eval=TRUE---------------------------------------------------- dfExons <- as.data.frame(myExons) SAF <- data.frame(GeneID=dfExons$gene_id, Chr=dfExons$seqnames, Start=dfExons$start, End=dfExons$end, Strand=dfExons$strand) ## ---- echo=TRUE, eval=FALSE--------------------------------------------------- ## ## myMapped <- subjunc("mm10_mainchrs", ## "ENCFF332KDA.fastq.gz", ## output_format = "BAM", ## output_file = "Treg_1.bam", ## useAnnotation = TRUE, ## annot.ext = SAF, ## isGTF=FALSE, ## nthreads = 4) ## ## ----sortindex, echo=TRUE, eval=FALSE----------------------------------------- ## library(Rsamtools) ## sortBam("Treg_1.bam","Sorted_Treg_1") ## indexBam("Sorted_Treg_1.bam") ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Counting with aligned RNAseq data

--- " ) }else{ cat("# Counting with aligned RNAseq data --- " ) } ## ----gm, echo=TRUE,eval=TRUE,cache=TRUE--------------------------------------- library(TxDb.Mmusculus.UCSC.mm10.knownGene) geneExons <- exonsBy(TxDb.Mmusculus.UCSC.mm10.knownGene,by="gene") class(geneExons) ## ----gm2,echo=TRUE,eval=TRUE,cache=TRUE,dependson="gm"------------------------ geneExons[1:2] ## ----gm0,echo=TRUE,eval=F----------------------------------------------------- ## library(GenomicAlignments) ## myBam <- BamFile("Sorted_Treg_1.bam",yieldSize = 10000) ## tregGeneCounts <- summarizeOverlaps(geneExons,myBam, ## ignore.strand = TRUE) ## tregGeneCounts ## ----gm3,echo=FALSE,eval=TRUE,cache=TRUE-------------------------------------- load("data/myTregCounts2020.RData") tregGeneCounts ## ----em2,echo=TRUE,eval=TRUE,cache=TRUE,dependson="em"------------------------ library(GenomicFeatures) nonOverlappingExons <- disjointExons(TxDb.Mmusculus.UCSC.mm10.knownGene) names(nonOverlappingExons) <- paste(mcols(nonOverlappingExons)$gene_id, mcols(nonOverlappingExons)$exonic_part, sep="_") nonOverlappingExons[1:3,] ## ----em3,echo=TRUE,eval=FALSE,cache=TRUE,dependson="em2"---------------------- ## tregExonCounts <- summarizeOverlaps(nonOverlappingExons, ## myBam, ## ignore.strand = TRUE, ## inter.feature=FALSE) ## ----em4A,echo=FALSE,eval=TRUE,cache=TRUE------------------------------------- load("data/myTregExonCounts2020.RData") ## ----em4,echo=TRUE,eval=TRUE,cache=TRUE,dependson=c("em4A","gm3")------------- geneCounts <- assay(tregGeneCounts) exonCounts <- assay(tregExonCounts) head(exonCounts,2) head(geneCounts,2) ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Transcript quantification with pseudo-alignment

--- " ) }else{ cat("# Transcript quantification with pseudo-alignment --- " ) } ## ---- eval=F------------------------------------------------------------------ ## BiocManager::install("Herper") ## library(Herper) ## ## ---- echo=T, eval=F---------------------------------------------------------- ## salmon_paths <- install_CondaTools(tools="salmon", env="RNAseq_analysis") ## salmon_paths ## ---- eval=F, echo=F---------------------------------------------------------- ## tempdir2 <- function() { ## tempDir <- tempdir() ## if(dir.exists(tempDir)){ ## tempDir <- file.path(tempDir,"rr") ## } ## tempDir <- gsub("\\", "/", tempDir, fixed = TRUE) ## tempDir ## } ## ## myMiniconda <- file.path(tempdir2(), "Test") ## install_CondaTools(tools="salmon", env="RNAseq_analysis", pathToMiniConda = myMiniconda) ## ## ----sal1,echo=TRUE,eval=F,cache=TRUE----------------------------------------- ## allTxSeq <- extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10, ## TxDb.Mmusculus.UCSC.mm10.knownGene, ## use.names=TRUE) ## allTxSeq ## ----sal2,echo=F,eval=F,cache=TRUE,dependson="sal1"--------------------------- ## writeXStringSet(allTxSeq,"mm10Trans.fa") ## ----sal0,echo=TRUE,eval=F---------------------------------------------------- ## writeXStringSet(allTxSeq, ## "mm10Trans.fa") ## ----sal2t,echo=T,eval=F,dependson="sal1", cache.lazy = FALSE---------------- ## mainChromosomes <- paste0("chr",c(1:19,"X","Y","M")) ## mainChrSeq <- lapply(mainChromosomes, ## function(x)BSgenome.Mmusculus.UCSC.mm10[[x]]) ## names(mainChrSeq) <- mainChromosomes ## mainChrSeqSet <- DNAStringSet(mainChrSeq) ## gentrome <- c(allTxSeq,mainChrSeqSet) ## ----sal0t,echo=TRUE,eval=F--------------------------------------------------- ## writeXStringSet(gentrome, ## "mm10Gentrome.fa") ## write.table(as.data.frame(mainChromosomes),"decoy.txt", ## row.names = FALSE, ## col.names = FALSE, ## quote=FALSE) ## ----salI_1,echo=TRUE,eval=F,dependson="sal1", warning=F---------------------- ## ## fastaTx <- "mm10Trans.fa" ## indexName <- "mm10Trans" ## ## with_CondaEnv("RNAseq_analysis", ## system2(command="salmon",args = c("index", ## "-i",indexName, ## "-t",fastaTx), ## ## stdout = TRUE)) ## ## ---- eval=F, echo=F, message=F, warning=F------------------------------------ ## ## with_CondaEnv("RNAseq_analysis", ## system2(command="salmon",args = c("quant", "--help-alignment"), ## stdout = TRUE), ## pathToMiniConda = myMiniconda) ## ## ## ----salI_1t,echo=TRUE,eval=F------------------------------------------------- ## ## with_CondaEnv("RNAseq_analysis", ## system2(command="salmon",args = c("index", ## "-i",indexName, ## "-t",fastaTx, ## "-d decoy.txt"), ## ## stdout = TRUE)) ## ## ----salQ_1,echo=TRUE,eval=F-------------------------------------------------- ## fq <- "ENCFF332KDA.fastq.gz" ## outDir <- "TReg_1_Quant" ## ## with_CondaEnv("RNAseq_analysis", ## system2(command="salmon",args = c("quant", ## "-i",indexName, ## "-o",outDir, ## "-l A", ## "-r",fq), ## ## stdout = TRUE)) ## ## ---- eval=F, echo=F---------------------------------------------------------- ## ## fq <- "data/ENCFF332KDA_sampled.fastq.gz" ## outDir <- "TReg_1_Quant" ## ## with_CondaEnv("RNAseq_analysis", ## system2(command="salmon",args = c("quant", ## "-i",indexName, ## "-o",outDir, ## "-l A", ## "-r",fq), ## ## stdout = TRUE), ## pathToMiniConda = myMiniconda) ## ----include=FALSE,eval=FALSE------------------------------------------------- ## salmonExec <- paste0(pathToSalmon," quant") ## fq <- "ENCFF001LDC.fastq.gz" ## outDir <- "Liver1" ## salmonQuantCmd <- paste(salmonExec, ## "-i",indexName, ## "-o",outDir, ## "-l A", ## "-r",fq) ## salmonQuantCmd ## system(salmonQuantCmd, wait = TRUE) ## ## salmonExec <- paste0(pathToSalmon," quant") ## fq <- "ENCFF001LCY.fastq.gz" ## outDir <- "Liver2" ## salmonQuantCmd <- paste(salmonExec, ## "-i",indexName, ## "-o",outDir, ## "-l A", ## "-r",fq) ## salmonQuantCmd ## system(salmonQuantCmd, wait = TRUE) ## ## salmonExec <- paste0(pathToSalmon," quant") ## fq <- "ENCFF001LCD.fastq.gz" ## outDir <- "Heart1" ## salmonQuantCmd <- paste(salmonExec, ## "-i",indexName, ## "-o",outDir, ## "-l A", ## "-r",fq) ## salmonQuantCmd ## system(salmonQuantCmd, wait = TRUE) ## ## salmonExec <- paste0(pathToSalmon," quant") ## fq <- "ENCFF001LCE.fastq.gz" ## outDir <- "Heart2" ## salmonQuantCmd <- paste(salmonExec, ## "-i",indexName, ## "-o",outDir, ## "-l A", ## "-r",fq) ## salmonQuantCmd ## system(salmonQuantCmd, wait = TRUE) ## ----salReadIn,echo=TRUE,eval=FALSE,cache=TRUE-------------------------------- ## myQuant <- read.delim("TReg_1_Quant/quant.sf") ## myQuant[1:3,] ## ----salReadIn2,echo=FALSE,eval=TRUE,cache=TRUE------------------------------- myQuant <- read.delim("data/TReg_1_Quant/quant.sf") myQuant[1:3,]