These are the first exercises are about alignment and counting in RNAseq.

In todays session we will work with some of the RNAseq data of T-regulatory cells from Christina Leslie’s lab.

Sequencing data as a FASTQ file can be found here.

Aligned data as a BAM file can be found here.

Exercises

1. Run Rfastp

Download the above FASTQ file for T-regulatory cells (replicate 2) - ENCFF070QMF.fastq.gz. Alteranitvely you can work with the much smaller sampled dataset in the data folder:

data/ENCFF070QMF_sampled.fastq.gz

library(Rfastp)
json_report <- rfastp(read1 = "data/ENCFF070QMF_sampled.fastq.gz", outputFastq = "ENCFF070QMF_rfastp")

2. Check Rfastp QC plots

Check the QC summary and then look at the GC and Quality curves.

qcSummary(json_report)
##                      Before_QC     After_QC
## total_reads       1.000000e+05 9.967500e+04
## total_bases       5.000000e+06 4.983750e+06
## q20_bases         4.875344e+06 4.869811e+06
## q30_bases         4.749907e+06 4.748659e+06
## q20_rate          9.750690e-01 9.771380e-01
## q30_rate          9.499810e-01 9.528280e-01
## read1_mean_length 5.000000e+01 5.000000e+01
## gc_content        4.802670e-01 4.802000e-01
curvePlot(json_report)

curvePlot(json_report, curve="content_curves")

2. Alignment

Align our filtered reads to the chromosome 10 of mm10 genome. Sort and index the resulting BAM file.

library(BSgenome.Mmusculus.UCSC.mm10)
library(Rsubread)

mainChromosomes <- paste0("chr10")
mainChrSeq <- lapply(mainChromosomes,
                     function(x)BSgenome.Mmusculus.UCSC.mm10[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,
                "mm10Chr10.fa")



library(TxDb.Mmusculus.UCSC.mm10.knownGene)
myExons <- exons(TxDb.Mmusculus.UCSC.mm10.knownGene,columns=c("tx_id","gene_id"))
myExons <- myExons[lengths(myExons$gene_id) == 1]
dfExons <- as.data.frame(myExons)
SAF <- data.frame(GeneID=dfExons$gene_id,
                  Chr=dfExons$seqnames,
                  Start=dfExons$start,
                  End=dfExons$end,
                  Strand=dfExons$strand)

## Rsubread

buildindex("mm10Chr10",
           "mm10Chr10.fa")
subjunc("mm10Chr10",
      "ENCFF070QMF_rfastp_R1.fastq.gz",
      output_file="Treg_2.bam",annot.ext = SAF,isGTF = FALSE,useAnnotation = TRUE)


sortBam("Treg_2.bam","Sorted_Treg_2")
indexBam("Sorted_Treg_2.bam")

4. Alignment

Count the reads in our newly aligned and indexed BAM file mapping within genes. Plot a density plot of log 10 of reads counts across genes on chromosome 10.
NOTE: Add 1 read to all counts to avoid log of zero

library(TxDb.Mmusculus.UCSC.mm10.knownGene)
geneExons <- exonsBy(TxDb.Mmusculus.UCSC.mm10.knownGene,by="gene")
library(GenomicAlignments)
myBam <- BamFile("Sorted_Treg_2.bam",yieldSize = 10000)
treg2GeneCounts <- summarizeOverlaps(geneExons,myBam,
                                    ignore.strand = TRUE)

myGeneGR <- rowRanges(treg2GeneCounts)
treg2GeneCountsChr10 <- treg2GeneCounts[all(seqnames(myGeneGR) ==
    "chr10"),]
treg2GeneCountsChr10Matrix <- assay(treg2GeneCountsChr10)+1
myCounts <- data.frame(Counts =treg2GeneCountsChr10Matrix[,1])
ggplot(myCounts,aes(x=Counts))+geom_density(fill="Red")+scale_x_log10()+theme_minimal()

5. Salmon Quantification [ADVANCED]

Download and install Salmon. Using Salmon, quantify transcript levels using reads from our filtered FASTQ.

allTxSeq <- extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10,
                      TxDb.Mmusculus.UCSC.mm10.knownGene,
                      use.names=TRUE)
allTxSeq
writeXStringSet(allTxSeq,
                "mm10Trans.fa")

library(Herper)
install_CondaTools(tools="salmon", env="RNAseq_analysis")

fastaTx <- "mm10Trans.fa"
indexName <- "mm10Trans"

with_CondaEnv("RNAseq_analysis",
                      system2(command="salmon",args = c("index",
                        "-i",indexName,
                        "-t",fastaTx),
                        
                        stdout = TRUE))


fq <- "~/Downloads/filtered_ENCFF070QMF.fastq.gz"
outDir <- "TReg_2_Quant"

with_CondaEnv("RNAseq_analysis",
                      system2(command="salmon",args = c("quant",
                        "-i",indexName,
                        "-o",outDir,
                        "-l A",
                        "-r",fq),
                        
                        stdout = TRUE))

6. Review Salmon scores

Read in the generated quant.sf file and plot log2 read counts by log10 TPM scores in a scatter plot. NOTE: If you did not run salmon yourself, the quant file can be found here: “data/Salmon/TReg_2_Quant/quant.sf”

myQuant <- read.delim("data/Salmon/TReg_2_Quant/quant.sf")
ggplot(myQuant,aes(x=NumReads,y=TPM))+geom_point()+scale_x_log10()+scale_y_log10()+theme_bw()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis