These are the first exercises are about alignment and counting in RNAseq.
In todays session we will work with some of the RNAseq data of T-regulatory cells from Christina Leslie’s lab.
Sequencing data as a FASTQ file can be found here.
Aligned data as a BAM file can be found here.
Download the above FASTQ file for T-regulatory cells (replicate 2) - ENCFF070QMF.fastq.gz. Alteranitvely you can work with the much smaller sampled dataset in the data folder:
data/ENCFF070QMF_sampled.fastq.gz
library(Rfastp)
<- rfastp(read1 = "data/ENCFF070QMF_sampled.fastq.gz", outputFastq = "ENCFF070QMF_rfastp") json_report
Check the QC summary and then look at the GC and Quality curves.
qcSummary(json_report)
## Before_QC After_QC
## total_reads 1.000000e+05 9.967500e+04
## total_bases 5.000000e+06 4.983750e+06
## q20_bases 4.875344e+06 4.869811e+06
## q30_bases 4.749907e+06 4.748659e+06
## q20_rate 9.750690e-01 9.771380e-01
## q30_rate 9.499810e-01 9.528280e-01
## read1_mean_length 5.000000e+01 5.000000e+01
## gc_content 4.802670e-01 4.802000e-01
curvePlot(json_report)
curvePlot(json_report, curve="content_curves")
Align our filtered reads to the chromosome 10 of mm10 genome. Sort and index the resulting BAM file.
library(BSgenome.Mmusculus.UCSC.mm10)
library(Rsubread)
<- paste0("chr10")
mainChromosomes <- lapply(mainChromosomes,
mainChrSeq function(x)BSgenome.Mmusculus.UCSC.mm10[[x]])
names(mainChrSeq) <- mainChromosomes
<- DNAStringSet(mainChrSeq)
mainChrSeqSet writeXStringSet(mainChrSeqSet,
"mm10Chr10.fa")
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
<- exons(TxDb.Mmusculus.UCSC.mm10.knownGene,columns=c("tx_id","gene_id"))
myExons <- myExons[lengths(myExons$gene_id) == 1]
myExons <- as.data.frame(myExons)
dfExons <- data.frame(GeneID=dfExons$gene_id,
SAF Chr=dfExons$seqnames,
Start=dfExons$start,
End=dfExons$end,
Strand=dfExons$strand)
## Rsubread
buildindex("mm10Chr10",
"mm10Chr10.fa")
subjunc("mm10Chr10",
"ENCFF070QMF_rfastp_R1.fastq.gz",
output_file="Treg_2.bam",annot.ext = SAF,isGTF = FALSE,useAnnotation = TRUE)
sortBam("Treg_2.bam","Sorted_Treg_2")
indexBam("Sorted_Treg_2.bam")
Count the reads in our newly aligned and indexed BAM file mapping within genes. Plot a density plot of log 10 of reads counts across genes on chromosome 10.
NOTE: Add 1 read to all counts to avoid log of zero
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
<- exonsBy(TxDb.Mmusculus.UCSC.mm10.knownGene,by="gene")
geneExons library(GenomicAlignments)
<- BamFile("Sorted_Treg_2.bam",yieldSize = 10000)
myBam <- summarizeOverlaps(geneExons,myBam,
treg2GeneCounts ignore.strand = TRUE)
<- rowRanges(treg2GeneCounts)
myGeneGR <- treg2GeneCounts[all(seqnames(myGeneGR) ==
treg2GeneCountsChr10 "chr10"),]
<- assay(treg2GeneCountsChr10)+1
treg2GeneCountsChr10Matrix <- data.frame(Counts =treg2GeneCountsChr10Matrix[,1])
myCounts ggplot(myCounts,aes(x=Counts))+geom_density(fill="Red")+scale_x_log10()+theme_minimal()
Download and install Salmon. Using Salmon, quantify transcript levels using reads from our filtered FASTQ.
<- extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10,
allTxSeq
TxDb.Mmusculus.UCSC.mm10.knownGene,use.names=TRUE)
allTxSeqwriteXStringSet(allTxSeq,
"mm10Trans.fa")
library(Herper)
install_CondaTools(tools="salmon", env="RNAseq_analysis")
<- "mm10Trans.fa"
fastaTx <- "mm10Trans"
indexName
with_CondaEnv("RNAseq_analysis",
system2(command="salmon",args = c("index",
"-i",indexName,
"-t",fastaTx),
stdout = TRUE))
<- "~/Downloads/filtered_ENCFF070QMF.fastq.gz"
fq <- "TReg_2_Quant"
outDir
with_CondaEnv("RNAseq_analysis",
system2(command="salmon",args = c("quant",
"-i",indexName,
"-o",outDir,
"-l A",
"-r",fq),
stdout = TRUE))
Read in the generated quant.sf file and plot log2 read counts by log10 TPM scores in a scatter plot. NOTE: If you did not run salmon yourself, the quant file can be found here: “data/Salmon/TReg_2_Quant/quant.sf”
<- read.delim("data/Salmon/TReg_2_Quant/quant.sf")
myQuant ggplot(myQuant,aes(x=NumReads,y=TPM))+geom_point()+scale_x_log10()+scale_y_log10()+theme_bw()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis