These are the first exercises are about alignment and counting in RNAseq.
In todays session we will work with some of the RNAseq data of T-regulatory cells from Christina Leslie’s lab.
Sequencing data as a FASTQ file can be found here.
Aligned data as a BAM file can be found here.
Download the above FASTQ file for T-regulatory cells (replicate 2) - ENCFF070QMF.fastq.gz. Alteranitvely you can work with the much smaller sampled dataset in the data folder:
data/ENCFF070QMF_sampled.fastq.gz
Check the QC summary and then look at the GC and Quality curves.
## Before_QC After_QC
## total_reads 1.000000e+05 9.967500e+04
## total_bases 5.000000e+06 4.983750e+06
## q20_bases 4.875344e+06 4.869811e+06
## q30_bases 4.749907e+06 4.748659e+06
## q20_rate 9.750690e-01 9.771380e-01
## q30_rate 9.499810e-01 9.528280e-01
## read1_mean_length 5.000000e+01 5.000000e+01
## gc_content 4.802670e-01 4.802000e-01
Align our filtered reads to the chromosome 10 of mm10 genome. Sort and index the resulting BAM file.
library(BSgenome.Mmusculus.UCSC.mm10)
library(Rsubread)
mainChromosomes <- paste0("chr10")
mainChrSeq <- lapply(mainChromosomes,
function(x)BSgenome.Mmusculus.UCSC.mm10[[x]])
names(mainChrSeq) <- mainChromosomes
mainChrSeqSet <- DNAStringSet(mainChrSeq)
writeXStringSet(mainChrSeqSet,
"mm10Chr10.fa")
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
myExons <- exons(TxDb.Mmusculus.UCSC.mm10.knownGene,columns=c("tx_id","gene_id"))
myExons <- myExons[lengths(myExons$gene_id) == 1]
dfExons <- as.data.frame(myExons)
SAF <- data.frame(GeneID=dfExons$gene_id,
Chr=dfExons$seqnames,
Start=dfExons$start,
End=dfExons$end,
Strand=dfExons$strand)
## Rsubread
buildindex("mm10Chr10",
"mm10Chr10.fa")
subjunc("mm10Chr10",
"ENCFF070QMF_rfastp_R1.fastq.gz",
output_file="Treg_2.bam",annot.ext = SAF,isGTF = FALSE,useAnnotation = TRUE)
sortBam("Treg_2.bam","Sorted_Treg_2")
indexBam("Sorted_Treg_2.bam")
Count the reads in our newly aligned and indexed BAM file mapping
within genes. Plot a density plot of log 10 of reads counts across genes
on chromosome 10.
NOTE: Add 1 read to all counts to avoid log of zero
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
geneExons <- exonsBy(TxDb.Mmusculus.UCSC.mm10.knownGene,by="gene")
library(GenomicAlignments)
myBam <- BamFile("Sorted_Treg_2.bam",yieldSize = 10000)
treg2GeneCounts <- summarizeOverlaps(geneExons,myBam,
ignore.strand = TRUE)
myGeneGR <- rowRanges(treg2GeneCounts)
treg2GeneCountsChr10 <- treg2GeneCounts[all(seqnames(myGeneGR) ==
"chr10"),]
treg2GeneCountsChr10Matrix <- assay(treg2GeneCountsChr10)+1
myCounts <- data.frame(Counts =treg2GeneCountsChr10Matrix[,1])
ggplot(myCounts,aes(x=Counts))+geom_density(fill="Red")+scale_x_log10()+theme_minimal()
Download and install Salmon. Using Salmon, quantify transcript levels using reads from our filtered FASTQ.
allTxSeq <- extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10,
TxDb.Mmusculus.UCSC.mm10.knownGene,
use.names=TRUE)
allTxSeq
writeXStringSet(allTxSeq,
"mm10Trans.fa")
library(Herper)
install_CondaTools(tools="salmon", env="RNAseq_analysis")
fastaTx <- "mm10Trans.fa"
indexName <- "mm10Trans"
with_CondaEnv("RNAseq_analysis",
system2(command="salmon",args = c("index",
"-i",indexName,
"-t",fastaTx),
stdout = TRUE))
fq <- "~/Downloads/filtered_ENCFF070QMF.fastq.gz"
outDir <- "TReg_2_Quant"
with_CondaEnv("RNAseq_analysis",
system2(command="salmon",args = c("quant",
"-i",indexName,
"-o",outDir,
"-l A",
"-r",fq),
stdout = TRUE))
Read in the generated quant.sf file and plot log2 read counts by log10 TPM scores in a scatter plot. NOTE: If you did not run salmon yourself, the quant file can be found here: “data/Salmon/TReg_2_Quant/quant.sf”
myQuant <- read.delim("data/Salmon/TReg_2_Quant/quant.sf")
ggplot(myQuant,aes(x=NumReads,y=TPM))+geom_point()+scale_x_log10()+scale_y_log10()+theme_bw()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.