Alignment

In these exercises we will review some of the Rsubread packages functionality in alignment.

For these exercises we will be using some of the example data available from the BodyMap consortium.

We have already reviewed this data when looking at IGV in an earlier session.

The example data can be found in the data directory

  1. Identify the number of reads in heart.bodyMap.fq and liver.bodyMap.fq files.
library(Rsubread)
library(ShortRead)
library(ggplot2)
library(Rsamtools)
heartQ <- readFastq("data/heart.bodyMap.fq")
liverQ <- readFastq("data/liver.bodyMap.fq")
length(heartQ)
## [1] 132604
length(liverQ)
## [1] 48401
  1. Create a violin plot of summed read qualities for the two samples in ggplot2
heartQualities <- alphabetScore(quality(heartQ))
LiverQualities <- alphabetScore(quality(liverQ))
HFrame <- data.frame(Sample="Heart",Qualities=heartQualities)
LFrame <- data.frame(Sample="Liver",Qualities=LiverQualities)
toPlot <- rbind(HFrame,LFrame)
ggplot(toPlot,aes(x=Sample,y=Qualities,fill=Sample))+geom_violin()+theme_minimal()

  1. Plot the overall frequency of A, C, G, T and N in the two samples using ggplot2
myFreqHeart <- colSums(alphabetFrequency(sread(heartQ)))
myFreqLiver <- colSums(alphabetFrequency(sread(liverQ)))
myFreqHeartFilt <- myFreqHeart[c("A","C","G","T","N")]
myFreqLiverFilt <- myFreqLiver[c("A","C","G","T","N")]
myFreqHeartDF <- data.frame(Sample="Heart",Bases=names(myFreqHeartFilt),Frequency=myFreqHeartFilt)
myFreqLiverDF <- data.frame(Sample="Liver",Bases=names(myFreqLiverFilt),Frequency=myFreqLiverFilt)
toPlot <- rbind(myFreqHeartDF,myFreqLiverDF)
ggplot(toPlot,aes(x=Bases,y=Frequency,fill=Bases))+geom_bar(stat="identity")+theme_minimal()+facet_grid(~Sample)

  1. Create a FASTA file containing sequence for chromosome 12 from Human Hg38 genome build, with contig sequence named “chr12”. Build an index with this FASTA.
library(Rsubread)

library(BSgenome.Hsapiens.UCSC.hg38)
chr12hg38 <- BSgenome.Hsapiens.UCSC.hg38[["chr12"]]
chr12hg38Set <- DNAStringSet(list(chr12=chr12hg38))
writeXStringSet(chr12hg38Set,file="chr12.fa")
buildindex("chr12","chr12.fa")
  1. Align the two samples to the chromosome 12 FASTA file using Rsubread in a splice aware mode. Sort and index the resulting alignments
FileName <- c("data/heart.bodyMap.fq","data/liver.bodyMap.fq")
OutputName <- c("data/heart.bodyMap.bam","data/liver.bodyMap.bam")

subjunc("chr12",FileName,
        output_format = "BAM",
        output_file = OutputName)

sortBam("data/heart.bodyMap.bam","Heart")
sortBam("data/liver.bodyMap.bam","Liver")

indexBam("Heart.bam")
indexBam("Liver.bam")
  1. Count the number of unaligned reads in both samples. How many unique read IDs (QNAMEs) are there?
quickBamFlagSummary("Heart.bam")
quickBamFlagSummary("Liver.bam")
  1. Visualize the result in IGV

[HINT: Check Gene SLC25A3]