In these exercises we will review how we can handle reads in BAM format using the Rsamtools and GAlignments package.
For these exercises we will be using some of the example data available from the BodyMap consortium.
We have already reviewed this data when looking at IGV in an earlier session.
The example data can be found in the data directory
library(Rsamtools)
library(GenomicAlignments)
sortedHeart <- sortBam("data/heart.bodyMap.bam","SortedHeart")
sortedLiver <- sortBam("data/liver.bodyMap.bam","SortedLiver")
indexBam(sortedHeart)## SortedHeart.bam
## "SortedHeart.bam.bai"
indexBam(sortedLiver)## SortedLiver.bam
## "SortedLiver.bam.bai"
library(ggplot2)
idxHeart <- idxstatsBam(sortedHeart)
idxLiver <- idxstatsBam(sortedLiver)
idxHeartDF <- data.frame(Sample="Heart",idxHeart)
idxLiverDF <- data.frame(Sample="Liver",idxLiver)
toPlot <- rbind(idxHeartDF,idxLiverDF)
ggplot(toPlot,aes(x=seqnames,y=mapped,fill=seqnames))+
geom_bar(stat="identity")+
facet_wrap(~Sample)+
coord_flip()+
theme_bw()+xlab("Chromosome")Plot this again, but add a limit to the alignment of 2.5Kb in the plotting.
myReads <- readGAlignments("data/heart.bodyMap.bam")
toPlot <- data.frame(readLength=qwidth(myReads),alignmentLength=width(myReads),junctions=factor(njunc(myReads)))
ggplot(toPlot,aes(x=readLength,y=alignmentLength))+
geom_point()+facet_grid(~junctions)+
theme_minimal()+xlab("Read Length")+ylab("Alignment Length")myReads <- readGAlignments("data/heart.bodyMap.bam")
toPlot <- data.frame(readLength=qwidth(myReads),alignmentLength=width(myReads),junctions=factor(njunc(myReads)))
ggplot(toPlot,aes(x=readLength,y=alignmentLength))+
geom_point()+facet_grid(~junctions)+
theme_minimal()+xlab("Read Length")+ylab("Alignment Length")+ylim(0,25000)## Warning: Removed 6 rows containing missing values (geom_point).
library(rtracklayer)
export(myReads[width(myReads) > 40000],"longAlignments.bam")myParam <- ScanBamParam(what=c("qname","seq","qual"))
infoInReads <- readGAlignments("SortedHeart.bam",param = myParam)readnames <- mcols(infoInReads)$qname
uniqueIDs <- length(unique(readnames))
totalReads <- length(infoInReads)
uniqueIDs## [1] 91316
totalReads## [1] 132381
uniqueReads <- infoInReads[!duplicated(readnames) & qwidth(infoInReads) == 75]
seqOfReads <- mcols(uniqueReads)$seq
alpFreq <- alphabetFrequency(seqOfReads)
sumedAlpFreq <- colSums(alpFreq)
mainBases <- sumedAlpFreq[c("A","C","G","T","N")]
toPlot <- data.frame(Base=names(mainBases),Freq=mainBases)
ggplot(toPlot,aes(x=Base,y=Freq,fill=Base))+geom_bar(stat="identity")+theme_bw()library(org.Hs.eg.db)
myIds <- AnnotationDbi::select(org.Hs.eg.db, keys = "SLC25A3", keytype = "SYMBOL",
columns = c("SYMBOL", "GENENAME","ENTREZID"))## 'select()' returned 1:1 mapping between keys and columns
entrezIDforSLC25A3 <- myIds[,"ENTREZID"]
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
allExons <- exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene,by="gene")
exonsforSLC25A3 <- allExons[[entrezIDforSLC25A3]]
seqlevels(exonsforSLC25A3) <- "chr12"
for(i in 1:length(exonsforSLC25A3)){
myRegionOfInterest <- exonsforSLC25A3[i]
myParam <- ScanBamParam(which = myRegionOfInterest)
ReadsInExons <- readGAlignments("SortedHeart.bam",param = myParam)
print(length(ReadsInExons))
}## [1] 1313
## [1] 1313
## [1] 7426
## [1] 1313
## [1] 1313
## [1] 1313
## [1] 1312
## [1] 7425
## [1] 1306
## [1] 1306
## [1] 7390
## [1] 7156
## [1] 17750
## [1] 7136
## [1] 7260
## [1] 12610
## [1] 12636
## [1] 4635
## [1] 4616
## [1] 25995
## [1] 25832
## [1] 25943
## [1] 25943
## [1] 25963
## [1] 31095
## [1] 13464
## [1] 26107
## [1] 22094
## [1] 22057
## [1] 20306
## [1] 20304
## [1] 44496
## [1] 44516
## [1] 44536
## [1] 44537
## [1] 44834
## [1] 44907
## [1] 46644
## [1] 46644