In these exercises we will review how we can handle reads in BAM format using the Rsamtools and GAlignments package.
For these exercises we will be using some of the example data available from the BodyMap consortium.
We have already reviewed this data when looking at IGV in an earlier session.
The example data can be found in the data directory
library(Rsamtools)
library(GenomicAlignments)
<- sortBam("data/heart.bodyMap.bam","SortedHeart")
sortedHeart <- sortBam("data/liver.bodyMap.bam","SortedLiver")
sortedLiver indexBam(sortedHeart)
## SortedHeart.bam
## "SortedHeart.bam.bai"
indexBam(sortedLiver)
## SortedLiver.bam
## "SortedLiver.bam.bai"
library(ggplot2)
<- idxstatsBam(sortedHeart)
idxHeart <- idxstatsBam(sortedLiver)
idxLiver <- data.frame(Sample="Heart",idxHeart)
idxHeartDF <- data.frame(Sample="Liver",idxLiver)
idxLiverDF <- rbind(idxHeartDF,idxLiverDF)
toPlot ggplot(toPlot,aes(x=seqnames,y=mapped,fill=seqnames))+
geom_bar(stat="identity")+
facet_wrap(~Sample)+
coord_flip()+
theme_bw()+xlab("Chromosome")
Plot this again, but add a limit to the alignment of 2.5Kb in the plotting.
<- readGAlignments("data/heart.bodyMap.bam")
myReads <- data.frame(readLength=qwidth(myReads),alignmentLength=width(myReads),junctions=factor(njunc(myReads)))
toPlot ggplot(toPlot,aes(x=readLength,y=alignmentLength))+
geom_point()+facet_grid(~junctions)+
theme_minimal()+xlab("Read Length")+ylab("Alignment Length")
<- readGAlignments("data/heart.bodyMap.bam")
myReads <- data.frame(readLength=qwidth(myReads),alignmentLength=width(myReads),junctions=factor(njunc(myReads)))
toPlot ggplot(toPlot,aes(x=readLength,y=alignmentLength))+
geom_point()+facet_grid(~junctions)+
theme_minimal()+xlab("Read Length")+ylab("Alignment Length")+ylim(0,25000)
## Warning: Removed 6 rows containing missing values (geom_point).
library(rtracklayer)
export(myReads[width(myReads) > 40000],"longAlignments.bam")
<- ScanBamParam(what=c("qname","seq","qual"))
myParam <- readGAlignments("SortedHeart.bam",param = myParam) infoInReads
<- mcols(infoInReads)$qname
readnames <- length(unique(readnames))
uniqueIDs <- length(infoInReads)
totalReads
uniqueIDs
## [1] 91316
totalReads
## [1] 132381
<- infoInReads[!duplicated(readnames) & qwidth(infoInReads) == 75]
uniqueReads <- mcols(uniqueReads)$seq
seqOfReads <- alphabetFrequency(seqOfReads)
alpFreq <- colSums(alpFreq)
sumedAlpFreq <- sumedAlpFreq[c("A","C","G","T","N")]
mainBases <- data.frame(Base=names(mainBases),Freq=mainBases)
toPlot ggplot(toPlot,aes(x=Base,y=Freq,fill=Base))+geom_bar(stat="identity")+theme_bw()
library(org.Hs.eg.db)
<- AnnotationDbi::select(org.Hs.eg.db, keys = "SLC25A3", keytype = "SYMBOL",
myIds columns = c("SYMBOL", "GENENAME","ENTREZID"))
## 'select()' returned 1:1 mapping between keys and columns
<- myIds[,"ENTREZID"]
entrezIDforSLC25A3 library(TxDb.Hsapiens.UCSC.hg38.knownGene)
<- exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene,by="gene")
allExons <- allExons[[entrezIDforSLC25A3]]
exonsforSLC25A3
seqlevels(exonsforSLC25A3) <- "chr12"
for(i in 1:length(exonsforSLC25A3)){
<- exonsforSLC25A3[i]
myRegionOfInterest <- ScanBamParam(which = myRegionOfInterest)
myParam <- readGAlignments("SortedHeart.bam",param = myParam)
ReadsInExons print(length(ReadsInExons))
}
## [1] 1313
## [1] 1313
## [1] 7426
## [1] 1313
## [1] 1313
## [1] 1313
## [1] 1312
## [1] 7425
## [1] 1306
## [1] 1306
## [1] 7390
## [1] 7156
## [1] 17750
## [1] 7136
## [1] 7260
## [1] 12610
## [1] 12636
## [1] 4635
## [1] 4616
## [1] 25995
## [1] 25832
## [1] 25943
## [1] 25943
## [1] 25963
## [1] 31095
## [1] 13464
## [1] 26107
## [1] 22094
## [1] 22057
## [1] 20306
## [1] 20304
## [1] 44496
## [1] 44516
## [1] 44536
## [1] 44537
## [1] 44834
## [1] 44907
## [1] 46644
## [1] 46644