Summarizing Data in Bioconductor exercises

Summarizing data

In these exercises we will review some of the functionality for summarizing counts and signal across genomes and within regions.

The example data can be found in the data directory

data/heart.bodyMap.bam data/liver.bodyMap.bam

Sort and index both the heart.bodyMap.bam and liver.bodyMap.bam files.

library(GenomicAlignments)
library(Rsamtools)
sortBam("data/liver.bodyMap.bam","Liver")

## [1] "Liver.bam"

sortBam("data/heart.bodyMap.bam","Heart")

## [1] "Heart.bam"

indexBam("Liver.bam")

##       Liver.bam 
## "Liver.bam.bai"

indexBam("Heart.bam")

##       Heart.bam 
## "Heart.bam.bai"

Calculate the coverage from the sorted, indexed BAM files normalised to the total mapped reads per sample.

liverMapped <- idxstatsBam("Liver.bam")
heartMapped <- idxstatsBam("Heart.bam")
totalLiver <- sum(liverMapped[,"mapped"])
totalHeart <- sum(heartMapped[,"mapped"])

liverCoverageNorm <- coverage("Liver.bam",weight = 1/totalLiver)
heartCoverageNorm <- coverage("Heart.bam",weight = 1/totalHeart)

Plot the coverage for Heart and Liver samples over region Chr12 98,986,183-98,998,558 using ggplot2

heartCoverageNorm12 <- heartCoverageNorm[["chr12"]]
signalDepthHeart <- heartCoverageNorm12[98592587:98607803]
signalDepthScaled <- data.frame(Sample="Heart",
                                Position=98592587:98607803,
                                Signal=signalDepthHeart)
liverCoverageNorm12 <- liverCoverageNorm[["chr12"]]
signalDepthLiver <- liverCoverageNorm12[98592587:98607803]
signalDepthScaled2 <- data.frame(Sample="Liver",
                                Position=98592587:98607803,
                                Signal=signalDepthLiver)

toPlot <- rbind(signalDepthScaled,signalDepthScaled2)

library(ggplot2)
ggplot(toPlot,aes(x=Position,y=Signal,colour=Sample))+
  geom_line()+theme_minimal()+ggtitle("Coverage over Chr12 98,986,183-98,998,558 for liver and heart samples")

Using the normalised coverage for heart and liver samples, calculate the sum coverage within TSS (+500bp/-500bp) of the TMPO, SLC25A3 and IKBIP genes. Create a barplot of the sum coverage for each gene using ggplot2.

library(org.Hs.eg.db)
myIds <- AnnotationDbi::select(org.Hs.eg.db, keys = c("SLC25A3","TMPO","IKBIP"), keytype = "SYMBOL", 
            columns = c("SYMBOL", "GENENAME","ENTREZID"))

## 'select()' returned 1:1 mapping between keys and columns

entrezIDforGenes <- myIds[,"ENTREZID"]
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

hg38Genes <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)

##   228 genes were dropped because they have exons located on both strands
##   of the same reference sequence or on more than one reference sequence,
##   so cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.

myPromoters <- promoters(hg38Genes,500,500)
prms <- myPromoters[myPromoters$gene_id %in% entrezIDforGenes,]

liverSums <- sum(liverCoverageNorm[prms])
heartSums <- sum(heartCoverageNorm[prms])

lFfra <- data.frame(Sample="Liver",ENTREZID=prms$gene_id,Coverage=liverSums)
hFfra <- data.frame(Sample="Heart",ENTREZID=prms$gene_id,Coverage=heartSums)
covFrame <- rbind(lFfra,hFfra)

toPlot <- merge(myIds,covFrame,by="ENTREZID")
ggplot(toPlot,aes(x=SYMBOL,y=Coverage,fill=Sample))+geom_bar(stat="identity",position = "dodge")+theme_bw()

Using the summarizeOverlaps() function, count reads from Heart and liver samples in the TMPO, SLC25A3 and IKBIP genes. Normalise reads to total number of mapped reads in the sample per million mapped reads and plot the mean counts per gene versus the log2 fold difference between heart and liver samples.

library(org.Hs.eg.db)
myIds <- AnnotationDbi::select(org.Hs.eg.db, keys = c("SLC25A3","TMPO","IKBIP"), keytype = "SYMBOL", 
            columns = c("SYMBOL", "GENENAME","ENTREZID"))

## 'select()' returned 1:1 mapping between keys and columns

entrezIDforGenes <- myIds[,"ENTREZID"]
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

hg38Genes <- exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene,by="gene")

exByGenes <- hg38Genes[names(hg38Genes) %in% entrezIDforGenes,]

sortBam("data/liver.bodyMap.bam","Liver")

## [1] "Liver.bam"

sortBam("data/heart.bodyMap.bam","Heart")

## [1] "Heart.bam"

indexBam("Liver.bam")

##       Liver.bam 
## "Liver.bam.bai"

indexBam("Heart.bam")

##       Heart.bam 
## "Heart.bam.bai"

liverMapped <- idxstatsBam("Liver.bam")
heartMapped <- idxstatsBam("Heart.bam")
totalLiver <- sum(liverMapped[,"mapped"])
totalHeart <- sum(heartMapped[,"mapped"])

newCounts <- summarizeOverlaps(exByGenes,c("Liver.bam","Heart.bam"))
cMatrix <- assay(newCounts)

cMatrix[,"Liver.bam"] <- (cMatrix[,"Liver.bam"]/totalLiver)*(10^6)
cMatrix[,"Heart.bam"] <- (cMatrix[,"Heart.bam"]/totalHeart)*(10^6)


toPlot <- data.frame(ENTREZID=rownames(cMatrix),mean=rowMeans(cMatrix),log2FC=log2(cMatrix[,"Liver.bam"]/cMatrix[,"Heart.bam"]))
toPlot <- merge(myIds,toPlot,by="ENTREZID")
ggplot(toPlot,aes(x=mean,y=log2FC))+geom_point()+theme_bw()+ylim(-3,3)+ggtitle("Liver_Minus_Heart")+geom_text(label=toPlot$SYMBOL)

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_text).

Summarizing Data in Bioconductor exercises

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/Bioconductor_Introduction/

Summarizing data