ATACseq in Bioconductor exercises (part 2)

In todays session we will work with some of the ATAC-seq data of T-regulatory cells from Christina Leslie’s lab.

ATAC-seq analysis

Plot the nucleosome free abd mono-nucleosome signals over mouse genome mm10 TSS regions from the T-reg ATAC-seq data for chromosomes chr1 to chr19.

library(soGGi)
library(ggplot2)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
genesLocations <- genes(TxDb.Mmusculus.UCSC.mm10.knownGene)
tssLocations <- resize(genesLocations,fix="start",width = 1)
tssLocations <- tssLocations[seqnames(tssLocations) %in% paste0("chr",1:19)]
 
sortedBAM <- "~/Downloads/ENCFF053CGD.bam"
indexBam(sortedBAM)

nucFree <- regionPlot(bamFile = sortedBAM,
                        testRanges = tssLocations,
                        style = "point",
                        format="bam",
                        paired=TRUE,
                        minFragmentLength = 0,
                        maxFragmentLength = 100,
                        forceFragment = 50)

monoNuc <- regionPlot(bamFile = sortedBAM,
                        testRanges = tssLocations,
                        style = "point",
                        format="bam",
                        paired=TRUE,
                        minFragmentLength = 180,
                      maxFragmentLength = 240,
                      forceFragment = 80)
plotRegion(nucFree)+theme_bw()
plotRegion(monoNuc)+theme_bw()

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

Extract the FRIP and Duplication rate for the Encode T-reg ATAC-seq BAM and associated peaks for chromosome 10.

library(ChIPQC)
library(rtracklayer)
openRegionPeaks <- "~/Downloads/ENCFF520GRD.bed.gz"

qcRes <- ChIPQCsample(sortedBAM,
                      peaks = openRegionPeaks,
                      annotation ="mm10",
                      chromosomes = "chr10",
                      verboseT = FALSE)


myMetrics <- QCmetrics(qcRes)
myMetrics[c("RiP%")]
flgCounts <- flagtagcounts(qcRes)
DupRate <- flgCounts["DuplicateByChIPQC"]/flgCounts["Mapped"]
DupRate*100

load(file="data/chipqc_Treg.RData")
myMetrics <- QCmetrics(qcRes)
myMetrics[c("RiP%")]

## RiP% 
## 43.9

flgCounts <- flagtagcounts(qcRes)
DupRate <- flgCounts["DuplicateByChIPQC"]/flgCounts["Mapped"]
DupRate*100

## DuplicateByChIPQC 
##          6.743219

For the Treg data produce a plot of transposase cut-sites for nucleosome free fragments around CTCF motifs on chromosome 18. Use the motif “Hsapiens-JASPAR_2014-CTCF-MA0139.1” from MotifDB package.

sortedBAM <- "~/Downloads/ENCFF053CGD.bam"
chrLens <- seqlengths(BamFile(sortedBAM))
param=ScanBamParam(flag=scanBamFlag(isProperPair =TRUE), 
                    what=c("qname","mapq","isize"), 
                   which=GRanges("chr18", IRanges(1,chrLens["chr18"])))
atacReads <- readGAlignmentPairs(sortedBAM,param = param)
insertSizes <- abs(elementMetadata(GenomicAlignments::first(atacReads))$isize)
atacReads_Open <- atacReads[insertSizes < 100]
read1 <- GenomicAlignments::first(atacReads_Open)
read2 <- second(atacReads_Open)
Firsts <- resize(granges(read1),fix="start",1)
First_Pos_toCut <- shift(granges(Firsts[strand(read1) == "+"]),
                                         4)
First_Neg_toCut <- shift(granges(Firsts[strand(read1) == "-"]),
                                         -5)

Seconds <- resize(granges(read2),fix="start",1)
Second_Pos_toCut <- shift(granges(Seconds[strand(read2) == "+"]),
                                4)
Second_Neg_toCut <- shift(granges(Seconds[strand(read2) == "-"]),
                                -5)

test_toCut <- c(First_Pos_toCut,First_Neg_toCut,
                Second_Pos_toCut,Second_Neg_toCut)
cutsCoverage <- coverage(test_toCut)
cutsCoverage18 <- cutsCoverage["chr18"]

library(MotifDb)
library(Biostrings)
library(BSgenome.Mmusculus.UCSC.mm10)
CTCF <- query(MotifDb, c("CTCF"))
ctcfMotif <- CTCF[["Hsapiens-JASPAR_2014-CTCF-MA0139.1"]]
myRes <- matchPWM(ctcfMotif,BSgenome.Mmusculus.UCSC.mm10[["chr18"]])
toCompare <- GRanges("chr18",ranges(myRes))
CTCF_Cuts_open <- regionPlot(cutsCoverage18,
                         testRanges = toCompare,
                         style = "point",
                         format="rlelist",distanceAround = 500)

plotRegion(CTCF_Cuts_open)+theme_bw()

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

Load the counts for ENCODE Kidney, Liver and Hindbrain. Identify ATACseq sites significantly higher in Liver compared to Hindbrain (padj < 0.05 and padj is not NA) and perform a functional enrichment test using RGreat to identify BioCyc Pathways enriched in these sites.

Counts are in *data/myCounts.RData**

library(DESeq2)
load("data/myCounts.RData")

Group <- factor(c("HindBrain","HindBrain","Kidney","Kidney",
                  "Liver","Liver"))
metaData <- data.frame(Group,row.names=colnames(myCounts))

atacDDS <- DESeqDataSetFromMatrix(assay(myCounts),
                                  metaData,
                                  ~Group,
                                  rowRanges=rowRanges(myCounts))
atacDDS <- DESeq(atacDDS)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.

## final dispersion estimates

## fitting model and testing

LiverMinusHindbrain <- results(atacDDS,
                                c("Group","Liver","HindBrain"),
                               format="GRanges")
LiverMinusHindbrain <- LiverMinusHindbrain[
                                order(LiverMinusHindbrain$pvalue)
                         ]
LiverMinusHindbrain <- LiverMinusHindbrain[
                          (!is.na(LiverMinusHindbrain$padj) 
                           & LiverMinusHindbrain$padj < 0.05)
                          & LiverMinusHindbrain$log2FoldChange > 0]

library(rGREAT)
great_Job <- submitGreatJob(LiverMinusHindbrain, species = "mm10",request_interval=1,version = "3.0.0")

## Warning in submitGreatJob(LiverMinusHindbrain, species = "mm10", request_interval = 1, : GREAT gives a warning:
## Your set hits a large fraction of the genes in the genome, which often
## does not work well with the GREAT Significant by Both view due to a
## saturation of the gene-based hypergeometric test.
## 
## See our tips for handling large datasets or try the Significant By
## Region-based Binomial view.

availableCategories(great_Job)

## [1] "GO"                               "Phenotype Data and Human Disease"
## [3] "Pathway Data"                     "Gene Expression"                 
## [5] "Regulatory Motifs"                "Gene Families"

great_ResultTable = getEnrichmentTables(great_Job, category = "Pathway Data")

## The default enrichment tables contain no associated genes for the input
## regions. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the
## following link:
## 
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport

names(great_ResultTable)

## [1] "PANTHER Pathway" "BioCyc Pathway"  "MSigDB Pathway"

great_ResultTable[["BioCyc Pathway"]][1:4, ]

##             ID
## 1     PWY-5920
## 2     PWY-5189
## 3 PWY3DJ-11470
## 4     PWY-6358
##                                                                     name
## 1                                                   heme biosynthesis II
## 2                                           tetrapyrrole biosynthesis II
## 3                     sphingosine and sphingosine-1-phosphate metabolism
## 4 superpathway of D-<i>myo</i>-inositol (1,4,5)-trisphosphate metabolism
##   Binom_Genome_Fraction Binom_Expected Binom_Observed_Region_Hits
## 1          2.378908e-04      2.3282380                         26
## 2          9.254901e-05      0.9057772                         14
## 3          5.385445e-04      5.2707350                         25
## 4          4.999543e-04      4.8930530                         23
##   Binom_Fold_Enrichment Binom_Region_Set_Coverage Binom_Raw_PValue
## 1             11.167240               0.002656585     8.974517e-19
## 2             15.456340               0.001430469     1.224402e-12
## 3              4.743171               0.002554409     4.530668e-10
## 4              4.700542               0.002350056     2.592819e-09
##   Binom_Adjp_BH Hyper_Total_Genes Hyper_Expected Hyper_Observed_Gene_Hits
## 1  2.898769e-16                 9       3.137796                        8
## 2  1.977409e-10                 5       1.743220                        4
## 3  4.878019e-08                 9       3.137796                        5
## 4  2.093701e-07                11       3.835084                        8
##   Hyper_Fold_Enrichment Hyper_Gene_Set_Coverage Hyper_Term_Gene_Coverage
## 1              2.549561            0.0010476690                0.8888889
## 2              2.294605            0.0005238345                0.8000000
## 3              1.593475            0.0006547931                0.5555556
## 4              2.086004            0.0010476690                0.7272727
##   Hyper_Raw_PValue Hyper_Adjp_BH
## 1      0.001353018     0.2185124
## 2      0.053250040     0.6307870
## 3      0.169392700     0.8255783
## 4      0.011911010     0.2993989

ATACseq in Bioconductor exercises (part 2)

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/RU_ATACseq/

ATAC-seq analysis