In todays session we will work with some of the ATAC-seq data of T-regulatory cells from Christina Leslie’s lab.
Aligned data as a BAM file can be found here.
The peak calls for ATAC-seq data can be found here
library(soGGi)
library(ggplot2)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
<- genes(TxDb.Mmusculus.UCSC.mm10.knownGene)
genesLocations <- resize(genesLocations,fix="start",width = 1)
tssLocations <- tssLocations[seqnames(tssLocations) %in% paste0("chr",1:19)]
tssLocations
<- "~/Downloads/ENCFF053CGD.bam"
sortedBAM indexBam(sortedBAM)
<- regionPlot(bamFile = sortedBAM,
nucFree testRanges = tssLocations,
style = "point",
format="bam",
paired=TRUE,
minFragmentLength = 0,
maxFragmentLength = 100,
forceFragment = 50)
<- regionPlot(bamFile = sortedBAM,
monoNuc testRanges = tssLocations,
style = "point",
format="bam",
paired=TRUE,
minFragmentLength = 180,
maxFragmentLength = 240,
forceFragment = 80)
plotRegion(nucFree)+theme_bw()
plotRegion(monoNuc)+theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
library(ChIPQC)
library(rtracklayer)
<- "~/Downloads/ENCFF520GRD.bed.gz"
openRegionPeaks
<- ChIPQCsample(sortedBAM,
qcRes peaks = openRegionPeaks,
annotation ="mm10",
chromosomes = "chr10",
verboseT = FALSE)
<- QCmetrics(qcRes)
myMetrics c("RiP%")]
myMetrics[<- flagtagcounts(qcRes)
flgCounts <- flgCounts["DuplicateByChIPQC"]/flgCounts["Mapped"]
DupRate *100 DupRate
load(file="data/chipqc_Treg.RData")
<- QCmetrics(qcRes)
myMetrics c("RiP%")] myMetrics[
## RiP%
## 43.9
<- flagtagcounts(qcRes)
flgCounts <- flgCounts["DuplicateByChIPQC"]/flgCounts["Mapped"]
DupRate *100 DupRate
## DuplicateByChIPQC
## 6.743219
<- "~/Downloads/ENCFF053CGD.bam"
sortedBAM <- seqlengths(BamFile(sortedBAM))
chrLens =ScanBamParam(flag=scanBamFlag(isProperPair =TRUE),
paramwhat=c("qname","mapq","isize"),
which=GRanges("chr18", IRanges(1,chrLens["chr18"])))
<- readGAlignmentPairs(sortedBAM,param = param)
atacReads <- abs(elementMetadata(GenomicAlignments::first(atacReads))$isize)
insertSizes <- atacReads[insertSizes < 100]
atacReads_Open <- GenomicAlignments::first(atacReads_Open)
read1 <- second(atacReads_Open)
read2 <- resize(granges(read1),fix="start",1)
Firsts <- shift(granges(Firsts[strand(read1) == "+"]),
First_Pos_toCut 4)
<- shift(granges(Firsts[strand(read1) == "-"]),
First_Neg_toCut -5)
<- resize(granges(read2),fix="start",1)
Seconds <- shift(granges(Seconds[strand(read2) == "+"]),
Second_Pos_toCut 4)
<- shift(granges(Seconds[strand(read2) == "-"]),
Second_Neg_toCut -5)
<- c(First_Pos_toCut,First_Neg_toCut,
test_toCut
Second_Pos_toCut,Second_Neg_toCut)<- coverage(test_toCut)
cutsCoverage <- cutsCoverage["chr18"]
cutsCoverage18
library(MotifDb)
library(Biostrings)
library(BSgenome.Mmusculus.UCSC.mm10)
<- query(MotifDb, c("CTCF"))
CTCF <- CTCF[["Hsapiens-JASPAR_2014-CTCF-MA0139.1"]]
ctcfMotif <- matchPWM(ctcfMotif,BSgenome.Mmusculus.UCSC.mm10[["chr18"]])
myRes <- GRanges("chr18",ranges(myRes))
toCompare <- regionPlot(cutsCoverage18,
CTCF_Cuts_open testRanges = toCompare,
style = "point",
format="rlelist",distanceAround = 500)
plotRegion(CTCF_Cuts_open)+theme_bw()
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
Counts are in *data/myCounts.RData**
library(DESeq2)
load("data/myCounts.RData")
<- factor(c("HindBrain","HindBrain","Kidney","Kidney",
Group "Liver","Liver"))
<- data.frame(Group,row.names=colnames(myCounts))
metaData
<- DESeqDataSetFromMatrix(assay(myCounts),
atacDDS
metaData,~Group,
rowRanges=rowRanges(myCounts))
<- DESeq(atacDDS) atacDDS
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
<- results(atacDDS,
LiverMinusHindbrain c("Group","Liver","HindBrain"),
format="GRanges")
<- LiverMinusHindbrain[
LiverMinusHindbrain order(LiverMinusHindbrain$pvalue)
]<- LiverMinusHindbrain[
LiverMinusHindbrain !is.na(LiverMinusHindbrain$padj)
(& LiverMinusHindbrain$padj < 0.05)
& LiverMinusHindbrain$log2FoldChange > 0]
library(rGREAT)
<- submitGreatJob(LiverMinusHindbrain, species = "mm10",request_interval=1,version = "3.0.0") great_Job
## Warning in submitGreatJob(LiverMinusHindbrain, species = "mm10", request_interval = 1, : GREAT gives a warning:
## Your set hits a large fraction of the genes in the genome, which often
## does not work well with the GREAT Significant by Both view due to a
## saturation of the gene-based hypergeometric test.
##
## See our tips for handling large datasets or try the Significant By
## Region-based Binomial view.
availableCategories(great_Job)
## [1] "GO" "Phenotype Data and Human Disease"
## [3] "Pathway Data" "Gene Expression"
## [5] "Regulatory Motifs" "Gene Families"
= getEnrichmentTables(great_Job, category = "Pathway Data") great_ResultTable
## The default enrichment tables contain no associated genes for the input
## regions. You can set `download_by = 'tsv'` to download the complete
## table, but note only the top 500 regions can be retreived. See the
## following link:
##
## https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport
names(great_ResultTable)
## [1] "PANTHER Pathway" "BioCyc Pathway" "MSigDB Pathway"
"BioCyc Pathway"]][1:4, ] great_ResultTable[[
## ID
## 1 PWY-5920
## 2 PWY-5189
## 3 PWY3DJ-11470
## 4 PWY-6358
## name
## 1 heme biosynthesis II
## 2 tetrapyrrole biosynthesis II
## 3 sphingosine and sphingosine-1-phosphate metabolism
## 4 superpathway of D-<i>myo</i>-inositol (1,4,5)-trisphosphate metabolism
## Binom_Genome_Fraction Binom_Expected Binom_Observed_Region_Hits
## 1 2.378908e-04 2.3282380 26
## 2 9.254901e-05 0.9057772 14
## 3 5.385445e-04 5.2707350 25
## 4 4.999543e-04 4.8930530 23
## Binom_Fold_Enrichment Binom_Region_Set_Coverage Binom_Raw_PValue
## 1 11.167240 0.002656585 8.974517e-19
## 2 15.456340 0.001430469 1.224402e-12
## 3 4.743171 0.002554409 4.530668e-10
## 4 4.700542 0.002350056 2.592819e-09
## Binom_Adjp_BH Hyper_Total_Genes Hyper_Expected Hyper_Observed_Gene_Hits
## 1 2.898769e-16 9 3.137796 8
## 2 1.977409e-10 5 1.743220 4
## 3 4.878019e-08 9 3.137796 5
## 4 2.093701e-07 11 3.835084 8
## Hyper_Fold_Enrichment Hyper_Gene_Set_Coverage Hyper_Term_Gene_Coverage
## 1 2.549561 0.0010476690 0.8888889
## 2 2.294605 0.0005238345 0.8000000
## 3 1.593475 0.0006547931 0.5555556
## 4 2.086004 0.0010476690 0.7272727
## Hyper_Raw_PValue Hyper_Adjp_BH
## 1 0.001353018 0.2185124
## 2 0.053250040 0.6307870
## 3 0.169392700 0.8255783
## 4 0.011911010 0.2993989