My Exercise 4

These exercises are about consensus peaks and differentials.

Exercise 1 - Consensus ATAC Peaks

Lets read in the ATAC peaks from MACS. They are contained in data/atac_peaks.

library(rtracklayer)
peakFiles <- dir("data/atac_peaks",pattern="narrowPeak",
                 full.names = TRUE)
peakFiles

## [1] "data/atac_peaks/D0_ATAC_rep1_macs_peaks.narrowPeak"
## [2] "data/atac_peaks/D0_ATAC_rep2_macs_peaks.narrowPeak"
## [3] "data/atac_peaks/W6_ATAC_rep1_macs_peaks.narrowPeak"
## [4] "data/atac_peaks/W6_ATAC_rep2_macs_peaks.narrowPeak"

macsPeaks_list <- lapply(peakFiles, rtracklayer::import)

Compare W6 peaks for each replicate. How many peaks are unique to each replicate and how many are in common? Export the unique peaks.

macsPeaks_GRL <- GRangesList(macsPeaks_list)
names(macsPeaks_GRL) <- c("D0_rep1","D0_rep2","W6_rep1","W6_rep2")

W6_rep1_unique.Peaks <- macsPeaks_GRL[[3]][!macsPeaks_GRL[[3]] %over% macsPeaks_GRL[[4]]]

W6_rep2_unique.Peaks <- macsPeaks_GRL[[4]][!macsPeaks_GRL[[4]] %over% macsPeaks_GRL[[3]]]

my_tots <- c(length(W6_rep1_unique.Peaks),
  length(W6_rep2_unique.Peaks),
  length(macsPeaks_GRL[[3]]) - length(W6_rep1_unique.Peaks))
names(my_tots) <- c("Rep1_Unique", "Rep2_Unique", "Common")
my_tots

## Rep1_Unique Rep2_Unique      Common 
##       35572       14103       53001

rtracklayer::export(W6_rep1_unique.Peaks, "ATAC_W6_rep1_unique.Peaks.bed")

rtracklayer::export(W6_rep2_unique.Peaks, "ATAC_W6_rep2_unique.Peaks.bed")

Build a nonredundant peak set across all 4 samples.

allPeaksSet_Overlapping <- unlist(macsPeaks_GRL)
allPeaksSet_nR <- reduce(allPeaksSet_Overlapping)
allPeaksSet_nR

## GRanges object with 130138 ranges and 0 metadata columns:
##                  seqnames          ranges strand
##                     <Rle>       <IRanges>  <Rle>
##        [1]           chr1 3398990-3399195      *
##        [2]           chr1 3426076-3426243      *
##        [3]           chr1 3455860-3455997      *
##        [4]           chr1 3583575-3583721      *
##        [5]           chr1 3611242-3611457      *
##        ...            ...             ...    ...
##   [130134] chrUn_GL456368       1054-1363      *
##   [130135] chrUn_GL456368     16472-16664      *
##   [130136] chrUn_GL456367       8397-8711      *
##   [130137] chrUn_GL456367     11398-12675      *
##   [130138] chrUn_GL456370     18753-19115      *
##   -------
##   seqinfo: 39 sequences from an unspecified genome; no seqlengths

Add metadata to our nonredundant peak set that contains the presence/absence of peaks from each sample. Summarize this with a venn diagram.

overlap <- list()
for(i in 1:length(macsPeaks_GRL)){
  overlap[[i]] <- allPeaksSet_nR %over% macsPeaks_GRL[[i]]
}

overlapMatrix <- do.call(cbind,overlap)
colnames(overlapMatrix) <- names(macsPeaks_GRL)

mcols(allPeaksSet_nR) <- overlapMatrix
allPeaksSet_nR[1:3,]

## GRanges object with 3 ranges and 4 metadata columns:
##       seqnames          ranges strand |   D0_rep1   D0_rep2   W6_rep1   W6_rep2
##          <Rle>       <IRanges>  <Rle> | <logical> <logical> <logical> <logical>
##   [1]     chr1 3398990-3399195      * |      TRUE     FALSE     FALSE     FALSE
##   [2]     chr1 3426076-3426243      * |     FALSE     FALSE     FALSE      TRUE
##   [3]     chr1 3455860-3455997      * |     FALSE      TRUE     FALSE     FALSE
##   -------
##   seqinfo: 39 sequences from an unspecified genome; no seqlengths

library(limma)
vennDiagram(mcols(allPeaksSet_nR))

Make high confidence consensus peaks. Export the result.

HC_Peaks <- allPeaksSet_nR[
  rowSums(as.data.frame(
    mcols(allPeaksSet_nR)[,c("D0_rep1","D0_rep2")])) >= 2 |
  rowSums(as.data.frame(
    mcols(allPeaksSet_nR)[,c("W6_rep1","W6_rep2")])) >= 2  
  ]
export.bed(HC_Peaks,"ATAC_HC_Peaks.bed")

Exercise 2 - Generate counts

Count the reads that overlap with the consensus peaks. Save the counts object. [NOTE: this can take some time. If you want to skip ahead to differentials, we have provided a counts object]

You can grab the bam files from DropBox

library(Rsamtools)

bams <- c("ATAC_D0_rep1.bam",
          "ATAC_D0_rep2.bam",
          "ATAC_W6_rep1.bam",
          "ATAC_W6_rep2.bam")

bamFL <- BamFileList(bams,yieldSize = 5000000)
bamFL

HC_Peaks <- rtracklayer::import("data/ATAC_HC_Peaks.bed")
library(GenomicAlignments)
MyCounts <- summarizeOverlaps(HC_Peaks,
                              reads = bamFL,
                              ignore.strand = TRUE)
save(MyCounts,file="ATAC_Counts.RData")

Exercise 3 - Differentials with DESeq2

Load in the counts object. If you did not complete the counts section you can just load it from the data directory: data/ATAC_Counts.RData.

load("data/ATAC_Counts.RData")

class(MyCounts)

## [1] "RangedSummarizedExperiment"
## attr(,"package")
## [1] "SummarizedExperiment"

MyCounts

## class: RangedSummarizedExperiment 
## dim: 57433 4 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): name score
## colnames(4): D0_ATAC_rep1_sorted.bam D0_ATAC_rep2_sorted.bam
##   W6_ATAC_rep1_sorted.bam W6_ATAC_rep2_sorted.bam
## colData names(0):

Generate metadata data frame

metaDataFrame <- data.frame(DevStage=c("D0","D0","W6","W6"))
rownames(metaDataFrame) <- colnames(MyCounts)
metaDataFrame

##                         DevStage
## D0_ATAC_rep1_sorted.bam       D0
## D0_ATAC_rep2_sorted.bam       D0
## W6_ATAC_rep1_sorted.bam       W6
## W6_ATAC_rep2_sorted.bam       W6

Run DESeq2 and get results GRange. Order this object by padj value.

library(DESeq2)
HC_Peaks <- rtracklayer::import("data/ATAC_HC_Peaks.bed")
dds <- DESeqDataSetFromMatrix(countData = assay(MyCounts),
                              colData = metaDataFrame,
                              design = ~DevStage,
                              rowRanges= HC_Peaks)
dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

W6MinusD0 <- results(dds,
                        contrast = c("DevStage","W6","D0"),
                        format="GRanges")
W6MinusD0 <- W6MinusD0[order(W6MinusD0$padj),]
W6MinusD0

## GRanges object with 57433 ranges and 6 metadata columns:
##           seqnames              ranges strand |  baseMean log2FoldChange
##              <Rle>           <IRanges>  <Rle> | <numeric>      <numeric>
##       [1]     chr4   97465816-97466559      * |   624.313        5.14453
##       [2]     chr3 154696026-154696867      * |   506.128        5.83443
##       [3]    chr19   21882105-21883278      * |   641.165        3.87490
##       [4]     chr1 177385925-177386486      * |   390.054        5.04621
##       [5]    chr16   76851339-76852034      * |   356.948        6.03987
##       ...      ...                 ...    ... .       ...            ...
##   [57429]     chr2   76057296-76057422      * |   61.8615    2.28036e-04
##   [57430]     chr8 126685200-126685821      * |  892.5875   -6.53726e-05
##   [57431]     chr7   73595414-73595544      * |   80.2626    8.32260e-05
##   [57432]     chr6   57839762-57840115      * |   70.6482   -5.62486e-05
##   [57433]    chr11 117999734-117999888      * |   54.4867   -1.81022e-05
##               lfcSE         stat       pvalue        padj
##           <numeric>    <numeric>    <numeric>   <numeric>
##       [1]  0.237520      21.6593 4.96320e-104 2.85052e-99
##       [2]  0.307708      18.9609  3.58843e-80 1.03047e-75
##       [3]  0.211921      18.2847  1.09614e-74 2.09849e-70
##       [4]  0.285292      17.6879  5.20147e-70 7.46840e-66
##       [5]  0.352288      17.1447  6.88455e-66 7.90801e-62
##       ...       ...          ...          ...         ...
##   [57429]  0.548252  4.15932e-04     0.999668    0.999738
##   [57430]  0.172098 -3.79856e-04     0.999697    0.999749
##   [57431]  0.470401  1.76926e-04     0.999859    0.999894
##   [57432]  0.493346 -1.14014e-04     0.999909    0.999926
##   [57433]  0.560431 -3.23005e-05     0.999974    0.999974
##   -------
##   seqinfo: 34 sequences from an unspecified genome; no seqlengths

Export the results to a dataframe.

rio::export(as.data.frame(W6MinusD0), "data/ATAC_W6MinusD0.xlsx")

Plot a PCA of our samples

myrlog <- rlog(dds)
plotPCA(myrlog, intgroup="DevStage")

## using ntop=500 top features by variance

Make a volcano plot

library(EnhancedVolcano)
W6MinusD0.Filt <- W6MinusD0[!is.na(W6MinusD0$pvalue) | !is.na(W6MinusD0$padj)]
EnhancedVolcano(as.data.frame(W6MinusD0.Filt),
                lab=paste0(seqnames(W6MinusD0.Filt), ranges(W6MinusD0.Filt)),
                x = "log2FoldChange",
                y="padj", selectLab = "")

My Exercise 4

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/RU_course_template/