These exercises are about motifs.

Exercise 1 - Motif Databases

library(MotifDb)
SMAD3Motifs <- query(MotifDb,"SMAD3")
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): input string 6084 is invalid
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): input string 6140 is invalid
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): input string 6281 is invalid
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): input string 6598 is invalid
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): input string 6622 is invalid
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
## Warning in grep(queryString, mcols(object)[, colname], ignore.case =
## ignore.case): unable to translate 'Three-zinc finger Kr<c3><bc>ppel-related
## factors' to a wide string
length(SMAD3Motifs )
## [1] 14
SMAD3Motifs[[1]]
##          1        2        3         4         5         6          7
## A 0.282316 0.341654 0.399523 0.3650856 0.2433208 0.0298729 0.00494519
## C 0.413385 0.135011 0.173081 0.1963598 0.1778388 0.8084791 0.81546493
## G 0.114930 0.267943 0.168845 0.2118558 0.2242058 0.0363090 0.04063490
## T 0.189369 0.255392 0.258551 0.2266988 0.3546346 0.1253390 0.13895499
##             8          9          10          11         12       13       14
## A 0.934557140 0.00509888 0.969303806 0.008189473 0.66957954 0.130511 0.198982
## C 0.058944109 0.00399771 0.020594196 0.984816325 0.01218231 0.171241 0.289660
## G 0.005027481 0.98614801 0.001929150 0.004198781 0.28215323 0.334041 0.252392
## T 0.001471270 0.00475540 0.008172848 0.002795421 0.03608493 0.364207 0.258966
##          15       16       17
## A 0.4231716 0.213519 0.371956
## C 0.1909778 0.287388 0.285061
## G 0.1644108 0.294414 0.150191
## T 0.2214398 0.204679 0.192792
library(JASPAR2024)
library(RSQLite)
library(TFBSTools)

JASPAR2024 <- JASPAR2024()
sq24 <- RSQLite::dbConnect(RSQLite::SQLite(), db(JASPAR2024))
SMAD3mat <- getMatrixByName(sq24,"SMAD3")
SMAD3mat
## An object of class PFMatrix
## ID: MA0795.1
## Name: SMAD3
## Matrix Class: SMAD/NF-1 DNA-binding domain factors
## strand: +
## Tags: 
## $comment
## [1] "Data is from Taipale HTSELEX DBD (2013)"
## 
## $family
## [1] "SMAD factors"
## 
## $medline
## [1] "12582250"
## 
## $remap_tf_name
## [1] "SMAD3"
## 
## $source
## [1] "23332764"
## 
## $tax_group
## [1] "vertebrates"
## 
## $type
## [1] "HT-SELEX"
## 
## $unibind
## [1] "1"
## 
## $collection
## [1] "CORE"
## 
## $species
##           9606 
## "Homo sapiens" 
## 
## $acc
## [1] "P84022"
## 
## Background: 
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## Matrix: 
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## A  554   20  103    9    0 3491    0 3491   20  3491
## C 1829   16   29 3491   46   30   10  199 3491   434
## G   87 3491   83   11   31   22 3491    5   10   316
## T 1662   18 3491    6 3491    0    3   56   20   758

Exercise 2 - Visualizing motifs

seqLogo::seqLogo(SMAD3Motifs[[1]],ic.scale = FALSE)

seqLogo::seqLogo(SMAD3Motifs[[1]])

- Lets compare this to the motif from JASPAR2024. Are these equivalent motifs?

myMatrix <- Matrix(SMAD3mat)
ppm <-myMatrix/colSums(myMatrix)
seqLogo::seqLogo(ppm)
## Warning in makePWM(pwm): Columns of PWM must add up to 1.0

Exercise 3 - Motif enrichment analysis

library(Herper)
install_CondaTools("meme","motif_analysis", channels="bioconda", pathToMiniConda = "miniconda")
library(universalmotif)

opts <- list()
opts[["tax_group"]] <- "vertebrates"
opts[["collection"]] <- "CORE"
opts[["all_versions"]] <- FALSE

jaspar <- JASPAR2024()
sq24 <- RSQLite::dbConnect(RSQLite::SQLite(), db(jaspar))

motifs <- getMatrixSet(sq24, opts)
motifs_um <- convert_motifs(motifs)
class(motifs_um)
## [1] "list"
class(motifs_um[[1]])
## [1] "universalmotif"
## attr(,"package")
## [1] "universalmotif"
library(rio)
## 
## Attaching package: 'rio'
## The following object is masked from 'package:MotifDb':
## 
##     export
## The following object is masked from 'package:BSgenome':
## 
##     export
## The following objects are masked from 'package:rtracklayer':
## 
##     export, import
## The following objects are masked from 'package:BiocIO':
## 
##     export, import
library(Biostrings)
library(BSgenome.Mmusculus.UCSC.mm10)

dge <- rio::import("data/ATAC_W6MinusD0.xlsx")
dge_up <- dge[dge$padj < 0.05 & dge$log2FoldChange > 0,]
dge_up_gr <- GRanges(dge_up$seqnames, IRanges(dge_up$start, dge_up$end))



peaksSequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10, dge_up_gr)
names(peaksSequences) <- paste0("peak_", 1:length(dge_up_gr))

HC_peaks <- rtracklayer::import("data/ATAC_HC_Peaks.bed")
background_peaksSequences <- getSeq(BSgenome.Mmusculus.UCSC.mm10, HC_peaks)
names(background_peaksSequences) <- paste0("peak_", 1:length(HC_peaks))
library(memes)

meme_path_id="miniconda/envs/motif_analysis/bin"
ame_results <- runAme(peaksSequences,
                          control = background_peaksSequences,
                          outdir = "ATAC_ame_diff",
                          meme_path = meme_path_id,
                          database = motifs_um)
load(file = "data/ATAC_ame_results.RData")
ame_results %>% 
  dplyr::filter(rank %in% 1:10) %>% 
  plot_ame_heatmap()  +
    ggtitle("Top 10 AME Hits")

top_res_motif <- getMatrixByID(sq24,ame_results$motif_alt_id[1])
top_res_motif_mat <- Matrix(top_res_motif)
ggseqlogo(top_res_motif_mat)+theme_minimal()

top_res_motif <- getMatrixByID(sq24,ame_results$motif_alt_id[2])
top_res_motif_mat <- Matrix(top_res_motif)
ggseqlogo(top_res_motif_mat)+theme_minimal()

top_res_motif <- getMatrixByID(sq24,ame_results$motif_alt_id[3])
top_res_motif_mat <- Matrix(top_res_motif)
ggseqlogo(top_res_motif_mat)+theme_minimal()

Exercise 4 - De novo motif analysis

dreme_results <- runDreme(peaksSequences,
                          control = background_peaksSequences,
                          outdir = "ATAC_dreme",
                          meme_path = meme_path_id)
load(file = "data/ATAC_dreme_results.RData")
dreme_results[2:4,] %>% 
  to_list() %>% 
  view_motifs()

tomtom_results <- runTomTom(dreme_results[2:nrow(dreme_results),],
            meme_path = meme_path_id,
            database = motifs_um,
            outdir = "ATAC_tomtom_from_dreme"
            )
load(file = "data/ATAC_tomtom_results.RData")
view_motifs(tomtom_results$best_match_motif[1:3])

tomtom_results[1,] %>% 
  view_tomtom_hits(top_n = 5)
## $m02_AKTCA

tomtom_results[2,] %>% 
  view_tomtom_hits(top_n = 5)
## $m39_AAT

tomtom_results[3,] %>% 
  view_tomtom_hits(top_n = 5)
## $m32_CAG

Exercise 5 - Finding Motifs

library(motifmatchr)
opts <- list()
opts[["tax_group"]] <- "vertebrates"
opts[["collection"]] <- "CORE"
opts[["all_versions"]] <- FALSE

jaspar <- JASPAR2024()
sq24 <- RSQLite::dbConnect(RSQLite::SQLite(), db(jaspar))
motifs <- getMatrixSet(sq24, opts)

motifs_subset <- motifs[names(motifs) %in% ame_results$motif_alt_id[1]]

motifHits <- matchMotifs(motifs, peaksSequences, out="matches")

motifHits
## class: SummarizedExperiment 
## dim: 12987 879 
## metadata(0):
## assays(1): motifMatches
## rownames: NULL
## rowData names(0):
## colnames(879): MA0004.1 MA0069.1 ... MA1602.2 MA1722.2
## colData names(1): name
mmMatrix <- motifMatches(motifHits)
mat_sums <- colSums(mmMatrix)
hist(mat_sums)
mat_sums[which.max(mat_sums)]

# meis1 = MA0498.3
UpinW6_peaksWithSox9 <- dge_up_gr[mmMatrix[,"MA0077.2"] == 1]
UpinW6_peaksWithSox9
motif_positions <- matchMotifs(motifs_subset, peaksSequences, out="positions")

motif_positions <- motif_positions$MA1152.2

names(motif_positions) <- names(peaksSequences)
my_motifs <- unlist(motif_positions, use.names = TRUE)
my_motifs