In these exercises will gain some experience working with the BSgenome packages.
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg19))
library(BSgenome.Hsapiens.UCSC.hg19)
length(seqnames(BSgenome.Hsapiens.UCSC.hg19))
## [1] 298
<- seqlengths(BSgenome.Hsapiens.UCSC.hg19)
contigLengths sum(contigLengths[order(contigLengths,decreasing = FALSE)][1:3])
## [1] 35839
<-alphabetFrequency(BSgenome.Hsapiens.UCSC.hg19$chr20)
alpFreq "N"] alpFreq[
## N
## 3520000
library(ggplot2)
<-alphabetFrequency(BSgenome.Hsapiens.UCSC.hg19$chr20)
alpFreq <- alpFreq[c("A","T","C","G")]
atcgFreq <- data.frame(Bases=names(atcgFreq),Frequency=atcgFreq)
atcgFreqDF ggplot(atcgFreqDF,aes(y=Frequency,x=Bases,fill=Bases))+geom_bar(stat = "identity")+theme_minimal()
<- BSgenome.Hsapiens.UCSC.hg19$chr20[1000000:1000020]
mySeq complement(mySeq)
## 21-letter DNAString object
## seq: CACCCTCTCTTGACCTTGTTC
<- DNAStringSet(complement(mySeq))
mySet names(mySet) <- "testFasta"
writeXStringSet(mySet,filepath ="data/testFasta.fa")
:128748315-128748869 chr8
<- BSgenome.Hsapiens.UCSC.hg19$chr8[128748315:128748869] mySeq
Compare the sequence to that found in IGV and identify start of translated region in gene
Count the number of classical start codons (ATG) in the first exon.
countPattern("ATG",mySeq)
## [1] 0
<- BSgenome.Hsapiens.UCSC.hg19$chr12[6643976:6644027]
gapdhSeq countPattern("ATG",gapdhSeq)
## [1] 1