In these exercises will gain some experience working with the Gene models and annnotation using the TxDb, OrgDb and GenomicFeatures packages.
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
<- length(genes(TxDb.Mmusculus.UCSC.mm10.knownGene)) genelen
## 66 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
<- length(transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene)) txLen
<- genes(TxDb.Mmusculus.UCSC.mm10.knownGene) allGenes
## 66 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
<- as.data.frame(allGenes)
geneTable <- geneTable[geneTable$seqnames %in% c("chr1","chr2","chr3"),]
filtGeneTable library(ggplot2)
ggplot(filtGeneTable,aes(x=width,fill=seqnames))+geom_density()+scale_x_log10()+facet_grid(seqnames~.)+theme_minimal()+scale_fill_discrete(name="Chromosome")
<- transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene)
allTranscripts <- as.data.frame(allTranscripts)
transcriptTable <- transcriptTable[transcriptTable$seqnames %in% c("chr1"),]
filtTranscriptTable <- transcriptLengths(TxDb.Mmusculus.UCSC.mm10.knownGene)
txLengths <- merge(filtTranscriptTable,txLengths,all=FALSE,by="tx_name")
toPlotTxLen ggplot(toPlotTxLen,aes(x=width,y=tx_len,size=nexon))+geom_point()+scale_x_log10()+scale_y_log10()+theme_minimal()
<- transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene)
allTX <- GRanges("chr1",IRanges(3083473,3876510))
myRanges <- allTX[allTX %over% myRanges]
filtTX <- filtTX[order(width(filtTX),decreasing = TRUE),]
orderedFiltTx <- orderedFiltTx[1]
longestTX
library(BSgenome.Mmusculus.UCSC.mm10)
<- extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10,TxDb.Mmusculus.UCSC.mm10.knownGene)
allSeqs <- allSeqs[longestTX$tx_id]
filtSeq <-alphabetFrequency(filtSeq)
alpFreq <- alpFreq[,c("A","T","C","G")]
atcgFreq <- data.frame(Bases=names(atcgFreq),Frequency=atcgFreq)
atcgFreqDF
ggplot(atcgFreqDF,aes(y=Frequency,x=Bases,fill=Bases))+geom_bar(stat = "identity")+theme_minimal()
<- read.delim("data/GM12878_minus_HeLa.csv",sep=",",stringsAsFactors = FALSE) myDETable
library(org.Hs.eg.db)
<- select(org.Hs.eg.db,keys=as.character(myDETable$ID),keytype = "ENTREZID",columns=c("ENTREZID","GENENAME")) myAnnotationTable
## 'select()' returned 1:1 mapping between keys and columns
<- merge(myAnnotationTable,myDETable,by=1,all.x=FALSE,all.y=TRUE)
newTable <- newTable[order(newTable$padj),] newTable
<- newTable$ENTREZID[newTable$padj < 0.05 & !is.na(newTable$padj) & newTable$log2FoldChange > 1]
genesToSelect library(TxDb.Hsapiens.UCSC.hg19.knownGene)
<- promoters(TxDb.Hsapiens.UCSC.hg19.knownGene,upstream = 2000,downstream = 50,columns=c("gene_id"))
myPromoters <- myPromoters[as.vector(myPromoters$gene_id) %in% genesToSelect]
selectedPromoters <- reduce(selectedPromoters)
reducedselectedPromoters export.bed(reducedselectedPromoters,con="DE_Promoters.bed")