These exercises are about the factors and data frames sections of Introduction to R.

Exercise 1 - Factors

# Cell-type labels stored as a factor; with no explicit levels argument the
# levels are the sorted unique labels.
CellType <- factor(
  c("DC1", "DC1", "DC1", "NK", "NK", "Mono", "Mono", "DC2", "NK")
)
CellType
## [1] DC1  DC1  DC1  NK   NK   Mono Mono DC2  NK  
## Levels: DC1 DC2 Mono NK
# "Neu" has to exist as a level before it can be stored in the factor, so
# extend the level set first (existing levels keep their order), then
# overwrite the third observation.
CellType <- factor(CellType, levels = c(levels(CellType), "Neu"))
CellType <- replace(CellType, 3, "Neu")
CellType
## [1] DC1  DC1  Neu  NK   NK   Mono Mono DC2  NK  
## Levels: DC1 DC2 Mono NK Neu
# Declare the full set of cell types up front, including types that do not
# occur in the data yet (Neu, Bcell, Tcell).
CellType2 <- factor(
  c("DC1", "DC1", "DC1", "NK", "NK", "Mono", "Mono", "DC2", "NK"),
  levels = c("DC1", "DC2", "Mono", "NK", "Neu", "Bcell", "Tcell")
)
CellType2
## [1] DC1  DC1  DC1  NK   NK   Mono Mono DC2  NK  
## Levels: DC1 DC2 Mono NK Neu Bcell Tcell
# Append four more observations. Rebuilding from the character labels with the
# existing level set gives the same factor as c(factor, factor) but does not
# depend on c() returning a factor (which only holds on R >= 4.1.0), and it
# avoids repeating the level list.
CellType2 <- factor(
  c(as.character(CellType2), "Neu", "Neu", "Bcell", "DC1"),
  levels = levels(CellType2)
)
CellType2
##  [1] DC1   DC1   DC1   NK    NK    Mono  Mono  DC2   NK    Neu   Neu   Bcell
## [13] DC1  
## Levels: DC1 DC2 Mono NK Neu Bcell Tcell
summary(CellType2)
##   DC1   DC2  Mono    NK   Neu Bcell Tcell 
##     4     1     2     3     2     1     0
# Reorder the levels. Rebuilding with factor(levels = ) changes only the level
# order and leaves every observation's label intact. Assigning to levels()
# instead would relabel the existing codes position by position (every DC1
# would become Bcell, every NK would become Neu, ...), corrupting the data.
CellType2 <- factor(
  CellType2,
  levels = c("Bcell", "DC1", "DC2", "Mono", "Neu", "NK", "Tcell")
)
summary(CellType2)
## Bcell   DC1   DC2  Mono   Neu    NK Tcell 
##     1     4     1     2     2     3     0
# Ordered factor: the levels argument fixes the ranking low < mid < high, which
# makes comparison operators meaningful. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
Height <- factor(
  c("high", "low", "mid", "low", "mid", "low", "mid", "high", "mid", "high"),
  levels = c("low", "mid", "high"),
  ordered = TRUE
)
Height
##  [1] high low  mid  low  mid  low  mid  high mid  high
## Levels: low < mid < high
# Keep every observation ranked strictly above "low"; the unused "low" level
# is retained in the result.
filteredHeight <- Height[Height > "low"]
filteredHeight
## [1] high mid  mid  mid  high mid  high
## Levels: low < mid < high
# Rebuild with an extra top level, set the last observation to it, then keep
# everything ranked above "mid".
newFactor <- factor(
  Height,
  levels = c("low", "mid", "high", "veryHigh"),
  ordered = TRUE
)
newFactor[length(newFactor)] <- "veryHigh"
newFactor[newFactor > "mid"]
## [1] high     high     veryHigh
## Levels: low < mid < high < veryHigh

Exercise 2 - Data frames

# Gene annotation table: one row per gene, holding its name, Ensembl ID,
# pathway and gene length.
Annotation <- data.frame(
  geneNames = c("Gene_1", "Gene_2", "Gene_3", "Gene_4", "Gene_5"),
  ensembl = c("Ens001", "Ens003", "Ens006", "Ens007", "Ens010"),
  pathway = c("Glycolysis", "TGFb", "Glycolysis", "TGFb", "Glycolysis"),
  geneLengths = c(100, 3000, 200, 1000, 1200)
)
Annotation
##   geneNames ensembl    pathway geneLengths
## 1    Gene_1  Ens001 Glycolysis         100
## 2    Gene_2  Ens003       TGFb        3000
## 3    Gene_3  Ens006 Glycolysis         200
## 4    Gene_4  Ens007       TGFb        1000
## 5    Gene_5  Ens010 Glycolysis        1200
# Rows whose gene length lies strictly between 500 and 2000.
Annotation[with(Annotation, geneLengths > 500 & geneLengths < 2000), ]
##   geneNames ensembl    pathway geneLengths
## 4    Gene_4  Ens007       TGFb        1000
## 5    Gene_5  Ens010 Glycolysis        1200
# Class of each column, in column order.
class(Annotation$geneNames)
## [1] "character"
class(Annotation$ensembl)
## [1] "character"
class(Annotation$pathway)
## [1] "character"
class(Annotation$geneLengths)
## [1] "numeric"
# Convert the pathway column (the third column) from character to factor; the
# printed table looks the same, only the column class changes.
Annotation$pathway <- factor(Annotation$pathway)
Annotation
##   geneNames ensembl    pathway geneLengths
## 1    Gene_1  Ens001 Glycolysis         100
## 2    Gene_2  Ens003       TGFb        3000
## 3    Gene_3  Ens006 Glycolysis         200
## 4    Gene_4  Ens007       TGFb        1000
## 5    Gene_5  Ens010 Glycolysis        1200
class(Annotation$pathway)
## [1] "factor"
# Expression values for the first sample, keyed by Ensembl ID (four genes).
Sample1 <- data.frame(
  ensembl = c("Ens001", "Ens003", "Ens006", "Ens010"),
  expression = c(1000, 3000, 10000, 5000)
)
Sample1
##   ensembl expression
## 1  Ens001       1000
## 2  Ens003       3000
## 3  Ens006      10000
## 4  Ens010       5000
# Expression values for the second sample; it also contains Ens007, which the
# first sample lacks.
Sample2 <- data.frame(
  ensembl = c("Ens001", "Ens003", "Ens006", "Ens007", "Ens010"),
  expression = c(1500, 1500, 17000, 500, 10000)
)
Sample2
##   ensembl expression
## 1  Ens001       1500
## 2  Ens003       1500
## 3  Ens006      17000
## 4  Ens007        500
## 5  Ens010      10000
# Attach the Sample1 expression values to the annotation. The join key is the
# shared "ensembl" column, named explicitly so the merge does not depend on
# column positions. all = FALSE keeps only genes present in both tables.
AnnoSample1 <- merge(Annotation, Sample1, by = "ensembl", all = FALSE)
# Add Sample2. Both inputs carry an "expression" column, so merge() suffixes
# them: expression.x comes from Sample1 and expression.y from Sample2.
AnnoSample1And2 <- merge(AnnoSample1, Sample2, by = "ensembl", all = FALSE)
AnnoSample1And2
##   ensembl geneNames    pathway geneLengths expression.x expression.y
## 1  Ens001    Gene_1 Glycolysis         100         1000         1500
## 2  Ens003    Gene_2       TGFb        3000         3000         1500
## 3  Ens006    Gene_3 Glycolysis         200        10000        17000
## 4  Ens010    Gene_5 Glycolysis        1200         5000        10000
# Sort rows from the longest gene to the shortest.
AnnoSample1And2 <- AnnoSample1And2[
  order(AnnoSample1And2$geneLengths, decreasing = TRUE),
]
AnnoSample1And2
##   ensembl geneNames    pathway geneLengths expression.x expression.y
## 2  Ens003    Gene_2       TGFb        3000         3000         1500
## 4  Ens010    Gene_5 Glycolysis        1200         5000        10000
## 3  Ens006    Gene_3 Glycolysis         200        10000        17000
## 1  Ens001    Gene_1 Glycolysis         100         1000         1500
# Expression divided by gene length, one column per sample ("lne" presumably
# stands for length-normalised expression).
AnnoSample1And2$Sample1_lne <-
  AnnoSample1And2$expression.x / AnnoSample1And2$geneLengths
AnnoSample1And2$Sample2_lne <-
  AnnoSample1And2$expression.y / AnnoSample1And2$geneLengths
AnnoSample1And2
##   ensembl geneNames    pathway geneLengths expression.x expression.y
## 2  Ens003    Gene_2       TGFb        3000         3000         1500
## 4  Ens010    Gene_5 Glycolysis        1200         5000        10000
## 3  Ens006    Gene_3 Glycolysis         200        10000        17000
## 1  Ens001    Gene_1 Glycolysis         100         1000         1500
##   Sample1_lne Sample2_lne
## 2    1.000000    0.500000
## 4    4.166667    8.333333
## 3   50.000000   85.000000
## 1   10.000000   15.000000
# Use the Ensembl IDs as row names so rows can be looked up by ID.
rownames(AnnoSample1And2) <- AnnoSample1And2$ensembl
# Mean of the two per-length expression values for Ens006.
mean(c(
  AnnoSample1And2["Ens006", "Sample1_lne"],
  AnnoSample1And2["Ens006", "Sample2_lne"]
))
## [1] 67.5
# log2 fold change of Sample2 relative to Sample1, labelled by gene name.
log2FoldChange <-
  log2(AnnoSample1And2$Sample2_lne) - log2(AnnoSample1And2$Sample1_lne)
names(log2FoldChange) <- AnnoSample1And2$geneNames
log2FoldChange
##     Gene_2     Gene_5     Gene_3     Gene_1 
## -1.0000000  1.0000000  0.7655347  0.5849625
# Total length of the Glycolysis genes that remain after the merges.
sum(AnnoSample1And2[AnnoSample1And2$pathway == "Glycolysis", "geneLengths"])
## [1] 1500