Introduction to R - Abridged, Session 2

http://rockefelleruniversity.github.io/RU_introtoR_abridged/ · working directory

# --- Working directory ------------------------------------------------------
# Show the current working directory and list the files it contains.
getwd()
dir()
# Move into the course folder so the relative "data/..." paths below resolve.
# NOTE(review): this path is machine-specific; adjust it to wherever the
# course material was unpacked.
setwd("~/Downloads/RockefellerUniversity-Intro_To_R/r_course")

# --- Reading a comma-separated table ----------------------------------------
# header = TRUE takes the column names from the first line of the file.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
Table <- read.table("data/readThisTable.csv", sep = ",", header = TRUE)
Table[1:4, 1:3]
##   Gene_Name Sample_1.hi Sample_2.hi
## 1    Gene_a    4.570237    3.230467
## 2    Gene_b    3.561733    3.632285
## 3    Gene_c    3.797274    2.874462
## 4    Gene_d    3.398242    4.415202
# row.names = 1 promotes the first column (Gene_Name) to row names, so it no
# longer appears as a data column.
Table <- read.table("data/readThisTable.csv", sep = ",", header = TRUE,
                    row.names = 1)
Table[1:4, 1:3]
##        Sample_1.hi Sample_2.hi Sample_3.hi
## Gene_a    4.570237    3.230467    3.351827
## Gene_b    3.561733    3.632285    3.587523
## Gene_c    3.797274    2.874462    4.016916
## Gene_d    3.398242    4.415202    4.893561
# The same call reads from a URL instead of a local path.
URL <- "http://rockefelleruniversity.github.io/readThisTable.csv"
Table <- read.table(URL, sep = ",", header = TRUE)
Table[1:2, 1:3]
##   Gene_Name Sample_1.hi Sample_2.hi
## 1    Gene_a    4.111851    3.837018
## 2    Gene_b    6.047822    5.683518

# --- Writing a table --------------------------------------------------------
# Default call: row names are written alongside the data.
write.table(Table, file = "data/writeThisTable.csv", sep = ",")
# Same file again, overwriting the previous one: row names are dropped and
# the column header is kept.
write.table(Table, file = "data/writeThisTable.csv", sep = ",",
            row.names = FALSE, col.names = TRUE)
# First rows of the table; six is the default row count for head().
head(Table, n = 6L)
##   Gene_Name Sample_1.hi Sample_2.hi Sample_3.hi Sample_4.low Sample_5.low
## 1    Gene_a    4.111851    3.837018    4.360628     3.752517     4.368069
## 2    Gene_b    6.047822    5.683518    4.315889     3.381136     3.630273
## 3    Gene_c    2.597068    3.316300    3.681509     4.886520     4.318289
## 4    Gene_d    6.009197    5.927419    2.244701     6.574108     8.288831
## 5    Gene_e   10.152509   10.218200   10.004835     2.251603     1.805168
## 6    Gene_f   11.107868    9.592153   10.263975     3.567560     2.496475
##   Sample_1.low
## 1     3.421009
## 2     5.560802
## 3     5.097783
## 4     6.857291
## 5     2.396295
## 6     3.587755
# Last rows of the table; tail() also defaults to six rows.
tail(Table, n = 6L)
##   Gene_Name Sample_1.hi Sample_2.hi Sample_3.hi Sample_4.low Sample_5.low
## 3    Gene_c    2.597068    3.316300    3.681509     4.886520     4.318289
## 4    Gene_d    6.009197    5.927419    2.244701     6.574108     8.288831
## 5    Gene_e   10.152509   10.218200   10.004835     2.251603     1.805168
## 6    Gene_f   11.107868    9.592153   10.263975     3.567560     2.496475
## 7    Gene_g    8.705787    8.949422    9.226990    10.051516     7.841664
## 8    Gene_h    9.239039    9.839734   10.027812    11.084444     9.316200
##   Sample_1.low
## 3     5.097783
## 4     6.857291
## 5     2.396295
## 6     3.587755
## 7     9.649869
## 8     8.742943
# Ask for a specific number of rows instead of the default.
head(Table, n = 3)
##   Gene_Name Sample_1.hi Sample_2.hi Sample_3.hi Sample_4.low Sample_5.low
## 1    Gene_a    4.111851    3.837018    4.360628     3.752517     4.368069
## 2    Gene_b    6.047822    5.683518    4.315889     3.381136     3.630273
## 3    Gene_c    2.597068    3.316300    3.681509     4.886520     4.318289
##   Sample_1.low
## 1     3.421009
## 2     5.560802
## 3     5.097783
# --- The rio package --------------------------------------------------------
# Install rio only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("rio", quietly = TRUE)) {
  install.packages("rio")
}
library("rio")
# import() is given only the path; no separator or header arguments are needed.
Table <- import("data/readThisTable.csv")
Table[1:2, ]
##   Gene_Name Sample_1.hi Sample_2.hi Sample_3.hi Sample_4.low Sample_5.low
## 1    Gene_a    4.570237    3.230467    3.351827     3.930877     4.098247
## 2    Gene_b    3.561733    3.632285    3.587523     4.185287     1.380976
##   Sample_1.low
## 1     4.418726
## 2     5.936990
# For a multi-sheet workbook, `which` selects the sheet, either by position...
Table <- import("data/readThisXLS.xls",
                which = 2)
# ...or by sheet name.
Table <- import("data/readThisXLS.xls",
                which = "Metadata")
Table[1:2, ]
##       Patient Condition   Treatment
## 1 Sample_1.hi         A           X
## 2 Sample_2.hi         A NoTreatment
# Table now holds the Metadata sheet (Patient/Condition/Treatment), which has
# no ExpressionScores column, so `Table$ExpressionScores` would be NULL. Read
# the expression data from the workbook instead.
# NOTE(review): assumes the workbook has a sheet named "ExpressionScores" --
# confirm the sheet name against the file.
ExpressionScores <- import("data/readThisXLS.xls",
                           which = "ExpressionScores")
# export() is given only the data and a target path ending in .xlsx.
export(ExpressionScores, file = "data/writeThisXLSX.xlsx")
# --- Ordering ---------------------------------------------------------------
# Tab-separated input with a header line. TRUE is spelled out because T is an
# ordinary, reassignable variable.
my_df <- read.table("data/categoriesAndExpression.txt", sep = "\t",
                    header = TRUE)
head(my_df)
##   geneName ofInterest    pathway Expression
## 1    Gene1   Selected Glycolysis   20.09519
## 2    Gene2   Selected Glycolysis   23.00306
## 3    Gene3   Selected Glycolysis   20.99712
## 4    Gene4   Selected Glycolysis   43.01145
## 5    Gene5   Selected Glycolysis   22.00567
## 6    Gene6   Selected Glycolysis   20.99162
# order() returns the row positions that would sort column 4 (Expression)
# from smallest to largest; it does not return the sorted values themselves.
order(my_df[, 4])
##   [1]  28  50  49  60  43  29  15   1  37  31  52  34   6   3   9  22  33   5
##  [19]  54  27   8  14  59  48  36  21  30   2  42  51  45  56  39  35  24  11
##  [37]   7  25  12  40  46  57  10  44  55  38  23  19  32   4  53  79  64  96
##  [55]  90  73  98  65  91  80  74  97  83  85  68  77  94  62  70  87  69  17
##  [73]  86  71  88  16  63  89  78  72  95  99  92  66  81  75  82  84  67  18
##  [91]  93 100  61  76  20  58  41  47  26  13
# Using those positions as a row index rearranges the whole data frame.
my_df_ordered <- my_df[order(my_df[, 4]), ]

head(my_df_ordered)
##    geneName  ofInterest    pathway Expression
## 28   Gene28 NotSelected Glycolysis   19.94369
## 50   Gene50 NotSelected Glycolysis   19.95572
## 49   Gene49 NotSelected Glycolysis   19.95703
## 60   Gene60 NotSelected Glycolysis   19.97635
## 43   Gene43 NotSelected Glycolysis   19.98250
## 29   Gene29 NotSelected Glycolysis   20.02165
# decreasing = TRUE reverses the ordering: largest Expression first.
my_df_ordered <- my_df[order(my_df[, 4], decreasing = TRUE), ]

head(my_df_ordered)
##    geneName  ofInterest    pathway Expression
## 13   Gene13    Selected Glycolysis   74.08310
## 26   Gene26 NotSelected Glycolysis   73.98877
## 47   Gene47 NotSelected Glycolysis   73.96610
## 41   Gene41 NotSelected Glycolysis   73.96022
## 58   Gene58 NotSelected Glycolysis   73.94659
## 20   Gene20    Selected       TGFb   66.09706
# Element-wise comparison: one TRUE/FALSE per row, TRUE where Expression
# exceeds 70.
my_df_ordered[["Expression"]] > 70
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE
# Keep the mask in a variable and use it as the row selector.
idx <- my_df_ordered[["Expression"]] > 70
my_df_ordered[idx, , drop = FALSE]
##    geneName  ofInterest    pathway Expression
## 13   Gene13    Selected Glycolysis   74.08310
## 26   Gene26 NotSelected Glycolysis   73.98877
## 47   Gene47 NotSelected Glycolysis   73.96610
## 41   Gene41 NotSelected Glycolysis   73.96022
## 58   Gene58 NotSelected Glycolysis   73.94659
# Two conditions joined with the element-wise `&`: a row is kept only when
# both hold. with() lets the column names be written without the `$` prefix.
with(
  my_df_ordered,
  my_df_ordered[Expression > 60 & pathway == "TGFb", ]
)
##     geneName  ofInterest pathway Expression
## 20    Gene20    Selected    TGFb   66.09706
## 76    Gene76 NotSelected    TGFb   63.08147
## 61    Gene61 NotSelected    TGFb   63.04337
## 100  Gene100 NotSelected    TGFb   62.93485
## 93    Gene93 NotSelected    TGFb   62.93153
# Genes to look up.
my_favorite_genes <- c(
  "Gene1",
  "Gene10",
  "Gene15"
)
# %in% yields one TRUE/FALSE per element on its left-hand side: TRUE when that
# gene name occurs anywhere in my_favorite_genes.
logical_index <- my_df[["geneName"]] %in% my_favorite_genes
logical_index
##   [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [13] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE
# Select the matching rows with the mask.
my_df[logical_index, , drop = FALSE]
##    geneName ofInterest    pathway Expression
## 1     Gene1   Selected Glycolysis   20.09519
## 10   Gene10   Selected Glycolysis   34.91377
## 15   Gene15   Selected Glycolysis   20.06247
# --- Merging ----------------------------------------------------------------
# Second table: gene lengths, tab-separated with a header line.
my_df2 <- read.table("data/gene_lengths.txt", sep = "\t", header = TRUE)

nrow(my_df2)
## [1] 15
head(my_df2)
##    Gene Length
## 1 Gene1   1788
## 2 Gene3    213
## 3 Gene5    529
## 4 Gene7    234
## 5 Gene8   1638
## 6 Gene9    917
nrow(my_df)
## [1] 100
# Join the two tables on the gene identifier, which is called geneName in
# my_df and Gene in my_df2. Both key columns are named explicitly: an unnamed
# third argument would be matched to `by` and only reach by.y through its
# default. all = FALSE keeps only genes present in both tables.
merge_df <- merge(my_df, my_df2, by.x = "geneName", by.y = "Gene",
                  all = FALSE)
merge_df
##    geneName ofInterest    pathway Expression Length
## 1     Gene1   Selected Glycolysis   20.09519   1788
## 2    Gene10   Selected Glycolysis   34.91377   1882
## 3    Gene12   Selected Glycolysis   27.01314    501
## 4    Gene13   Selected Glycolysis   74.08310   1045
## 5    Gene15   Selected Glycolysis   20.06247   1869
## 6    Gene16   Selected       TGFb   56.03506    851
## 7    Gene17   Selected       TGFb   54.00140   1807
## 8    Gene18   Selected       TGFb   59.04783    600
## 9    Gene19   Selected       TGFb   42.91023   1889
## 10   Gene20   Selected       TGFb   66.09706    992
## 11    Gene3   Selected Glycolysis   20.99712    213
## 12    Gene5   Selected Glycolysis   22.00567    529
## 13    Gene7   Selected Glycolysis   26.07826    234
## 14    Gene8   Selected Glycolysis   22.92961   1638
## 15    Gene9   Selected Glycolysis   21.02250    917
# --- Conditions -------------------------------------------------------------
# A comparison on a vector gives a logical vector, used here as an index.
x <- 1:10
x[x < 4]
## [1] 1 2 3
# if() runs its body only when the (single) condition is TRUE.
x <- 10
y <- 4
if (x > y) {
  message("The value of x is ", x, " which is greater than ", y)
}
## The value of x is 10 which is greater than 4
# With y raised above x the condition is FALSE, so nothing is printed.
y <- 20
if (x > y) {
  message("The value of x is ", x, " which is greater than ", y)
}
# else supplies the branch taken when the condition is FALSE.
x <- 3
if (x < 5) {
  message(x, " is less than 5")
} else {
  message(x, " is greater than or equal to 5")
}
## 3 is less than 5
x <- 10
if (x < 5) {
  message(x, " is less than 5")
} else {
  message(x, " is greater than or equal to 5")
}
## 10 is greater than or equal to 5
# else if chains a second test that is only evaluated when the first fails.
x <- 5
if (x > 5) {
  message(x, " is greater than 5")
} else if (x == 5) {
  message(x, " is 5")
} else {
  message(x, " is less than 5")
}
## 5 is 5
# Integers one through ten, printed for reference.
x <- seq_len(10)
x
##  [1]  1  2  3  4  5  6  7  8  9 10
# ifelse() evaluates the test for every element and picks the `yes` or `no`
# value accordingly, returning a vector as long as the test.
ifelse(test = x <= 3, yes = "lessOrEqual", no = "more")
##  [1] "lessOrEqual" "lessOrEqual" "lessOrEqual" "more"        "more"       
##  [6] "more"        "more"        "more"        "more"        "more"
# Nesting a second ifelse() in the `no` slot gives a three-way split.
ifelse(
  test = x == 3,
  yes = "same",
  no = ifelse(test = x < 3, yes = "less", no = "more")
)
##  [1] "less" "less" "same" "more" "more" "more" "more" "more" "more" "more"
# --- For loops --------------------------------------------------------------
# The loop variable takes each element of x in turn. appendLF = FALSE stops
# message() from ending the line, so the values print side by side.
x <- 1:5
for (i in x) {
  message(i, " ", appendLF = FALSE)
}
## 1 2 3 4 5
# The same loop works over a character vector.
x <- toupper(letters[1:5])
for (i in x) {
  message(i, " ", appendLF = FALSE)
}
## A B C D E
# Looping over positions lets one index address two parallel vectors.
geneName <- c("Ikzf1", "Myc", "Igll1")
expression <- c(10.4, 4.3, 6.5)
# seq_along() gives the positions 1..length; unlike 1:length(x) it yields an
# empty sequence (rather than c(1, 0)) when the vector is empty.
seq_along(geneName)
## [1] 1 2 3
for (i in seq_along(geneName)) {
  message(geneName[i], " has an RPKM of ", expression[i])
}
## Ikzf1 has an RPKM of 10.4
## Myc has an RPKM of 4.3
## Igll1 has an RPKM of 6.5
# --- Loops and conditionals -------------------------------------------------
# Values to classify relative to 10.
x <- 1:13

# Iterate over x itself so the loop follows the vector defined above instead
# of repeating its bounds.
for (i in x) {
  if (i > 10) {
    message("Number ", i, " is greater than 10")
  } else if (i == 10) {
    message("Number ", i, " is  10")
  } else {
    message("Number ", i, " is less than 10")
  }
}
## Number 1 is less than 10
## Number 2 is less than 10
## Number 3 is less than 10
## Number 4 is less than 10
## Number 5 is less than 10
## Number 6 is less than 10
## Number 7 is less than 10
## Number 8 is less than 10
## Number 9 is less than 10
## Number 10 is  10
## Number 11 is greater than 10
## Number 12 is greater than 10
## Number 13 is greater than 10
# --- sapply() / lapply() ----------------------------------------------------
# sapply() calls the function once per element; extra arguments (here na.rm)
# are passed on to every call. TRUE is spelled out because T is reassignable.
exampleVector <- c(1, 2, 3, 4, 5)
exampleList <- list(1, 2, 3, 4, 5)
sapply(exampleVector, mean, na.rm = TRUE)
## [1] 1 2 3 4 5
# A list input is handled the same way, one call per list element.
sapply(exampleList, mean, na.rm = TRUE)
## [1] 1 2 3 4 5
# A named list of three integer vectors.
exampleList <- list(row1 = 1:5,
                    row2 = 6:10,
                    row3 = 11:15)
exampleList
## $row1
## [1] 1 2 3 4 5
## 
## $row2
## [1]  6  7  8  9 10
## 
## $row3
## [1] 11 12 13 14 15
# lapply() always returns a list, one entry per input element.
lapply(exampleList, quantile)
## $row1
##   0%  25%  50%  75% 100% 
##    1    2    3    4    5 
## 
## $row2
##   0%  25%  50%  75% 100% 
##    6    7    8    9   10 
## 
## $row3
##   0%  25%  50%  75% 100% 
##   11   12   13   14   15
# sapply() on the same input simplifies the equal-length results into a
# matrix with one column per list element.
sapply(exampleList, quantile)
##      row1 row2 row3
## 0%      1    6   11
## 25%     2    7   12
## 50%     3    8   13
## 75%     4    9   14
## 100%    5   10   15
# When the per-element results differ in shape (a data frame summary and a
# vector summary) sapply() cannot simplify and hands back a list instead.
exampleList <- list(df = data.frame(sample = paste0("patient", 1:2),
                                    data = c(1, 12)),
                    vec = c(1, 3, 4, 5))
sapply(exampleList, summary)
## $df
##     sample               data      
##  Length:2           Min.   : 1.00  
##  Class :character   1st Qu.: 3.75  
##  Mode  :character   Median : 6.50  
##                     Mean   : 6.50  
##                     3rd Qu.: 9.25  
##                     Max.   :12.00  
## 
## $vec
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.50    3.50    3.25    4.25    5.00
# --- Base plotting ----------------------------------------------------------
# Columns 4 and 5 (Expression and Length) are both numeric, so plotting the
# two-column data frame draws one against the other.
plot(merge_df[, c(4, 5)])
# Column 3 (pathway) is still a character vector at this point, so plot()
# cannot draw it and stops with the error recorded below. try() keeps that
# expected failure from aborting the rest of the script when it is sourced.
try(plot(merge_df[, 3]))
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Error in plot.window(...) : need finite 'ylim' values
# Converting the column to a factor gives plot() categories it can draw.
merge_df[, 3] <- factor(merge_df[, 3])
plot(merge_df[, 3])
# Formula interface: Expression split by pathway, columns looked up in merge_df.
boxplot(Expression ~ pathway, merge_df)

# --- Beyond base plots ------------------------------------------------------
library(ggplot2)

# One violin per pathway with the individual genes jittered on top; width
# limits the horizontal spread of the jitter.
ggplot(merge_df, aes(x = pathway, y = Expression, fill = pathway)) +
  geom_violin() +
  geom_jitter(width = 0.1) +
  theme_linedraw() +
  ggtitle("Gene Expression in Glycolysis and TGFb pathways")

Gene_Name	Sample_1.hi	Sample_2.hi	Sample_3.hi
Gene_a	3.246166	2.870964	3.572678
Gene_b	2.759227	2.734914	4.042756
Gene_c	4.423293	2.349322	3.940265
Gene_d	3.657753	4.797010	4.606233
Gene_e	10.413732	11.695097	10.084415
Gene_f	9.402434	9.680749	10.386515
Gene_g	9.276792	10.523490	8.947378
Gene_h	10.805977	10.766365	9.230518

Introduction to R - Abridged, Session 2

Rockefeller University, Bioinformatics Resource Centre

http://rockefelleruniversity.github.io/RU_introtoR_abridged/

Introduction to R (part 2)

Overview

Reading and Writing Data

Data from external sources

First we need a file to read in

Data from text file with read.table()

Row names in read.table()

Data from other sources

Writing data to file

Reviewing your data

Reviewing your data

The rio (R io) package

The rio package

The rio package

The rio package

The rio package

The rio package

Ordering, selecting and merging

Working with your data

Let's get some input data

Ordering

Ordering

Ordering

Subsetting

Logical operators

Logical and indexing

Combining logical vectors

The %in% operator

The %in% operator

Merging

Merging data frames

Conditions and Loops

Conditions and Loops

Conditional branching

else following an if

else if

ifelse()

ifelse()

Loops

For loops

Looping through indices

Loops and conditionals

Functions to loop over data types

sapply()

sapply() example

sapply() example 2

sapply() example 3

Plotting

Base plotting

Base plotting

Base plotting

Base plotting

Beyond Base plots

Time for an exercise!

Answers to exercise

What we didn’t cover

Getting help