RU_tidyverse

These exercises cover the sections of the tidyverse session.

Load in the tidyverse. Read in the fish ecology data from the CSV file exercises_fish.csv, within the data directory. Then read the file in a second time, setting the column data types to reflect what they really are, i.e. 2 factors, an integer and a double. Save it as a variable called fish_df.

library(tidyverse)

# First pass: import the fish data and let readr guess each column's type.
read_csv(file = "data/exercises_fish.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   salmon_id = col_double(),
##   `common_name-size` = col_character(),
##   variable = col_character(),
##   value = col_double()
## )

## # A tibble: 194 x 4
##    salmon_id `common_name-size`                  variable   value
##        <dbl> <chr>                               <chr>      <dbl>
##  1     35032 Chinook salmon - yearling           length_mm  147  
##  2     35032 Chinook salmon - yearling           IGF1_ng_ml  41.3
##  3     35033 Chinook salmon - mixed age juvenile length_mm  444  
##  4     35033 Chinook salmon - mixed age juvenile IGF1_ng_ml  62.1
##  5     35034 Sockeye salmon - juvenile           length_mm  139  
##  6     35034 Sockeye salmon - juvenile           IGF1_ng_ml  NA  
##  7     35035 Sockeye salmon - juvenile           length_mm  121  
##  8     35035 Sockeye salmon - juvenile           IGF1_ng_ml  NA  
##  9     35036 Sockeye salmon - juvenile           length_mm  112  
## 10     35036 Sockeye salmon - juvenile           IGF1_ng_ml  NA  
## # … with 184 more rows

# Second pass: import the same file with explicit column types - an integer
# ID, two factors and a double - and keep the result as fish_df.
fish_df <- read_csv(
  file = "data/exercises_fish.csv",
  col_types = cols(
    salmon_id = col_integer(),
    `common_name-size` = col_factor(),
    variable = col_factor(),
    value = col_double()
  )
)

# Print the tibble to check the parsed column types.
fish_df

## # A tibble: 194 x 4
##    salmon_id `common_name-size`                  variable   value
##        <int> <fct>                               <fct>      <dbl>
##  1     35032 Chinook salmon - yearling           length_mm  147  
##  2     35032 Chinook salmon - yearling           IGF1_ng_ml  41.3
##  3     35033 Chinook salmon - mixed age juvenile length_mm  444  
##  4     35033 Chinook salmon - mixed age juvenile IGF1_ng_ml  62.1
##  5     35034 Sockeye salmon - juvenile           length_mm  139  
##  6     35034 Sockeye salmon - juvenile           IGF1_ng_ml  NA  
##  7     35035 Sockeye salmon - juvenile           length_mm  121  
##  8     35035 Sockeye salmon - juvenile           IGF1_ng_ml  NA  
##  9     35036 Sockeye salmon - juvenile           length_mm  112  
## 10     35036 Sockeye salmon - juvenile           IGF1_ng_ml  NA  
## # … with 184 more rows

Tidy fish_df using pipes to link everything together. Update fish_df with this tidied dataframe.

# Tidy fish_df in one pipeline and store the result back in fish_df:
#   1. split the combined `common_name-size` column on " - " into
#      `common_name` and `age`;
#   2. spread the measurement names held in `variable` into their own columns.
# Plain assignment with `%>%` is used rather than `%<>%`, because `%<>%` is
# only available when magrittr itself is attached; library(tidyverse) alone
# does not provide it.
fish_df <- fish_df %>%
  separate(
    `common_name-size`,
    into = c("common_name", "age"),
    sep = " - ",
    remove = TRUE
  ) %>%
  pivot_wider(names_from = variable, values_from = value)

From your tidied fish_df create a dataframe with the variables age and IGF for only the Steelhead fish

# Keep only the Steelhead rows, then retain just age and the IGF1 measurement.
fish_df %>%
  filter(common_name == "Steelhead") %>%
  select(age, IGF1_ng_ml)

## # A tibble: 38 x 2
##    age      IGF1_ng_ml
##    <chr>         <dbl>
##  1 juvenile      42.7 
##  2 juvenile      NA   
##  3 juvenile       9.92
##  4 juvenile      38.1 
##  5 juvenile      46.4 
##  6 juvenile      42.6 
##  7 juvenile      23.6 
##  8 juvenile       4.46
##  9 juvenile      NA   
## 10 juvenile      24.2 
## # … with 28 more rows

From your tidied fish_df create a dataframe with all the variables except the IGF values, for all fish that begin with S. Arrange the dataframe based on common_name and length (largest first).

# Drop the IGF1 column, keep fish whose common name starts with "S", and sort
# by common name, then by length with the largest first.
fish_df %>%
  select(-IGF1_ng_ml) %>%
  filter(str_starts(common_name, "S")) %>%
  arrange(common_name, desc(length_mm))

## # A tibble: 49 x 4
##    salmon_id common_name    age      length_mm
##        <int> <chr>          <chr>        <dbl>
##  1     35144 Sockeye salmon juvenile       140
##  2     35034 Sockeye salmon juvenile       139
##  3     35119 Sockeye salmon juvenile       122
##  4     35035 Sockeye salmon juvenile       121
##  5     35100 Sockeye salmon juvenile       118
##  6     35096 Sockeye salmon juvenile       115
##  7     35147 Sockeye salmon juvenile       115
##  8     35036 Sockeye salmon juvenile       112
##  9     35098 Sockeye salmon juvenile       112
## 10     35099 Sockeye salmon juvenile       111
## # … with 39 more rows

From your tidied fish_df perform these steps using a pipe:

Group by the variable age.
Filter to get the biggest 5 by the variable length in each group.
Summarize this data frame over the variable length by calculating the mean.

# For each age group, keep the rows whose length ranks in the top five
# (rank() is applied to the negated length so the largest fish rank first),
# then average the length of the retained rows.
fish_df %>%
  group_by(age) %>%
  filter(rank(-length_mm) <= 5) %>%
  # TRUE is spelled out: the shorthand T is an ordinary variable that can be
  # reassigned.
  summarise(mean_length_mm = mean(length_mm, na.rm = TRUE))

## # A tibble: 4 x 2
##   age                mean_length_mm
##   <chr>                       <dbl>
## 1 juvenile                     274 
## 2 mixed age juvenile           380.
## 3 subyearling                   90 
## 4 yearling                     213

From your tidied fish_df perform these steps using a pipe:

Filter to only look at yearling in variable age
Group by common_name
Create a new variable that is the z-score of length within each group (HINT: The scale() function can be used to calculate z-scores)
Create boxplot of grouped length z-scores

# Z-score the yearling lengths within each common_name group and compare the
# resulting distributions with a boxplot.
fish_df %>%
  filter(age == "yearling") %>%
  group_by(common_name) %>%
  # scale() returns a one-column matrix; as.vector() stores the z-scores as a
  # plain numeric column instead of a matrix column.
  mutate(length_zscore = as.vector(scale(length_mm))) %>%
  ungroup() %>%
  ggplot(aes(x = common_name, y = length_zscore)) +
  # NOTE(review): the two hard-coded fills assume exactly two common_name
  # groups among the yearlings - confirm against the data.
  geom_boxplot(fill = c("gold", "darkorange"))

Read in the count data tidy_counts.csv from the data directory. Try setting different data types i.e. factors and logicals. What happens when we extract the vector of these counts? Data types are coerced if the wrong data types are specified.

# Import the count data while forcing some columns to other types (two as
# character, Rep as logical) to see how readr handles the mismatch.
tidy_counts <- read_csv(
  file = "data/tidy_counts.csv",
  col_types = cols(
    ENTREZ = col_character(),
    count_total = col_character(),
    Rep = col_logical()
  )
)

## Warning: 188 parsing failures.
## row col           expected actual                   file
##   3 Rep 1/0/T/F/TRUE/FALSE      2 'data/tidy_counts.csv'
##   4 Rep 1/0/T/F/TRUE/FALSE      2 'data/tidy_counts.csv'
##   7 Rep 1/0/T/F/TRUE/FALSE      2 'data/tidy_counts.csv'
##   8 Rep 1/0/T/F/TRUE/FALSE      2 'data/tidy_counts.csv'
##  11 Rep 1/0/T/F/TRUE/FALSE      2 'data/tidy_counts.csv'
## ... ... .................. ...... ......................
## See problems(...) for more details.

# First eight ENTREZ values, as parsed with the forced column type.
head(tidy_counts$ENTREZ, n = 8)

## [1] "350" "350" "350" "350" "351" "351" "351" "351"

# First eight count_total values, as parsed with the forced column type.
head(tidy_counts$count_total, n = 8)

## [1] "307"   "307"   "307"   "307"   "26580" "26580" "26580" "26580"

# First eight Rep values, as parsed with the forced column type.
head(tidy_counts$Rep, n = 8)

## [1] TRUE TRUE   NA   NA TRUE TRUE   NA   NA

Re-read the tidy counts CSV into R. How many ATPase genes (symbols starting with ATP) are there, and what are they? Subset the dataframe to just these ATPase genes. Create a new lowercase variable that has ‘ATP’ removed from the symbol.

Load in dataset and packages

# Re-import the count data, this time letting readr guess the column types.
tidy_counts <- read_csv(file = "data/tidy_counts.csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ENTREZ = col_double(),
##   Sample = col_character(),
##   CellType = col_character(),
##   Rep = col_double(),
##   counts = col_double(),
##   count_total = col_double(),
##   CPM = col_double(),
##   SYMBOL = col_character(),
##   CHR = col_character(),
##   LENGTH = col_double(),
##   TPM = col_double()
## )

# Count the distinct gene symbols that begin with "ATP". str_starts() returns
# one logical per distinct symbol, so sum() counts the matches; length() would
# instead report the total number of distinct symbols, whether or not they
# match.
tidy_counts %>%
  pull(SYMBOL) %>%
  unique() %>%
  str_starts("ATP") %>%
  sum(na.rm = TRUE)

## [1] 15

# List the distinct gene symbols that begin with "ATP". filter() keeps only
# rows where the test is TRUE, so rows with a missing SYMBOL are dropped
# rather than turned into all-NA rows as logical bracket indexing would do.
tidy_counts %>%
  filter(str_starts(SYMBOL, "ATP")) %>%
  pull(SYMBOL) %>%
  unique()

##  [1] "ATP1A1" "ATP1A2" "ATP1A3" "ATP12A" "ATP1A4" "ATP1B1" "ATP1B2" "ATP1B3"
##  [9] "ATP2A1" "ATP2A2" "ATP2A3" "ATP2B1" "ATP2B2" "ATP2B3" "ATP2B4"

# Subset to the rows whose symbol begins with "ATP" and add ATPtype: the
# symbol with every "ATP" occurrence removed, converted to lower case.
tidy_counts %>%
  filter(str_starts(SYMBOL, "ATP")) %>%
  mutate(ATPtype = str_to_lower(str_remove_all(SYMBOL, "ATP")))

## # A tibble: 60 x 12
##    ENTREZ Sample  CellType   Rep counts count_total     CPM SYMBOL CHR   LENGTH
##     <dbl> <chr>   <chr>    <dbl>  <dbl>       <dbl>   <dbl> <chr>  <chr>  <dbl>
##  1    476 CD34_1  CD34         1   4952       14023 33065.  ATP1A1 chr1    5912
##  2    476 ORTHO_1 ORTHO        1   3453       14023 41990.  ATP1A1 chr1    5912
##  3    476 CD34_2  CD34         2   4202       14023 39760.  ATP1A1 chr1    5912
##  4    476 ORTHO_2 ORTHO        2   1416       14023 43033.  ATP1A1 chr1    5912
##  5    477 CD34_1  CD34         1     13          44    86.8 ATP1A2 chr1    5972
##  6    477 ORTHO_1 ORTHO        1      3          44    36.5 ATP1A2 chr1    5972
##  7    477 CD34_2  CD34         2     26          44   246.  ATP1A2 chr1    5972
##  8    477 ORTHO_2 ORTHO        2      2          44    60.8 ATP1A2 chr1    5972
##  9    478 CD34_1  CD34         1     78         266   521.  ATP1A3 chr19   4054
## 10    478 ORTHO_1 ORTHO        1    121         266  1471.  ATP1A3 chr19   4054
## # … with 50 more rows, and 2 more variables: TPM <dbl>, ATPtype <chr>

RU_tidyverse

Rockefeller University, Bioinformatics Resource Centre

https://rockefelleruniversity.github.io/RU_tidyverse_core/

Load in dataset and packages