These exercises cover the sections of the tidyverse session.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## salmon_id = col_double(),
## `common_name-size` = col_character(),
## variable = col_character(),
## value = col_double()
## )
## # A tibble: 194 x 4
## salmon_id `common_name-size` variable value
## <dbl> <chr> <chr> <dbl>
## 1 35032 Chinook salmon - yearling length_mm 147
## 2 35032 Chinook salmon - yearling IGF1_ng_ml 41.3
## 3 35033 Chinook salmon - mixed age juvenile length_mm 444
## 4 35033 Chinook salmon - mixed age juvenile IGF1_ng_ml 62.1
## 5 35034 Sockeye salmon - juvenile length_mm 139
## 6 35034 Sockeye salmon - juvenile IGF1_ng_ml NA
## 7 35035 Sockeye salmon - juvenile length_mm 121
## 8 35035 Sockeye salmon - juvenile IGF1_ng_ml NA
## 9 35036 Sockeye salmon - juvenile length_mm 112
## 10 35036 Sockeye salmon - juvenile IGF1_ng_ml NA
## # … with 184 more rows
# Read the fish measurements with an explicit column specification:
# integer ids, factor labels, and a double measurement column.
fish_df <- read_csv(
  file = "data/exercises_fish.csv",
  col_types = cols(
    salmon_id = col_integer(),
    `common_name-size` = col_factor(),
    variable = col_factor(),
    value = col_double()
  )
)
# Auto-print the tibble to inspect the parsed column types.
fish_df
## # A tibble: 194 x 4
## salmon_id `common_name-size` variable value
## <int> <fct> <fct> <dbl>
## 1 35032 Chinook salmon - yearling length_mm 147
## 2 35032 Chinook salmon - yearling IGF1_ng_ml 41.3
## 3 35033 Chinook salmon - mixed age juvenile length_mm 444
## 4 35033 Chinook salmon - mixed age juvenile IGF1_ng_ml 62.1
## 5 35034 Sockeye salmon - juvenile length_mm 139
## 6 35034 Sockeye salmon - juvenile IGF1_ng_ml NA
## 7 35035 Sockeye salmon - juvenile length_mm 121
## 8 35035 Sockeye salmon - juvenile IGF1_ng_ml NA
## 9 35036 Sockeye salmon - juvenile length_mm 112
## 10 35036 Sockeye salmon - juvenile IGF1_ng_ml NA
## # … with 184 more rows
# Tidy fish_df in place:
#   1. split `common_name-size` on " - " into `common_name` and `age`;
#   2. spread the `variable`/`value` pairs into one column per variable.
fish_df <- fish_df %>%
  separate(
    `common_name-size`,
    into = c("common_name", "age"),
    sep = " - ",
    remove = TRUE
  ) %>%
  pivot_wider(names_from = variable, values_from = value)
## # A tibble: 38 x 2
## age IGF1_ng_ml
## <chr> <dbl>
## 1 juvenile 42.7
## 2 juvenile NA
## 3 juvenile 9.92
## 4 juvenile 38.1
## 5 juvenile 46.4
## 6 juvenile 42.6
## 7 juvenile 23.6
## 8 juvenile 4.46
## 9 juvenile NA
## 10 juvenile 24.2
## # … with 28 more rows
# Fish whose common name starts with "S", ordered by name and then by
# decreasing length, with the IGF1 column dropped.
fish_df %>%
  filter(str_starts(common_name, "S")) %>%
  arrange(common_name, desc(length_mm)) %>%
  select(-IGF1_ng_ml)
## # A tibble: 49 x 4
## salmon_id common_name age length_mm
## <int> <chr> <chr> <dbl>
## 1 35144 Sockeye salmon juvenile 140
## 2 35034 Sockeye salmon juvenile 139
## 3 35119 Sockeye salmon juvenile 122
## 4 35035 Sockeye salmon juvenile 121
## 5 35100 Sockeye salmon juvenile 118
## 6 35096 Sockeye salmon juvenile 115
## 7 35147 Sockeye salmon juvenile 115
## 8 35036 Sockeye salmon juvenile 112
## 9 35098 Sockeye salmon juvenile 112
## 10 35099 Sockeye salmon juvenile 111
## # … with 39 more rows
# Mean length of the longest fish within each age class.
# rank(-length_mm) ranks fish from longest to shortest inside each group;
# keeping ranks <= 5 retains the top five (ties share an averaged rank, so a
# tie around the cut-off can change how many rows are kept).
fish_df %>%
  group_by(age) %>%
  filter(rank(-length_mm) <= 5) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  summarise(mean_length_mm = mean(length_mm, na.rm = TRUE))
## # A tibble: 4 x 2
## age mean_length_mm
## <chr> <dbl>
## 1 juvenile 274
## 2 mixed age juvenile 380.
## 3 subyearling 90
## 4 yearling 213
# Boxplot of per-species length z-scores for yearling fish.
fish_df %>%
  filter(age == "yearling") %>%
  group_by(common_name) %>%
  # scale() returns a one-column matrix; as.numeric() stores the z-scores as a
  # plain numeric vector instead of a matrix column.
  mutate(length_zscore = as.numeric(scale(length_mm))) %>%
  # Drop the grouping once the per-species standardisation is done.
  ungroup() %>%
  ggplot(aes(x = common_name, y = length_zscore)) +
  # One fill colour per box; the two-colour vector assumes exactly two species
  # among the yearlings -- TODO confirm against the data.
  geom_boxplot(fill = c("gold", "darkorange"))
# Read the counts table with hand-picked column types.
# NOTE(review): these types look deliberately mis-specified for the exercise;
# the warning printed below shows Rep values of 2 that col_logical() cannot
# parse. Confirm before "fixing" them.
tidy_counts <- read_csv(
  file = "data/tidy_counts.csv",
  col_types = cols(
    ENTREZ = col_character(),
    count_total = col_character(),
    Rep = col_logical()
  )
)
## Warning: 188 parsing failures.
## row col expected actual file
## 3 Rep 1/0/T/F/TRUE/FALSE 2 'data/tidy_counts.csv'
## 4 Rep 1/0/T/F/TRUE/FALSE 2 'data/tidy_counts.csv'
## 7 Rep 1/0/T/F/TRUE/FALSE 2 'data/tidy_counts.csv'
## 8 Rep 1/0/T/F/TRUE/FALSE 2 'data/tidy_counts.csv'
## 11 Rep 1/0/T/F/TRUE/FALSE 2 'data/tidy_counts.csv'
## ... ... .................. ...... ......................
## See problems(...) for more details.
## [1] "350" "350" "350" "350" "351" "351" "351" "351"
## [1] "307" "307" "307" "307" "26580" "26580" "26580" "26580"
## [1] TRUE TRUE NA NA TRUE TRUE NA NA
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ENTREZ = col_double(),
## Sample = col_character(),
## CellType = col_character(),
## Rep = col_double(),
## counts = col_double(),
## count_total = col_double(),
## CPM = col_double(),
## SYMBOL = col_character(),
## CHR = col_character(),
## LENGTH = col_double(),
## TPM = col_double()
## )
## [1] 94
# Unique gene symbols that start with "ATP".
# filter() replaces the pull() -> str_starts() -> tidy_counts[., ] round trip;
# it keeps only rows where the test is TRUE, so a missing SYMBOL can no longer
# produce an all-NA row.
tidy_counts %>%
  filter(str_starts(SYMBOL, "ATP")) %>%
  pull(SYMBOL) %>%
  unique()
## [1] "ATP1A1" "ATP1A2" "ATP1A3" "ATP12A" "ATP1A4" "ATP1B1" "ATP1B2" "ATP1B3"
## [9] "ATP2A1" "ATP2A2" "ATP2A3" "ATP2B1" "ATP2B2" "ATP2B3" "ATP2B4"
# Rows whose gene SYMBOL starts with "ATP", plus an ATPtype column holding the
# SYMBOL with every "ATP" occurrence removed and converted to lower case.
# filter() replaces the pull() -> str_starts() -> tidy_counts[., ] round trip
# and keeps only rows where the test is TRUE.
tidy_counts %>%
  filter(str_starts(SYMBOL, "ATP")) %>%
  mutate(ATPtype = str_to_lower(str_replace_all(SYMBOL, "ATP", "")))
## # A tibble: 60 x 12
## ENTREZ Sample CellType Rep counts count_total CPM SYMBOL CHR LENGTH
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 476 CD34_1 CD34 1 4952 14023 33065. ATP1A1 chr1 5912
## 2 476 ORTHO_1 ORTHO 1 3453 14023 41990. ATP1A1 chr1 5912
## 3 476 CD34_2 CD34 2 4202 14023 39760. ATP1A1 chr1 5912
## 4 476 ORTHO_2 ORTHO 2 1416 14023 43033. ATP1A1 chr1 5912
## 5 477 CD34_1 CD34 1 13 44 86.8 ATP1A2 chr1 5972
## 6 477 ORTHO_1 ORTHO 1 3 44 36.5 ATP1A2 chr1 5972
## 7 477 CD34_2 CD34 2 26 44 246. ATP1A2 chr1 5972
## 8 477 ORTHO_2 ORTHO 2 2 44 60.8 ATP1A2 chr1 5972
## 9 478 CD34_1 CD34 1 78 266 521. ATP1A3 chr19 4054
## 10 478 ORTHO_1 ORTHO 1 121 266 1471. ATP1A3 chr19 4054
## # … with 50 more rows, and 2 more variables: TPM <dbl>, ATPtype <chr>