Dr. Michael Mullarkey
# A tibble: 6 × 3
individual_id date_egg species
<chr> <date> <chr>
1 N100A1 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
2 N100A2 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
3 N10A1 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
4 N10A2 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
5 N11A1 2008-11-09 Gentoo penguin (Pygoscelis papua)
6 N11A1 2007-11-12 Adelie Penguin (Pygoscelis adeliae)
# A tibble: 6 × 3
individual_id date_egg species
<chr> <date> <chr>
1 N100A1 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
2 N100A2 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
3 N10A1 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
4 N10A2 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
5 N11A1 2008-11-09 Gentoo penguin (Pygoscelis papua)
6 N11A2 2008-11-09 Gentoo penguin (Pygoscelis papua)
# Preview the newest record per individual_id: rows are ordered by ID with the
# latest date_egg first, so distinct() retains that row for each ID.
head(
  penguins_raw %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview the newest record per individual_id: rows are ordered by ID with the
# latest date_egg first, so distinct() retains that row for each ID.
head(
  penguins_raw_new %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview the newest record per individual_id: rows are ordered by ID with the
# latest date_egg first, so distinct() retains that row for each ID.
head(
  penguins_raw %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview the newest record per individual_id: rows are ordered by ID with the
# latest date_egg first, so distinct() retains that row for each ID.
head(
  penguins_raw_new %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview the newest record per individual_id: rows are ordered by ID with the
# latest date_egg first, so distinct() retains that row for each ID.
head(
  penguins_raw_old %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview one record per individual_id. date_egg is sorted ASCENDING here, so
# distinct() retains the earliest row for each ID.
# NOTE(review): other copies of this pipeline in the file sort with
# desc(date_egg) and therefore keep the latest row - confirm which is intended.
head(
  penguins_raw %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, date_egg) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview one record per individual_id. date_egg is sorted ASCENDING here, so
# distinct() retains the earliest row for each ID.
# NOTE(review): other copies of this pipeline in the file sort with
# desc(date_egg) and therefore keep the latest row - confirm which is intended.
head(
  penguins_raw_new %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, date_egg) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview the newest record per individual_id: rows are ordered by ID with the
# latest date_egg first, so distinct() retains that row for each ID.
head(
  penguins_raw_old %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Preview the newest record per individual_id: rows are ordered by ID with the
# latest date_egg first, so distinct() retains that row for each ID.
head(
  penguins_raw %>%
    clean_names() %>%
    select(individual_id, date_egg, species) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)
)
# Return the first six rows of the newest-record-per-individual view of .data.
#
# .data: data frame whose cleaned column names include individual_id,
#   date_egg and species.
clean_penguins <- function(.data) {
  tidy_cols <- select(clean_names(.data), individual_id, date_egg, species)
  # Newest date first within each ID, so distinct() keeps the latest row.
  newest_first <- arrange(tidy_cols, individual_id, desc(date_egg))
  one_per_id <- distinct(newest_first, individual_id, .keep_all = TRUE)
  head(one_per_id)
}
# Return the first six rows of the newest-record-per-individual view of .data.
#
# .data: data frame whose cleaned column names include individual_id,
#   date_egg and species.
clean_penguins <- function(.data) {
  tidy_cols <- select(clean_names(.data), individual_id, date_egg, species)
  # Newest date first within each ID, so distinct() keeps the latest row.
  newest_first <- arrange(tidy_cols, individual_id, desc(date_egg))
  one_per_id <- distinct(newest_first, individual_id, .keep_all = TRUE)
  head(one_per_id)
}
# Apply clean_penguins() to each of the three raw data sets; the result is a
# list with one preview tibble per input, in the same order.
map(
  list(penguins_raw, penguins_raw_new, penguins_raw_old),
  clean_penguins
)
[[1]]
# A tibble: 6 × 3
individual_id date_egg species
<chr> <date> <chr>
1 N100A1 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
2 N100A2 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
3 N10A1 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
4 N10A2 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
5 N11A1 2008-11-09 Gentoo penguin (Pygoscelis papua)
6 N11A2 2008-11-09 Gentoo penguin (Pygoscelis papua)
[[2]]
# A tibble: 6 × 3
individual_id date_egg species
<chr> <date> <chr>
1 N11A1 2007-11-12 Adelie Penguin (Pygoscelis adeliae)
2 N11A2 2008-11-09 Gentoo penguin (Pygoscelis papua)
3 N13A1 2007-11-10 Adelie Penguin (Pygoscelis adeliae)
4 N13A2 2009-11-20 Gentoo penguin (Pygoscelis papua)
5 N14A2 2009-11-25 Gentoo penguin (Pygoscelis papua)
6 N15A1 2009-11-25 Gentoo penguin (Pygoscelis papua)
[[3]]
# A tibble: 6 × 3
individual_id date_egg species
<chr> <date> <chr>
1 N100A2 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
2 N12A1 2007-11-12 Adelie Penguin (Pygoscelis adeliae)
3 N12A2 2007-11-12 Adelie Penguin (Pygoscelis adeliae)
4 N14A1 2009-11-25 Gentoo penguin (Pygoscelis papua)
5 N15A1 2009-11-25 Gentoo penguin (Pygoscelis papua)
6 N16A2 2008-11-03 Gentoo penguin (Pygoscelis papua)
# Return the island whose penguins have the largest mean flipper length.
#
# .data: raw penguin data; after clean_names() it must expose individual_id,
#   date_egg, species, flipper_length_mm and island.
# Returns the island value(s) of the top-ranked summary row (slice_max()
#   keeps ties by default).
do_it_all <- function(.data) {
  # One row per individual_id: the one with the newest date_egg.
  latest_rows <- .data %>%
    clean_names() %>%
    select(individual_id, date_egg, species, flipper_length_mm, island) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)

  # Per-island flipper statistics, ignoring missing measurements.
  island_stats <- latest_rows %>%
    group_by(island) %>%
    summarise(
      mean_flipper_length = mean(flipper_length_mm, na.rm = TRUE),
      sd_flipper_length = sd(flipper_length_mm, na.rm = TRUE)
    )

  ranked <- arrange(island_stats, desc(mean_flipper_length), .by_group = TRUE)
  top_row <- slice_max(ranked, mean_flipper_length)
  pull(top_row, island)
}
[1] "Biscoe"
# Return the island whose penguins have the largest mean flipper length.
#
# .data: raw penguin data; after clean_names() it must expose individual_id,
#   date_egg, species, flipper_length_mm and island.
# Returns the island value(s) of the top-ranked summary row (slice_max()
#   keeps ties by default).
do_it_all <- function(.data) {
  # One row per individual_id: the one with the newest date_egg.
  latest_rows <- .data %>%
    clean_names() %>%
    select(individual_id, date_egg, species, flipper_length_mm, island) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)

  # Per-island flipper statistics, ignoring missing measurements.
  island_stats <- latest_rows %>%
    group_by(island) %>%
    summarise(
      mean_flipper_length = mean(flipper_length_mm, na.rm = TRUE),
      sd_flipper_length = sd(flipper_length_mm, na.rm = TRUE)
    )

  ranked <- arrange(island_stats, desc(mean_flipper_length), .by_group = TRUE)
  top_row <- slice_max(ranked, mean_flipper_length)
  pull(top_row, island)
}
# Keep only the analysis columns of penguins_raw via select_vars(), print the
# result, and store it (print() hands its argument back).
penguins_selected <- print(
  select_vars(
    penguins_raw,
    vars_to_select = c(
      "individual_id", "date_egg", "species",
      "flipper_length_mm", "culmen_length_mm", "island"
    )
  )
)
# A tibble: 344 × 6
individual_id date_egg species flipper_length_mm culmen_length_mm island
<chr> <date> <chr> <dbl> <dbl> <chr>
1 N1A1 2007-11-11 Adelie Pe… 181 39.1 Torge…
2 N1A2 2007-11-11 Adelie Pe… 186 39.5 Torge…
3 N2A1 2007-11-16 Adelie Pe… 195 40.3 Torge…
4 N2A2 2007-11-16 Adelie Pe… NA NA Torge…
5 N3A1 2007-11-16 Adelie Pe… 193 36.7 Torge…
6 N3A2 2007-11-16 Adelie Pe… 190 39.3 Torge…
7 N4A1 2007-11-15 Adelie Pe… 181 38.9 Torge…
8 N4A2 2007-11-15 Adelie Pe… 195 39.2 Torge…
9 N5A1 2007-11-09 Adelie Pe… 193 34.1 Torge…
10 N5A2 2007-11-09 Adelie Pe… 190 42 Torge…
# ℹ 334 more rows
# Return the island whose penguins have the largest mean flipper length.
#
# .data: raw penguin data; after clean_names() it must expose individual_id,
#   date_egg, species, flipper_length_mm and island.
# Returns the island value(s) of the top-ranked summary row (slice_max()
#   keeps ties by default).
do_it_all <- function(.data) {
  # One row per individual_id: the one with the newest date_egg.
  latest_rows <- .data %>%
    clean_names() %>%
    select(individual_id, date_egg, species, flipper_length_mm, island) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)

  # Per-island flipper statistics, ignoring missing measurements.
  island_stats <- latest_rows %>%
    group_by(island) %>%
    summarise(
      mean_flipper_length = mean(flipper_length_mm, na.rm = TRUE),
      sd_flipper_length = sd(flipper_length_mm, na.rm = TRUE)
    )

  ranked <- arrange(island_stats, desc(mean_flipper_length), .by_group = TRUE)
  top_row <- slice_max(ranked, mean_flipper_length)
  pull(top_row, island)
}
# A tibble: 190 × 6
individual_id date_egg species flipper_length_mm culmen_length_mm island
<chr> <date> <chr> <dbl> <dbl> <chr>
1 N100A1 2009-11-21 Chinstrap… 210 50.8 Dream
2 N100A2 2009-11-21 Chinstrap… 198 50.2 Dream
3 N10A1 2007-11-16 Adelie Pe… 184 34.4 Torge…
4 N10A2 2007-11-16 Adelie Pe… 194 46 Torge…
5 N11A1 2008-11-09 Gentoo pe… 210 45.5 Biscoe
6 N11A2 2008-11-09 Gentoo pe… 225 50.5 Biscoe
7 N12A1 2008-11-02 Gentoo pe… 213 44.9 Biscoe
8 N12A2 2008-11-02 Gentoo pe… 215 45.2 Biscoe
9 N13A1 2009-11-20 Gentoo pe… 218 43.4 Biscoe
10 N13A2 2009-11-20 Gentoo pe… 218 51.3 Biscoe
# ℹ 180 more rows
# Return the island whose penguins have the largest mean flipper length.
#
# .data: raw penguin data; after clean_names() it must expose individual_id,
#   date_egg, species, flipper_length_mm and island.
# Returns the island value(s) of the top-ranked summary row (slice_max()
#   keeps ties by default).
do_it_all <- function(.data) {
  # One row per individual_id: the one with the newest date_egg.
  latest_rows <- .data %>%
    clean_names() %>%
    select(individual_id, date_egg, species, flipper_length_mm, island) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)

  # Per-island flipper statistics, ignoring missing measurements.
  island_stats <- latest_rows %>%
    group_by(island) %>%
    summarise(
      mean_flipper_length = mean(flipper_length_mm, na.rm = TRUE),
      sd_flipper_length = sd(flipper_length_mm, na.rm = TRUE)
    )

  ranked <- arrange(island_stats, desc(mean_flipper_length), .by_group = TRUE)
  top_row <- slice_max(ranked, mean_flipper_length)
  pull(top_row, island)
}
# Summarise columns by group and sort the groups by one summary column.
#
# .data: data frame to summarise.
# grouping_var: string naming the single column to group by.
# summary_vars: character vector of columns to summarise; all_of() errors if
#   any of them is missing from .data.
# arrange_var: string naming the summary column to sort by, descending.
#   Summary columns are named "<column>_<stat>", e.g. "flipper_length_mm_mean".
# Returns one row per group holding <column>_mean, <column>_sd and
#   <column>_median (missing values ignored) for every entry of summary_vars.
get_grouped_summary <- function(.data,
                                grouping_var,
                                summary_vars,
                                arrange_var) {
  .data %>%
    group_by(.data[[grouping_var]]) %>%
    summarise(across(
      all_of(summary_vars),
      list(
        mean = ~ mean(., na.rm = TRUE),
        sd = ~ sd(., na.rm = TRUE),
        median = ~ median(., na.rm = TRUE)
      )
    )) %>%
    # arrange_var arrives as a string, so look the column up through the .data
    # pronoun. Embracing it with {{ }} would sort by the constant string
    # itself, which leaves the row order untouched.
    arrange(desc(.data[[arrange_var]]), .by_group = TRUE)
}
# Summarise flipper and culmen length per island for penguins_latest, print
# the table, and store it (print() hands its argument back).
penguins_summary <- print(
  get_grouped_summary(
    penguins_latest,
    grouping_var = "island",
    summary_vars = c("flipper_length_mm", "culmen_length_mm"),
    arrange_var = "flipper_length_mm_mean"
  )
)
# A tibble: 3 × 7
island flipper_length_mm_mean flipper_length_mm_sd flipper_length_mm_median
<chr> <dbl> <dbl> <dbl>
1 Biscoe 211. 13.1 214
2 Dream 194. 7.24 193
3 Torgersen 192. 6.77 191
# ℹ 3 more variables: culmen_length_mm_mean <dbl>, culmen_length_mm_sd <dbl>,
# culmen_length_mm_median <dbl>
# Return the island whose penguins have the largest mean flipper length.
#
# .data: raw penguin data; after clean_names() it must expose individual_id,
#   date_egg, species, flipper_length_mm and island.
# Returns the island value(s) of the top-ranked summary row (slice_max()
#   keeps ties by default).
do_it_all <- function(.data) {
  # One row per individual_id: the one with the newest date_egg.
  latest_rows <- .data %>%
    clean_names() %>%
    select(individual_id, date_egg, species, flipper_length_mm, island) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)

  # Per-island flipper statistics, ignoring missing measurements.
  island_stats <- latest_rows %>%
    group_by(island) %>%
    summarise(
      mean_flipper_length = mean(flipper_length_mm, na.rm = TRUE),
      sd_flipper_length = sd(flipper_length_mm, na.rm = TRUE)
    )

  ranked <- arrange(island_stats, desc(mean_flipper_length), .by_group = TRUE)
  top_row <- slice_max(ranked, mean_flipper_length)
  pull(top_row, island)
}
# Return the island whose penguins have the largest mean flipper length.
#
# .data: raw penguin data; after clean_names() it must expose individual_id,
#   date_egg, species, flipper_length_mm and island.
# Returns the island value(s) of the top-ranked summary row (slice_max()
#   keeps ties by default).
do_it_all <- function(.data) {
  # One row per individual_id: the one with the newest date_egg.
  latest_rows <- .data %>%
    clean_names() %>%
    select(individual_id, date_egg, species, flipper_length_mm, island) %>%
    arrange(individual_id, desc(date_egg)) %>%
    distinct(individual_id, .keep_all = TRUE)

  # Per-island flipper statistics, ignoring missing measurements.
  island_stats <- latest_rows %>%
    group_by(island) %>%
    summarise(
      mean_flipper_length = mean(flipper_length_mm, na.rm = TRUE),
      sd_flipper_length = sd(flipper_length_mm, na.rm = TRUE)
    )

  ranked <- arrange(island_stats, desc(mean_flipper_length), .by_group = TRUE)
  top_row <- slice_max(ranked, mean_flipper_length)
  pull(top_row, island)
}
# Run the whole pipeline: select columns, de-duplicate with get_only_latest(),
# summarise by group, and return the group ranked highest on max_var_func.
#
# .data: raw data frame handed to select_vars().
# vars_to_select_func: column names forwarded to select_vars().
# grouping_var_func: grouping column name; used for the summary and as the
#   column whose value is returned.
# summary_vars_func: columns forwarded to get_grouped_summary().
# arrange_var_func: sort column forwarded to get_grouped_summary().
# max_var_func: ranking column forwarded to identify_max_group().
# Returns whatever identify_max_group() returns.
do_it_all <- function(.data,
                      vars_to_select_func,
                      grouping_var_func,
                      summary_vars_func,
                      arrange_var_func,
                      max_var_func) {
  .data %>%
    select_vars(vars_to_select = vars_to_select_func) %>%
    get_only_latest() %>%
    get_grouped_summary(
      grouping_var = grouping_var_func,
      summary_vars = summary_vars_func,
      arrange_var = arrange_var_func
    ) %>%
    identify_max_group(
      max_var = max_var_func,
      group_var = grouping_var_func
    )
}
# Find the island with the largest mean flipper length in penguins_raw.
do_it_all(
  penguins_raw,
  vars_to_select_func = c(
    "individual_id", "date_egg", "species",
    "flipper_length_mm", "culmen_length_mm", "island"
  ),
  grouping_var_func = "island",
  summary_vars_func = c("flipper_length_mm", "culmen_length_mm"),
  arrange_var_func = "flipper_length_mm_mean",
  max_var_func = "flipper_length_mm_mean"
)
[1] "Biscoe"
# Truncated variant of do_it_all(): it stops after de-duplication and returns
# that intermediate table. The summary and max-group steps are commented out,
# so grouping_var_func, summary_vars_func, arrange_var_func and max_var_func
# are accepted but currently unused.
do_it_all <- function(.data,
                      vars_to_select_func,
                      grouping_var_func,
                      summary_vars_func,
                      arrange_var_func,
                      max_var_func) {
  latest_df <- get_only_latest(
    select_vars(.data, vars_to_select = vars_to_select_func)
  )

  # group_summary_df <- latest_df %>%
  #   get_grouped_summary(
  #     grouping_var = grouping_var_func,
  #     summary_vars = summary_vars_func,
  #     arrange_var = arrange_var_func
  #   )
  #
  # max_group <- group_summary_df %>%
  #   identify_max_group(
  #     max_var = max_var_func,
  #     group_var = grouping_var_func
  #   )

  latest_df
}
# A tibble: 6 × 3
individual_id date_egg species
<chr> <date> <chr>
1 N100A1 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
2 N100A2 2009-11-21 Chinstrap penguin (Pygoscelis antarctica)
3 N10A1 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
4 N10A2 2007-11-16 Adelie Penguin (Pygoscelis adeliae)
5 N11A1 2008-11-09 Gentoo penguin (Pygoscelis papua)
6 N11A2 2008-11-09 Gentoo penguin (Pygoscelis papua)
# get_max_group.R
# Main Function
# Run the whole pipeline: select columns, de-duplicate with get_only_latest(),
# summarise by group, and return the group ranked highest on max_var_func.
#
# .data: raw data frame handed to select_vars().
# vars_to_select_func: column names forwarded to select_vars().
# grouping_var_func: grouping column name; used for the summary and as the
#   column whose value is returned.
# summary_vars_func: columns forwarded to get_grouped_summary().
# arrange_var_func: sort column forwarded to get_grouped_summary().
# max_var_func: ranking column forwarded to identify_max_group().
# Returns whatever identify_max_group() returns.
do_it_all <- function(.data,
                      vars_to_select_func,
                      grouping_var_func,
                      summary_vars_func,
                      arrange_var_func,
                      max_var_func) {
  .data %>%
    select_vars(vars_to_select = vars_to_select_func) %>%
    get_only_latest() %>%
    get_grouped_summary(
      grouping_var = grouping_var_func,
      summary_vars = summary_vars_func,
      arrange_var = arrange_var_func
    ) %>%
    identify_max_group(
      max_var = max_var_func,
      group_var = grouping_var_func
    )
}
# Helper Functions
# Standardise column names with clean_names(), then keep the requested columns.
#
# .data: data frame to trim.
# vars_to_select: character vector of (cleaned) column names; any_of() skips
#   names that are not present instead of raising an error.
select_vars <- function(.data, vars_to_select) {
  cleaned <- clean_names(.data)
  select(cleaned, any_of(vars_to_select))
}
# Keep one row per individual_id: the row with the most recent date_egg.
#
# .data: data frame containing individual_id and date_egg columns.
get_only_latest <- function(.data) {
  # Newest date first within each ID, so distinct() retains the latest row.
  newest_first <- arrange(.data, individual_id, desc(date_egg))
  distinct(newest_first, individual_id, .keep_all = TRUE)
}
# Summarise columns by group and sort the groups by one summary column.
#
# .data: data frame to summarise.
# grouping_var: string naming the single column to group by.
# summary_vars: character vector of columns to summarise; all_of() errors if
#   any of them is missing from .data.
# arrange_var: string naming the summary column to sort by, descending.
#   Summary columns are named "<column>_<stat>", e.g. "flipper_length_mm_mean".
# Returns one row per group holding <column>_mean, <column>_sd and
#   <column>_median (missing values ignored) for every entry of summary_vars.
get_grouped_summary <- function(.data,
                                grouping_var,
                                summary_vars,
                                arrange_var) {
  .data %>%
    group_by(.data[[grouping_var]]) %>%
    summarise(across(
      all_of(summary_vars),
      list(
        mean = ~ mean(., na.rm = TRUE),
        sd = ~ sd(., na.rm = TRUE),
        median = ~ median(., na.rm = TRUE)
      )
    )) %>%
    # arrange_var arrives as a string, so look the column up through the .data
    # pronoun. Embracing it with {{ }} would sort by the constant string
    # itself, which leaves the row order untouched.
    arrange(desc(.data[[arrange_var]]), .by_group = TRUE)
}
# Return the group value belonging to the largest entry of a summary column.
#
# .data: summary data frame, one row per group.
# max_var: string naming the column to rank by.
# group_var: column whose value is returned for the top row(s).
# Returns a vector of group_var values; ties on max_var are all returned
#   because slice_max() keeps ties by default.
identify_max_group <- function(.data,
                               max_var,
                               group_var) {
  top_rows <- slice_max(.data, order_by = .data[[max_var]], n = 1)
  pull(top_rows, {{ group_var }})
}
# Run Main Function
library(palmerpenguins)
# NOTE(review): only palmerpenguins is attached in the visible script, yet the
# functions above use %>%, dplyr verbs and clean_names() - confirm the packages
# providing them are attached before this point.
# Island with the largest mean flipper length, computed from penguins_raw.
output <- do_it_all(
  penguins_raw,
  vars_to_select_func = c(
    "individual_id", "date_egg", "species",
    "flipper_length_mm", "culmen_length_mm", "island"
  ),
  grouping_var_func = "island",
  summary_vars_func = c("flipper_length_mm", "culmen_length_mm"),
  arrange_var_func = "flipper_length_mm_mean",
  max_var_func = "flipper_length_mm_mean"
)
print(output)
# get_max_group.R
# Take in command line arguments.
# trailingOnly = TRUE keeps only the arguments supplied after the script name.
# args[1] is later passed to do_it_all() as the grouping column; if no argument
# is supplied, args is empty and args[1] evaluates to NA.
args <- commandArgs(trailingOnly = TRUE)
# Main Function
# Run the whole pipeline: select columns, de-duplicate with get_only_latest(),
# summarise by group, and return the group ranked highest on max_var_func.
#
# .data: raw data frame handed to select_vars().
# vars_to_select_func: column names forwarded to select_vars().
# grouping_var_func: grouping column name; used for the summary and as the
#   column whose value is returned.
# summary_vars_func: columns forwarded to get_grouped_summary().
# arrange_var_func: sort column forwarded to get_grouped_summary().
# max_var_func: ranking column forwarded to identify_max_group().
# Returns whatever identify_max_group() returns.
do_it_all <- function(.data,
                      vars_to_select_func,
                      grouping_var_func,
                      summary_vars_func,
                      arrange_var_func,
                      max_var_func) {
  .data %>%
    select_vars(vars_to_select = vars_to_select_func) %>%
    get_only_latest() %>%
    get_grouped_summary(
      grouping_var = grouping_var_func,
      summary_vars = summary_vars_func,
      arrange_var = arrange_var_func
    ) %>%
    identify_max_group(
      max_var = max_var_func,
      group_var = grouping_var_func
    )
}
# Helper Functions
# Standardise column names with clean_names(), then keep the requested columns.
#
# .data: data frame to trim.
# vars_to_select: character vector of (cleaned) column names; any_of() skips
#   names that are not present instead of raising an error.
select_vars <- function(.data, vars_to_select) {
  cleaned <- clean_names(.data)
  select(cleaned, any_of(vars_to_select))
}
# Keep one row per individual_id: the row with the most recent date_egg.
#
# .data: data frame containing individual_id and date_egg columns.
get_only_latest <- function(.data) {
  # Newest date first within each ID, so distinct() retains the latest row.
  newest_first <- arrange(.data, individual_id, desc(date_egg))
  distinct(newest_first, individual_id, .keep_all = TRUE)
}
# Summarise columns by group and sort the groups by one summary column.
#
# .data: data frame to summarise.
# grouping_var: string naming the single column to group by.
# summary_vars: character vector of columns to summarise; all_of() errors if
#   any of them is missing from .data.
# arrange_var: string naming the summary column to sort by, descending.
#   Summary columns are named "<column>_<stat>", e.g. "flipper_length_mm_mean".
# Returns one row per group holding <column>_mean, <column>_sd and
#   <column>_median (missing values ignored) for every entry of summary_vars.
get_grouped_summary <- function(.data,
                                grouping_var,
                                summary_vars,
                                arrange_var) {
  .data %>%
    group_by(.data[[grouping_var]]) %>%
    summarise(across(
      all_of(summary_vars),
      list(
        mean = ~ mean(., na.rm = TRUE),
        sd = ~ sd(., na.rm = TRUE),
        median = ~ median(., na.rm = TRUE)
      )
    )) %>%
    # arrange_var arrives as a string, so look the column up through the .data
    # pronoun. Embracing it with {{ }} would sort by the constant string
    # itself, which leaves the row order untouched.
    arrange(desc(.data[[arrange_var]]), .by_group = TRUE)
}
# Return the group value belonging to the largest entry of a summary column.
#
# .data: summary data frame, one row per group.
# max_var: string naming the column to rank by.
# group_var: column whose value is returned for the top row(s).
# Returns a vector of group_var values; ties on max_var are all returned
#   because slice_max() keeps ties by default.
identify_max_group <- function(.data,
                               max_var,
                               group_var) {
  top_rows <- slice_max(.data, order_by = .data[[max_var]], n = 1)
  pull(top_rows, {{ group_var }})
}
# Run Main Function
library(palmerpenguins)
# NOTE(review): only palmerpenguins is attached in the visible script, yet the
# functions above use %>%, dplyr verbs and clean_names() - confirm the packages
# providing them are attached before this point.

# The grouping column comes from the first command line argument. Without it
# args[1] is NA and the pipeline fails with an obscure subsetting error, so
# stop early with a usage hint instead.
if (length(args) < 1L || !nzchar(args[1])) {
  stop("Usage: Rscript get_max_group.R <grouping_var>", call. = FALSE)
}

output <- penguins_raw %>%
  do_it_all(
    # The grouping column has to survive select_vars(); add it to the fixed
    # set so columns other than the pre-selected ones can be grouped on.
    # unique() leaves the vector unchanged when it is already listed.
    vars_to_select_func = unique(c(
      "individual_id", "date_egg", "species",
      "flipper_length_mm", "culmen_length_mm", "island",
      args[1]
    )),
    grouping_var_func = args[1],
    summary_vars_func = c("flipper_length_mm", "culmen_length_mm"),
    arrange_var_func = "flipper_length_mm_mean",
    max_var_func = "flipper_length_mm_mean"
  )
print(output)