Palmer Penguins — predicting body mass

Author

Aparna Pandey and Stephan Peischl

Overview

This notebook uses Palmer Penguins as a continuous outcome example: we predict body_mass_g from bill and flipper measurements (and simple factors).

It complements the synthetic gene linear / LASSO notebook (linear-regression-lasso.Rmd), where we know the ground truth. Here we work with real measurements and the usual messiness (correlation, nonlinear hints, missing values). Concepts for shrinkage sit in Module 02; this file is the long lab to run in the IDE.

Load and clean

data("penguins", package = "palmerpenguins")
pg <- penguins |>
  tidyr::drop_na()
nrow(pg)

[1] 333

Exploratory plots

pg_num <- pg |>
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
cor_m <- cor(pg_num)
cor_df <- as.data.frame(as.table(cor_m)) |>
  rename(v1 = Var1, v2 = Var2, r = Freq)
ggplot(cor_df, aes(v1, v2, fill = r)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick", midpoint = 0, limits = c(-1, 1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_fixed() +
  labs(title = "Correlation among numeric penguin measurements")

GGally::ggpairs(
  pg_num,
  upper = list(continuous = wrap("cor", size = 2.2)),
  aes(color = pg$species, alpha = 0.25)
) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Train / test split and matrix of predictors

We model body_mass_g from morphometrics plus species, island, sex, and year (as numeric).

pred_df <- pg |>
  mutate(year = as.numeric(year)) |>
  select(body_mass_g, bill_length_mm, bill_depth_mm, flipper_length_mm, species, island, sex, year)

y <- pred_df$body_mass_g
X <- model.matrix(body_mass_g ~ ., data = pred_df)[, -1] # drop intercept column; dummy coding

set.seed(42)
split <- initial_split(pred_df, prop = 0.8)
train_df <- training(split)
test_df <- testing(split)
rm(split)

rec_scale <- recipe(body_mass_g ~ ., data = train_df) |>
  step_normalize(all_numeric_predictors())
prep_fit <- prep(rec_scale, train_df)
train_b <- bake(prep_fit, train_df)
test_b <- bake(prep_fit, test_df)

y_train <- train_b$body_mass_g
y_test <- test_b$body_mass_g
X_train_s <- model.matrix(body_mass_g ~ ., data = train_b)[, -1]
X_test_s <- model.matrix(body_mass_g ~ ., data = test_b)[, -1]

Ridge and LASSO with cross-validation

cv_ridge <- cv.glmnet(X_train_s, y_train, alpha = 0)
cv_lasso <- cv.glmnet(X_train_s, y_train, alpha = 1)
par(mfrow = c(1, 2))
plot(cv_ridge, main = "Ridge CV")
plot(cv_lasso, main = "LASSO CV")

best_lam_ridge <- cv_ridge$lambda.min
best_lam_lasso <- cv_lasso$lambda.min
fit_ridge <- glmnet(X_train_s, y_train, alpha = 0, lambda = best_lam_ridge)
fit_lasso <- glmnet(X_train_s, y_train, alpha = 1, lambda = best_lam_lasso)

pred_ridge <- as.numeric(predict(fit_ridge, newx = X_test_s))
pred_lasso <- as.numeric(predict(fit_lasso, newx = X_test_s))

reg_metrics <- metric_set(rmse, rsq, mae)
tibble(truth = y_test, .pred = pred_ridge) |>
  reg_metrics(truth, .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     313.   
2 rsq     standard       0.851
3 mae     standard     254.

tibble(truth = y_test, .pred = pred_lasso) |>
  reg_metrics(truth, .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     310.   
2 rsq     standard       0.853
3 mae     standard     249.

Coefficient path (LASSO)

fit_path <- glmnet(X_train_s, y_train, alpha = 1)
plot(fit_path, xvar = "lambda", label = TRUE, main = "LASSO coefficient paths (scaled predictors)")

Takeaways

Morphometrics and species explain much of the variation in mass — check whether a simpler scientific story (e.g. mass ~ flipper + species) is enough before chasing complexity.
Compare this open-ended exercise to the gene notebook, where we know which features are truly causal.

--- title: "Palmer Penguins — predicting body mass" author: "Aparna Pandey and Stephan Peischl" format: html: toc: true code-tools: true engine: knitr --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) library(palmerpenguins) library(dplyr) library(ggplot2) library(GGally) library(glmnet) library(tidymodels) library(tidyr) theme_set(theme_classic()) ``` # Overview This notebook uses **[Palmer Penguins](../data/cards/palmer-penguins.qmd)** as a **continuous outcome** example: we predict **`body_mass_g`** from bill and flipper measurements (and simple factors). It complements the **synthetic gene** linear / LASSO notebook (`linear-regression-lasso.Rmd`), where we know the ground truth. Here we work with **real measurements** and the usual messiness (correlation, nonlinear hints, missing values). Concepts for shrinkage sit in [Module 02](../modules/module-02-regularization.qmd); this file is the **long lab** to run in the IDE. ## Load and clean ```{r} data("penguins", package = "palmerpenguins") pg <- penguins |> tidyr::drop_na() nrow(pg) ``` ## Exploratory plots ```{r fig.width=8, fig.height=4.5} pg_num <- pg |> select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g) cor_m <- cor(pg_num) cor_df <- as.data.frame(as.table(cor_m)) |> rename(v1 = Var1, v2 = Var2, r = Freq) ggplot(cor_df, aes(v1, v2, fill = r)) + geom_tile(color = "white") + scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick", midpoint = 0, limits = c(-1, 1)) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + coord_fixed() + labs(title = "Correlation among numeric penguin measurements") ``` ```{r fig.width=8, fig.height=5.5} GGally::ggpairs( pg_num, upper = list(continuous = wrap("cor", size = 2.2)), aes(color = pg$species, alpha = 0.25) ) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ``` ## Train / test split and matrix of predictors We model `body_mass_g` from morphometrics plus `species`, `island`, `sex`, and `year` (as numeric). ```{r} pred_df <- pg |> mutate(year = as.numeric(year)) |> select(body_mass_g, bill_length_mm, bill_depth_mm, flipper_length_mm, species, island, sex, year) y <- pred_df$body_mass_g X <- model.matrix(body_mass_g ~ ., data = pred_df)[, -1] # drop intercept column; dummy coding set.seed(42) split <- initial_split(pred_df, prop = 0.8) train_df <- training(split) test_df <- testing(split) rm(split) rec_scale <- recipe(body_mass_g ~ ., data = train_df) |> step_normalize(all_numeric_predictors()) prep_fit <- prep(rec_scale, train_df) train_b <- bake(prep_fit, train_df) test_b <- bake(prep_fit, test_df) y_train <- train_b$body_mass_g y_test <- test_b$body_mass_g X_train_s <- model.matrix(body_mass_g ~ ., data = train_b)[, -1] X_test_s <- model.matrix(body_mass_g ~ ., data = test_b)[, -1] ``` ## Ridge and LASSO with cross-validation ```{r fig.width=7, fig.height=4} cv_ridge <- cv.glmnet(X_train_s, y_train, alpha = 0) cv_lasso <- cv.glmnet(X_train_s, y_train, alpha = 1) par(mfrow = c(1, 2)) plot(cv_ridge, main = "Ridge CV") plot(cv_lasso, main = "LASSO CV") ``` ```{r} best_lam_ridge <- cv_ridge$lambda.min best_lam_lasso <- cv_lasso$lambda.min fit_ridge <- glmnet(X_train_s, y_train, alpha = 0, lambda = best_lam_ridge) fit_lasso <- glmnet(X_train_s, y_train, alpha = 1, lambda = best_lam_lasso) pred_ridge <- as.numeric(predict(fit_ridge, newx = X_test_s)) pred_lasso <- as.numeric(predict(fit_lasso, newx = X_test_s)) reg_metrics <- metric_set(rmse, rsq, mae) tibble(truth = y_test, .pred = pred_ridge) |> reg_metrics(truth, .pred) tibble(truth = y_test, .pred = pred_lasso) |> reg_metrics(truth, .pred) ``` ## Coefficient path (LASSO) ```{r fig.width=7, fig.height=5} fit_path <- glmnet(X_train_s, y_train, alpha = 1) plot(fit_path, xvar = "lambda", label = TRUE, main = "LASSO coefficient paths (scaled predictors)") ``` ## Takeaways - Morphometrics and **species** explain much of the variation in mass — check whether a simpler scientific story (e.g. mass ~ flipper + species) is enough before chasing complexity. - Compare this open-ended exercise to the **gene** notebook, where we know which features are truly causal.