# Fix the RNG seed up front so any random steps that follow are reproducible.
set.seed(7)
# Load the microbiome data through a helper that is not defined in this view.
mic <- load_microbiome()
# OTU column names as returned by the helper; later passed to all_of(), so
# presumably a character vector of column names -- TODO confirm.
otu_cols <- mic_otu_cols(mic)
mic_model <- mic |> select(Label, all_of(otu_cols))
Solution — Day 2 microbiome (tidymodels pipeline)
Tasks 2.1–2.5 on the lab exercises page. Outcome: Label (Early vs Late).
2.1 Recipe
# Preprocessing recipe: Label is the outcome, every other column of mic_model
# is a predictor.
rec <- recipe(Label ~ ., data = mic_model) |>
  # log(1 + x) on the OTU columns (defined at zero). `!!` inlines the column
  # names at recipe-definition time so the step does not depend on a global
  # `otu_cols` still existing when the recipe is prepped elsewhere.
  step_mutate(across(all_of(!!otu_cols), log1p)) |>
  # Drop predictors with a single unique value in the training data.
  step_zv(all_predictors()) |>
  # Centre and scale the remaining numeric predictors.
  step_normalize(all_numeric_predictors())
rec
2.2 Tree spec
# Classification tree fitted with rpart: depth capped at 4, and a node needs
# at least 10 observations before it may be split further.
tree_spec <- decision_tree(
  mode = "classification",
  engine = "rpart",
  tree_depth = 4,
  min_n = 10
)
tree_spec
Decision Tree Model Specification (classification)
Main Arguments:
tree_depth = 4
min_n = 10
Computational engine: rpart
2.3 Train / test — tree
# 75/25 train/test split, stratified on the outcome.
split <- initial_split(mic_model, prop = 0.75, strata = Label)

# Bundle the preprocessing recipe with the tree specification.
wf_tree <- workflow() |>
  add_recipe(rec) |>
  add_model(tree_spec)

# Metrics reported for every model in this exercise.
metrics_cls <- metric_set(accuracy, roc_auc)

# Re-seed immediately before fitting, then fit on the training portion and
# score on the held-out portion of `split`.
set.seed(7)
fit_tree <- last_fit(wf_tree, split, metrics = metrics_cls)
collect_metrics(fit_tree)
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.931 pre0_mod0_post0
2 roc_auc binary 0.952 pre0_mod0_post0
2.4 Logistic workflow
# Unpenalised logistic regression fitted with stats::glm.
# NOTE(review): the test metrics reported for this model are below chance;
# presumably the glm fit is unstable with this many OTU predictors -- confirm,
# and consider a penalised engine if that is the cause.
log_spec <- logistic_reg() |>
  set_engine("glm") |>
  set_mode("classification")

# Same recipe as the tree workflow so the two models are comparable.
wf_log <- workflow() |>
  add_recipe(rec) |>
  add_model(log_spec)

# Re-seed immediately before fitting, then fit on the training portion and
# score on the held-out portion of the same `split` used for the tree.
set.seed(7)
fit_log <- last_fit(wf_log, split, metrics = metrics_cls)
collect_metrics(fit_log)
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.431 pre0_mod0_post0
2 roc_auc binary 0.441 pre0_mod0_post0
2.5 Compare on test set
# Stack the test-set metrics of both fits into one long table; the list names
# become the `model` label via `.id`.
cmp <- list(
  "Decision tree (rpart)" = collect_metrics(fit_tree),
  "Logistic regression" = collect_metrics(fit_log)
) |>
  bind_rows(.id = "model") |>
  select(model, .metric, .estimate)
knitr::kable(cmp, digits = 3)
| model | .metric | .estimate |
|---|---|---|
| Decision tree (rpart) | accuracy | 0.931 |
| Decision tree (rpart) | roc_auc | 0.952 |
| Logistic regression | accuracy | 0.431 |
| Logistic regression | roc_auc | 0.441 |
# Bar chart of held-out accuracy per model, bars ordered by accuracy.
ggplot(
  data = filter(cmp, .metric == "accuracy"),
  mapping = aes(
    x = reorder(model, .estimate),
    y = .estimate,
    fill = model
  )
) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Held-out test accuracy (75/25 split)",
    x = NULL,
    y = "Accuracy"
  )
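Leakage reminder: with a random split, rows from the same Individual can end up in both the training and the test set, so the test metrics above may be optimistic — discuss in class. Research workflows should use group-aware resampling instead, e.g. group_vfold_cv(group = Individual).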
R version 4.4.3 (2025-02-28)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: UTC
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] yardstick_1.4.0 workflowsets_1.1.1 workflows_1.3.0 tune_2.1.0
[5] tidyr_1.3.2 tailor_0.1.0 rsample_1.3.2 recipes_1.3.3
[9] purrr_1.2.2 parsnip_1.6.0 modeldata_1.5.1 infer_1.1.0
[13] ggplot2_4.0.3 dplyr_1.2.1 dials_1.4.3 scales_1.4.0
[17] broom_1.0.13 tidymodels_1.5.0
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 timeDate_4052.112 farver_2.1.2
[4] S7_0.2.2 fastmap_1.2.0 digest_0.6.39
[7] rpart_4.1.24 timechange_0.4.0 lifecycle_1.0.5
[10] survival_3.8-3 magrittr_2.0.5 compiler_4.4.3
[13] rlang_1.2.0 tools_4.4.3 utf8_1.2.6
[16] yaml_2.3.12 data.table_1.18.4 knitr_1.51
[19] labeling_0.4.3 curl_7.1.0 bit_4.6.0
[22] DiceDesign_1.10 RColorBrewer_1.1-3 withr_3.0.2
[25] nnet_7.3-20 grid_4.4.3 sparsevctrs_0.3.6
[28] future_1.70.0 globals_0.19.1 MASS_7.3-64
[31] cli_3.6.6 crayon_1.5.3 rmarkdown_2.31
[34] generics_0.1.4 otel_0.2.0 rstudioapi_0.18.0
[37] future.apply_1.20.2 tzdb_0.5.0 splines_4.4.3
[40] parallel_4.4.3 vctrs_0.7.3 hardhat_1.4.3
[43] Matrix_1.7-2 jsonlite_2.0.0 hms_1.1.4
[46] bit64_4.8.2 listenv_0.10.1 gower_1.0.2
[49] glue_1.8.1 parallelly_1.47.0 codetools_0.2-20
[52] lubridate_1.9.5 gtable_0.3.6 tibble_3.3.1
[55] pillar_1.11.1 furrr_0.4.0 htmltools_0.5.9
[58] ipred_0.9-15 lava_1.9.1 R6_2.6.1
[61] vroom_1.7.1 evaluate_1.0.5 lattice_0.22-6
[64] readr_2.2.0 backports_1.5.1 class_7.3-23
[67] Rcpp_1.1.1-1.1 prodlim_2026.03.11 xfun_0.58
[70] pkgconfig_2.0.3