Solution — Day 2 microbiome (tidymodels pipeline)

Tasks 2.1–2.5 on the lab exercises page. Outcome: Label (Early vs Late).

2.1 Recipe

# Fix the RNG state before any random step in this document.
set.seed(7)

# Read the data and keep only the outcome (Label) plus the OTU columns.
# NOTE(review): load_microbiome() and mic_otu_cols() are defined elsewhere;
# otu_cols is assumed to be a character vector of column names -- confirm.
mic <- load_microbiome()
otu_cols <- mic_otu_cols(mic)
mic_model <- mic |> select(Label, all_of(otu_cols))

# Preprocessing recipe:
#   1. log1p-transform every OTU column,
#   2. drop zero-variance predictors,
#   3. centre and scale the numeric predictors.
# `!!otu_cols` injects the column names into the stored expression, so the
# recipe does not have to find a global `otu_cols` when it is prepped later
# (inside last_fit(), in a fresh session, or on a parallel worker).
rec <- recipe(Label ~ ., data = mic_model) |>
  step_mutate(across(all_of(!!otu_cols), log1p)) |>
  step_zv(all_predictors()) |>
  step_normalize(all_numeric_predictors())
rec

2.2 Tree spec

# Classification tree on the rpart engine: depth capped at 4, and a node
# needs at least 10 rows before it may be split further.
tree_spec <- decision_tree(
  mode = "classification",
  engine = "rpart",
  tree_depth = 4,
  min_n = 10
)
tree_spec
Decision Tree Model Specification (classification)

Main Arguments:
  tree_depth = 4
  min_n = 10

Computational engine: rpart 

2.3 Train / test — tree

# Stratified 75/25 train/test split on the outcome.
# Seed right here: initial_split() is the random step, and seeding next to it
# keeps the partition independent of how many random draws earlier code made.
# NOTE(review): if anything upstream consumed random numbers, the printed
# metrics may shift slightly after this change -- re-render to confirm.
set.seed(7)
split <- initial_split(mic_model, prop = 0.75, strata = Label)

# Recipe + tree bundled into one workflow; both metrics are computed on the
# held-out portion of the split.
wf_tree <- workflow() |>
  add_recipe(rec) |>
  add_model(tree_spec)
metrics_cls <- metric_set(accuracy, roc_auc)

# Fit on the training rows and evaluate once on the test rows.
set.seed(7)
fit_tree <- last_fit(wf_tree, split, metrics = metrics_cls)
collect_metrics(fit_tree)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config        
  <chr>    <chr>          <dbl> <chr>          
1 accuracy binary         0.931 pre0_mod0_post0
2 roc_auc  binary         0.952 pre0_mod0_post0

2.4 Logistic workflow

# Logistic regression (glm engine), run through the same recipe, split and
# metric set as the tree so the two models are directly comparable.
# NOTE(review): the rendered test metrics for this model are below 0.5. An
# unpenalised glm on many OTU predictors may be rank-deficient or separated --
# check the fit for warnings before interpreting these numbers.
log_spec <- logistic_reg(mode = "classification", engine = "glm")
wf_log <- workflow(preprocessor = rec, spec = log_spec)

set.seed(7)
fit_log <- last_fit(wf_log, split, metrics = metrics_cls)
collect_metrics(fit_log)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config        
  <chr>    <chr>          <dbl> <chr>          
1 accuracy binary         0.431 pre0_mod0_post0
2 roc_auc  binary         0.441 pre0_mod0_post0

2.5 Compare on test set

# Stack the test-set metrics of both models; the list names become the
# `model` column, then only the columns needed for the table are kept.
cmp <- list(
  "Decision tree (rpart)" = collect_metrics(fit_tree),
  "Logistic regression" = collect_metrics(fit_log)
) |>
  bind_rows(.id = "model") |>
  select(model, .metric, .estimate)

# Render the comparison as a table rounded to three decimals.
cmp |> knitr::kable(digits = 3)
model .metric .estimate
Decision tree (rpart) accuracy 0.931
Decision tree (rpart) roc_auc 0.952
Logistic regression accuracy 0.431
Logistic regression roc_auc 0.441
# Bar chart of held-out accuracy, one bar per model, ordered by accuracy.
ggplot(
  data = filter(cmp, .metric == "accuracy"),
  mapping = aes(x = reorder(model, .estimate), y = .estimate, fill = model)
) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Held-out test accuracy (75/25 split)",
    x = NULL,
    y = "Accuracy"
  )

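Leakage reminder: with a random split, rows from the same Individual can land in both the training and the test set, so the test metrics above are likely optimistic — discuss in class. Research workflows keep each Individual's rows together, e.g. group_initial_split(group = Individual) for a single split or group_vfold_cv(group = Individual) for resampling.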

R version 4.4.3 (2025-02-28)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] yardstick_1.4.0    workflowsets_1.1.1 workflows_1.3.0    tune_2.1.0        
 [5] tidyr_1.3.2        tailor_0.1.0       rsample_1.3.2      recipes_1.3.3     
 [9] purrr_1.2.2        parsnip_1.6.0      modeldata_1.5.1    infer_1.1.0       
[13] ggplot2_4.0.3      dplyr_1.2.1        dials_1.4.3        scales_1.4.0      
[17] broom_1.0.13       tidymodels_1.5.0  

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1    timeDate_4052.112   farver_2.1.2       
 [4] S7_0.2.2            fastmap_1.2.0       digest_0.6.39      
 [7] rpart_4.1.24        timechange_0.4.0    lifecycle_1.0.5    
[10] survival_3.8-3      magrittr_2.0.5      compiler_4.4.3     
[13] rlang_1.2.0         tools_4.4.3         utf8_1.2.6         
[16] yaml_2.3.12         data.table_1.18.4   knitr_1.51         
[19] labeling_0.4.3      curl_7.1.0          bit_4.6.0          
[22] DiceDesign_1.10     RColorBrewer_1.1-3  withr_3.0.2        
[25] nnet_7.3-20         grid_4.4.3          sparsevctrs_0.3.6  
[28] future_1.70.0       globals_0.19.1      MASS_7.3-64        
[31] cli_3.6.6           crayon_1.5.3        rmarkdown_2.31     
[34] generics_0.1.4      otel_0.2.0          rstudioapi_0.18.0  
[37] future.apply_1.20.2 tzdb_0.5.0          splines_4.4.3      
[40] parallel_4.4.3      vctrs_0.7.3         hardhat_1.4.3      
[43] Matrix_1.7-2        jsonlite_2.0.0      hms_1.1.4          
[46] bit64_4.8.2         listenv_0.10.1      gower_1.0.2        
[49] glue_1.8.1          parallelly_1.47.0   codetools_0.2-20   
[52] lubridate_1.9.5     gtable_0.3.6        tibble_3.3.1       
[55] pillar_1.11.1       furrr_0.4.0         htmltools_0.5.9    
[58] ipred_0.9-15        lava_1.9.1          R6_2.6.1           
[61] vroom_1.7.1         evaluate_1.0.5      lattice_0.22-6     
[64] readr_2.2.0         backports_1.5.1     class_7.3-23       
[67] Rcpp_1.1.1-1.1      prodlim_2026.03.11  xfun_0.58          
[70] pkgconfig_2.0.3