mic <- load_microbiome(prev = 0.10)
otu_cols <- mic_otu_cols(mic)
dim(mic)[1] 282 349
table(mic$Label)
Early Late
177 105
table(mic$Sex, useNA = "ifany")
F M
142 140
length(unique(mic$Individual))[1] 13
Tasks 1.1–1.8 on the lab exercises page. Classical R only (no tidymodels).
We have far more OTUs than stable degrees of freedom for a plain glm on every column. PCA reduces the table to a few orthogonal scores that capture most of the variance; tasks 1.4–1.5 fit logistic regression in that low-dimensional space.
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.36859050 0.66196186 -5.088798 3.603393e-07
PC1 0.05709940 0.03662666 1.558957 1.190065e-01
PC2 -1.09594304 0.17218274 -6.364999 1.952905e-10
PC3 -0.42286518 0.08410904 -5.027583 4.967002e-07
PC4 -0.05245695 0.07991552 -0.656405 5.115636e-01
PC5 -0.07698606 0.12151958 -0.633528 5.263889e-01
$forward
Label ~ PC2 + PC3 + PC1
$backward
Label ~ PC1 + PC2 + PC3
Lasso (below) works on all OTUs at once: the L1 penalty handles p > n by driving most coefficients to zero, so a separate PCA step is not required.
345 x 1 sparse Matrix of class "dgCMatrix"
lambda.min
(Intercept) -1.859189393
seq_2 .
seq_9 .
seq_8 .
seq_7 .
seq_13 .
seq_68 .
seq_4 .
seq_17 .
seq_5 .
seq_3 -0.479731878
seq_1 .
seq_10 .
seq_12 .
seq_43 .
seq_30 .
seq_16 .
seq_29 .
seq_79 .
seq_46 .
seq_40 -0.011516453
seq_91 .
seq_112 .
seq_20 .
seq_27 .
seq_102 .
seq_66 .
seq_38 .
seq_167 .
seq_228 .
seq_97 .
seq_49 .
seq_116 .
seq_76 .
seq_129 -0.169755074
seq_34 .
seq_323 .
seq_125 .
seq_190 .
seq_15 .
seq_71 .
seq_21 .
seq_100 -0.300069786
seq_122 .
seq_206 .
seq_271 .
seq_52 .
seq_41 .
seq_93 .
seq_32 .
seq_82 .
seq_47 .
seq_165 .
seq_333 .
seq_94 .
seq_354 .
seq_28 .
seq_26 .
seq_87 .
seq_101 .
seq_67 .
seq_279 .
seq_36 0.239252535
seq_291 .
seq_23 .
seq_24 .
seq_138 .
seq_92 .
seq_37 .
seq_301 .
seq_141 .
seq_22 .
seq_75 .
seq_210 .
seq_143 .
seq_25 .
seq_86 .
seq_128 .
seq_31 .
seq_57 .
seq_238 .
seq_6 .
seq_58 .
seq_195 .
seq_81 .
seq_135 .
seq_80 -0.103048881
seq_193 .
seq_65 .
seq_142 .
seq_207 .
seq_269 .
seq_289 .
seq_153 .
seq_104 .
seq_175 .
seq_85 .
seq_59 .
seq_133 .
seq_53 .
seq_99 .
seq_197 .
seq_209 .
seq_146 .
seq_157 .
seq_250 .
seq_14 .
seq_83 .
seq_113 .
seq_55 .
seq_78 -0.835945274
seq_61 .
seq_107 .
seq_202 .
seq_121 .
seq_147 .
seq_223 .
seq_62 .
seq_124 .
seq_151 .
seq_188 .
seq_11 .
seq_194 .
seq_18 .
seq_152 -0.275212663
seq_72 .
seq_110 .
seq_158 .
seq_162 .
seq_216 .
seq_95 .
seq_70 .
seq_176 .
seq_145 .
seq_56 -0.140895041
seq_308 .
seq_200 .
seq_109 0.406964001
seq_48 .
seq_139 .
seq_115 .
seq_232 .
seq_205 .
seq_334 .
seq_88 .
seq_132 .
seq_329 .
seq_172 .
seq_234 .
seq_84 .
seq_35 .
seq_69 .
seq_73 .
seq_189 .
seq_218 .
seq_196 .
seq_324 .
seq_371 .
seq_161 .
seq_166 .
seq_77 .
seq_114 .
seq_89 -1.198394839
seq_278 .
seq_90 .
seq_316 .
seq_117 .
seq_247 .
seq_119 .
seq_245 .
seq_164 .
seq_222 -0.124041803
seq_311 .
seq_64 .
seq_220 .
seq_244 .
seq_257 .
seq_103 .
seq_118 .
seq_134 -0.063208433
seq_156 .
seq_253 .
seq_277 .
seq_288 0.341013646
seq_179 .
seq_44 .
seq_331 .
seq_160 .
seq_181 .
seq_42 .
seq_299 .
seq_261 .
seq_292 .
seq_111 .
seq_241 .
seq_290 .
seq_54 .
seq_170 .
seq_131 0.292964889
seq_213 .
seq_96 .
seq_63 .
seq_106 .
seq_144 .
seq_33 .
seq_45 .
seq_126 .
seq_50 .
seq_177 .
seq_282 .
seq_154 .
seq_300 .
seq_108 .
seq_259 .
seq_341 .
seq_169 .
seq_178 .
seq_174 .
seq_140 .
seq_74 0.524607171
seq_386 .
seq_258 .
seq_276 .
seq_60 .
seq_155 .
seq_326 .
seq_242 .
seq_235 .
seq_298 .
seq_186 .
seq_233 .
seq_272 .
seq_251 .
seq_159 .
seq_191 .
seq_350 .
seq_310 .
seq_315 .
seq_136 .
seq_346 .
seq_359 .
seq_274 .
seq_317 .
seq_127 .
seq_287 .
seq_266 .
seq_168 .
seq_295 .
seq_130 .
seq_208 .
seq_254 .
seq_98 .
seq_318 .
seq_285 .
seq_150 .
seq_123 .
seq_224 .
seq_185 .
seq_356 .
seq_231 .
seq_148 .
seq_192 .
seq_173 .
seq_227 .
seq_201 .
seq_328 .
seq_183 .
seq_237 .
seq_325 .
seq_149 .
seq_296 .
seq_352 .
seq_255 .
seq_322 .
seq_204 .
seq_211 .
seq_361 .
seq_286 .
seq_319 .
seq_320 .
seq_198 .
seq_275 .
seq_221 .
seq_248 .
seq_307 .
seq_360 .
seq_270 .
seq_120 .
seq_268 0.007962963
seq_344 .
seq_294 .
seq_327 .
seq_283 .
seq_260 .
seq_313 .
seq_304 .
seq_199 .
seq_305 .
seq_256 .
seq_39 .
seq_345 .
seq_281 .
seq_339 .
seq_330 .
seq_297 .
seq_230 .
seq_306 .
seq_364 .
seq_365 .
seq_236 .
seq_332 .
seq_357 .
seq_309 .
seq_19 1.307048831
seq_262 .
seq_312 .
seq_180 .
seq_214 -0.108431328
seq_267 .
seq_51 .
seq_215 0.106330607
seq_137 2.303512736
seq_249 0.493518560
seq_335 .
seq_343 .
seq_355 -0.106934041
seq_340 .
seq_321 .
seq_353 .
seq_263 .
seq_366 .
seq_302 .
seq_348 .
seq_240 0.572247609
seq_105 .
seq_358 0.214128714
seq_314 .
seq_280 .
seq_229 .
seq_293 .
seq_243 .
seq_264 0.418629259
seq_273 .
seq_265 .
seq_370 .
pred_glm <- predict(m_glm, type = "response")
pred_glm_cls <- factor(ifelse(pred_glm > 0.5, "Late", "Early"), levels = levels(mic$Label))
acc_glm <- mean(pred_glm_cls == mic$Label)
pred_lasso <- predict(cv_fit, newx = x_lasso, s = "lambda.min", type = "response")
pred_lasso_cls <- factor(ifelse(pred_lasso > 0.5, "Late", "Early"), levels = levels(mic$Label))
acc_lasso <- mean(pred_lasso_cls == mic$Label)
n_coef_lasso <- sum(coef(cv_fit, s = "lambda.min") != 0) - 1
c(accuracy_glm_pcs = acc_glm, accuracy_lasso = acc_lasso, n_nonzero_lasso = n_coef_lasso)accuracy_glm_pcs accuracy_lasso n_nonzero_lasso
0.9397163 1.0000000 26.0000000
mic_sex <- mic |> filter(Sex %in% c("F", "M"))
otu_sex <- log1p(as.matrix(mic_sex[, otu_cols]))
zv_sex <- apply(otu_sex, 2, sd) > 1e-10
otu_sex <- otu_sex[, zv_sex, drop = FALSE]
pc_sex <- prcomp(otu_sex, center = TRUE, scale. = TRUE)
pc_s <- as.data.frame(pc_sex$x[, seq_len(n_pc), drop = FALSE])
colnames(pc_s) <- paste0("PC", seq_len(n_pc))
dat_sex <- cbind(Sex = factor(mic_sex$Sex, levels = c("F", "M")), pc_s)
m_sex <- glm(Sex ~ ., data = dat_sex, family = binomial)
m_sex_null <- glm(Sex ~ 1, data = dat_sex, family = binomial)
m_sex_fwd <- stepAIC(m_sex_null, scope = list(lower = m_sex_null, upper = m_sex), direction = "forward", trace = 0)
x_sex <- scale(otu_sex)
y_sex <- ifelse(mic_sex$Sex == "M", 1, 0)
set.seed(7)
cv_sex <- cv.glmnet(x_sex, y_sex, family = "binomial", alpha = 1, nfolds = 5)pred_sex_glm <- factor(ifelse(predict(m_sex, type = "response") > 0.5, "M", "F"), levels = c("F", "M"))
acc_sex_glm <- mean(pred_sex_glm == mic_sex$Sex)
pred_sex_lasso <- ifelse(predict(cv_sex, newx = x_sex, s = "lambda.min", type = "response") > 0.5, "M", "F")
acc_sex_lasso <- mean(factor(pred_sex_lasso, levels = c("F", "M")) == mic_sex$Sex)
c(accuracy_glm_pcs = acc_sex_glm, accuracy_lasso = acc_sex_lasso)accuracy_glm_pcs accuracy_lasso
0.6312057 1.0000000
acc_fwd <- mean(
factor(ifelse(predict(m_fwd, dat_label, type = "response") > 0.5, "Late", "Early"), levels = levels(mic$Label)) == mic$Label
)
acc_sex_fwd <- mean(
factor(ifelse(predict(m_sex_fwd, dat_sex, type = "response") > 0.5, "Male", "Female"), levels = c("Female", "Male")) == mic_sex$Sex
)
summary_tbl <- data.frame(
outcome = c("Label", "Label", "Label", "Sex", "Sex", "Sex"),
method = c("GLM (5 PCs)", "stepAIC forward", "Lasso (all OTUs)", "GLM (5 PCs)", "stepAIC forward", "Lasso (all OTUs)"),
n_predictors = c(
n_pc,
length(coef(m_fwd)) - 1,
n_coef_lasso,
n_pc,
length(coef(m_sex_fwd)) - 1,
sum(coef(cv_sex, s = "lambda.min") != 0) - 1
),
train_accuracy = c(acc_glm, acc_fwd, acc_lasso, acc_sex_glm, acc_sex_fwd, acc_sex_lasso),
row.names = NULL
)
knitr::kable(summary_tbl, digits = 3)| outcome | method | n_predictors | train_accuracy |
|---|---|---|---|
| Label | GLM (5 PCs) | 5 | 0.940 |
| Label | stepAIC forward | 3 | 0.933 |
| Label | Lasso (all OTUs) | 26 | 1.000 |
| Sex | GLM (5 PCs) | 5 | 0.631 |
| Sex | stepAIC forward | 3 | 0.000 |
| Sex | Lasso (all OTUs) | 65 | 1.000 |
Note: In-sample accuracy only — same data used to fit and score (matches Monday gene demos).
R version 4.4.3 (2025-02-28)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: UTC
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] glmnet_5.0 Matrix_1.7-2 MASS_7.3-64 ggplot2_4.0.3 dplyr_1.2.1
loaded via a namespace (and not attached):
[1] bit_4.6.0 gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3
[5] compiler_4.4.3 tidyselect_1.2.1 Rcpp_1.1.1-1.1 parallel_4.4.3
[9] splines_4.4.3 scales_1.4.0 yaml_2.3.12 fastmap_1.2.0
[13] lattice_0.22-6 readr_2.2.0 R6_2.6.1 labeling_0.4.3
[17] generics_0.1.4 curl_7.1.0 shape_1.4.6.1 knitr_1.51
[21] iterators_1.0.14 tibble_3.3.1 tzdb_0.5.0 pillar_1.11.1
[25] RColorBrewer_1.1-3 rlang_1.2.0 xfun_0.58 S7_0.2.2
[29] bit64_4.8.2 otel_0.2.0 cli_3.6.6 withr_3.0.2
[33] magrittr_2.0.5 digest_0.6.39 foreach_1.5.2 grid_4.4.3
[37] vroom_1.7.1 hms_1.1.4 lifecycle_1.0.5 vctrs_0.7.3
[41] evaluate_1.0.5 glue_1.8.1 farver_2.1.2 codetools_0.2-20
[45] survival_3.8-3 rmarkdown_2.31 tools_4.4.3 pkgconfig_2.0.3
[49] htmltools_0.5.9
---
title: "Solution — Day 1 microbiome (classical)"
format:
html:
toc: true
code-tools: true
---
Tasks **1.1–1.8** on the [lab exercises page](../index.qmd). Classical R only (no `tidymodels`).
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(MASS)
library(glmnet)
})
theme_set(theme_minimal())
source("_load_microbiome.R")
```
## 1.1 Load and filter
```{r load}
mic <- load_microbiome(prev = 0.10)
otu_cols <- mic_otu_cols(mic)
dim(mic)
table(mic$Label)
table(mic$Sex, useNA = "ifany")
length(unique(mic$Individual))
```
## 1.2 log1p transform
```{r log1p}
otu_mat <- as.matrix(mic[, otu_cols])
log_otu <- log1p(otu_mat)
mic$lib_size <- rowSums(otu_mat)
```
```{r lib-size, fig.width=7, fig.height=4}
ggplot(mic, aes(lib_size, fill = Label)) +
geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
scale_x_log10() +
labs(title = "Library size after prevalence filter", x = "Total counts")
```
## 1.3 PCA
We have far more OTUs than stable degrees of freedom for a plain `glm` on every column. **PCA** reduces the table to a few orthogonal scores that capture most of the variance; tasks **1.4–1.5** fit logistic regression in that low-dimensional space.
```{r pca, fig.width=7, fig.height=5}
pc <- prcomp(log_otu, center = TRUE, scale. = TRUE)
pc_df <- as.data.frame(pc$x[, 1:2]) |>
mutate(sample_id = mic$sample_id, Label = mic$Label, Sex = mic$Sex)
ggplot(pc_df, aes(PC1, PC2, color = Label, shape = Sex)) +
geom_point(alpha = 0.7) +
labs(title = "PCA of log1p OTU abundances")
```
## 1.4–1.6 Label (Early vs Late)
```{r label-pcs}
n_pc <- 5
pc_scores <- as.data.frame(pc$x[, seq_len(n_pc), drop = FALSE])
colnames(pc_scores) <- paste0("PC", seq_len(n_pc))
dat_label <- cbind(Label = mic$Label, pc_scores)
m_glm <- glm(Label ~ ., data = dat_label, family = binomial)
summary(m_glm)$coefficients
```
```{r label-aic}
m_null <- glm(Label ~ 1, data = dat_label, family = binomial)
m_fwd <- stepAIC(m_null, scope = list(lower = m_null, upper = m_glm), direction = "forward", trace = 0)
m_bwd <- stepAIC(m_glm, direction = "backward", trace = 0)
list(forward = formula(m_fwd), backward = formula(m_bwd))
```
**Lasso** (below) works on **all** OTUs at once: the L1 penalty handles **p > n** by driving most coefficients to zero, so a separate PCA step is not required.
```{r label-lasso}
x_lasso <- scale(log_otu)
y_lasso <- ifelse(mic$Label == "Late", 1, 0)
set.seed(7)
cv_fit <- cv.glmnet(x_lasso, y_lasso, family = "binomial", alpha = 1, nfolds = 5)
coef(cv_fit, s = "lambda.min")
```
```{r label-metrics}
pred_glm <- predict(m_glm, type = "response")
pred_glm_cls <- factor(ifelse(pred_glm > 0.5, "Late", "Early"), levels = levels(mic$Label))
acc_glm <- mean(pred_glm_cls == mic$Label)
pred_lasso <- predict(cv_fit, newx = x_lasso, s = "lambda.min", type = "response")
pred_lasso_cls <- factor(ifelse(pred_lasso > 0.5, "Late", "Early"), levels = levels(mic$Label))
acc_lasso <- mean(pred_lasso_cls == mic$Label)
n_coef_lasso <- sum(coef(cv_fit, s = "lambda.min") != 0) - 1
c(accuracy_glm_pcs = acc_glm, accuracy_lasso = acc_lasso, n_nonzero_lasso = n_coef_lasso)
```
## 1.7 Sex (Female vs Male)
```{r sex-filter}
mic_sex <- mic |> filter(Sex %in% c("F", "M"))
otu_sex <- log1p(as.matrix(mic_sex[, otu_cols]))
zv_sex <- apply(otu_sex, 2, sd) > 1e-10
otu_sex <- otu_sex[, zv_sex, drop = FALSE]
pc_sex <- prcomp(otu_sex, center = TRUE, scale. = TRUE)
pc_s <- as.data.frame(pc_sex$x[, seq_len(n_pc), drop = FALSE])
colnames(pc_s) <- paste0("PC", seq_len(n_pc))
dat_sex <- cbind(Sex = factor(mic_sex$Sex, levels = c("F", "M")), pc_s)
m_sex <- glm(Sex ~ ., data = dat_sex, family = binomial)
m_sex_null <- glm(Sex ~ 1, data = dat_sex, family = binomial)
m_sex_fwd <- stepAIC(m_sex_null, scope = list(lower = m_sex_null, upper = m_sex), direction = "forward", trace = 0)
x_sex <- scale(otu_sex)
y_sex <- ifelse(mic_sex$Sex == "M", 1, 0)
set.seed(7)
cv_sex <- cv.glmnet(x_sex, y_sex, family = "binomial", alpha = 1, nfolds = 5)
```
```{r sex-metrics}
pred_sex_glm <- factor(ifelse(predict(m_sex, type = "response") > 0.5, "M", "F"), levels = c("F", "M"))
acc_sex_glm <- mean(pred_sex_glm == mic_sex$Sex)
pred_sex_lasso <- ifelse(predict(cv_sex, newx = x_sex, s = "lambda.min", type = "response") > 0.5, "M", "F")
acc_sex_lasso <- mean(factor(pred_sex_lasso, levels = c("F", "M")) == mic_sex$Sex)
c(accuracy_glm_pcs = acc_sex_glm, accuracy_lasso = acc_sex_lasso)
```
## 1.8 Summary table
```{r summary}
acc_fwd <- mean(
factor(ifelse(predict(m_fwd, dat_label, type = "response") > 0.5, "Late", "Early"), levels = levels(mic$Label)) == mic$Label
)
acc_sex_fwd <- mean(
factor(ifelse(predict(m_sex_fwd, dat_sex, type = "response") > 0.5, "Male", "Female"), levels = c("Female", "Male")) == mic_sex$Sex
)
summary_tbl <- data.frame(
outcome = c("Label", "Label", "Label", "Sex", "Sex", "Sex"),
method = c("GLM (5 PCs)", "stepAIC forward", "Lasso (all OTUs)", "GLM (5 PCs)", "stepAIC forward", "Lasso (all OTUs)"),
n_predictors = c(
n_pc,
length(coef(m_fwd)) - 1,
n_coef_lasso,
n_pc,
length(coef(m_sex_fwd)) - 1,
sum(coef(cv_sex, s = "lambda.min") != 0) - 1
),
train_accuracy = c(acc_glm, acc_fwd, acc_lasso, acc_sex_glm, acc_sex_fwd, acc_sex_lasso),
row.names = NULL
)
knitr::kable(summary_tbl, digits = 3)
```
**Note:** In-sample accuracy only — same data used to fit and score (matches Monday gene demos).
```{r session, echo=FALSE}
sessionInfo()
```