From 115fdd0adf3dbf6da1fb51580455efe996e36be7 Mon Sep 17 00:00:00 2001
From: Florence Bockting <florence.bockting@aalto.fi>
Date: Tue, 30 Jun 2026 12:38:19 +0300
Subject: [PATCH 1/6] feat: use mirai and mori in do_importance_sampling

---
 R/importance_sampling.R | 52 +++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/R/importance_sampling.R b/R/importance_sampling.R
index f8dfd4d3..59562f7e 100644
--- a/R/importance_sampling.R
+++ b/R/importance_sampling.R
@@ -199,27 +199,18 @@ do_importance_sampling <- function(log_ratios, r_eff, cores, method) {
   }
 
   if (cores == 1) {
-    lw_list <- lapply(seq_len(N), function(i)
-      is_fun(log_ratios_i = log_ratios[, i], tail_len_i = tail_len[i]))
+    lw_list <- lapply(seq_len(N), do_is_i, is_fun, log_ratios, tail_len)
   } else {
-    if (!os_is_windows()) {
-      lw_list <- parallel::mclapply(
-        X = seq_len(N),
-        mc.cores = cores,
-        FUN = function(i)
-          is_fun(log_ratios_i = log_ratios[, i], tail_len_i = tail_len[i])
-      )
-    } else {
-      cl <- parallel::makePSOCKcluster(cores)
-      on.exit(parallel::stopCluster(cl))
-      lw_list <-
-        parallel::parLapply(
-          cl = cl,
-          X = seq_len(N),
-          fun = function(i)
-            is_fun(log_ratios_i = log_ratios[, i], tail_len_i = tail_len[i])
-        )
-    }
+    shared_lr <- mori::share(log_ratios)
+    lw_list <- with(
+      mirai::daemons(cores),
+      mirai::mirai_map(
+        seq_len(N),
+        do_is_i,
+        .args = list(is_fun = is_fun, log_ratios = shared_lr,
+        tail_len = tail_len)
+      )[]
+    )
   }
 
   log_weights <- psis_apply(lw_list, "log_weights", fun_val = numeric(S))
@@ -234,3 +225,24 @@ do_importance_sampling <- function(log_ratios, r_eff, cores, method) {
     method = rep(method, length(pareto_k)) # Conform to other attr that exist per obs.
   )
 }
+
+#' Apply an importance sampling method to a single observation
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Worker function mapped over observations (matrix columns) by
+#' [do_importance_sampling()], either serially via [lapply()] or in parallel
+#' via [mirai::mirai_map()].
+#' @param i Integer column index of the observation to process.
+#' @param is_fun The per-observation importance sampling function to apply, one
+#'   of [do_psis_i()], [do_tis_i()], or [do_sis_i()]. It is called with the
+#'   named arguments `log_ratios_i` and `tail_len_i`.
+#' @param log_ratios Matrix of log ratios (`-loglik`). May be a shared-memory
+#'   object created by [mori::share()] to avoid copying to each worker.
+#' @param tail_len Vector of tail lengths for the GPD fit, one per observation.
+#' @return The result of `is_fun` for observation `i` (a list with elements
+#'   such as `log_weights` and `pareto_k`).
+do_is_i <- function(i, is_fun, log_ratios, tail_len) {
+  is_fun(log_ratios_i = log_ratios[, i], tail_len_i = tail_len[i])
+}
\ No newline at end of file

From 4b6bbc3ba1b7c4567f3d3ed4f4aff69b846746af Mon Sep 17 00:00:00 2001
From: Florence Bockting <florence.bockting@aalto.fi>
Date: Tue, 30 Jun 2026 14:33:37 +0300
Subject: [PATCH 2/6] refactor: use mirai/mori for parallelization

---
 .gitignore                     |   4 +
 DESCRIPTION                    |   2 +
 R/effective_sample_sizes.R     | 101 ++++++--------
 R/importance_sampling.R        |  29 ++--
 R/loo.R                        |  67 +++------
 R/loo_model_weights.R          |  27 ++--
 R/loo_moment_matching.R        |  87 ++++++++----
 R/loo_subsample.R              |  23 +--
 R/parallel.R                   | 181 ++++++++++++++++++++++++
 tests/testthat/test_parallel.R | 246 +++++++++++++++++++++++++++++++++
 10 files changed, 603 insertions(+), 164 deletions(-)
 create mode 100644 R/parallel.R
 create mode 100644 tests/testthat/test_parallel.R

diff --git a/.gitignore b/.gitignore
index 9c69942e..f070106a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,7 @@ tests/testthat/Rplots.pdf
 cran-comments.md
 CRAN-RELEASE
 release-prep.R
+
+agent/*
+data/*
+scratch-files/*
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index 29f41847..f673389a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -37,6 +37,8 @@ Depends:
 Imports:
     checkmate,
     matrixStats (>= 0.52),
+    mirai,
+    mori,
     parallel,
     posterior (>= 1.7.0),
     stats
diff --git a/R/effective_sample_sizes.R b/R/effective_sample_sizes.R
index 360a3098..ab565555 100644
--- a/R/effective_sample_sizes.R
+++ b/R/effective_sample_sizes.R
@@ -62,32 +62,29 @@ relative_eff.array <- function(x, ..., cores = getOption("mc.cores", 1)) {
   stopifnot(length(dim(x)) == 3)
   S <- prod(dim(x)[1:2]) # posterior sample size = iter * chains
 
-  if (cores == 1) {
-    n_eff_vec <- apply(x, 3, posterior::ess_mean)
-  } else {
-    if (!os_is_windows()) {
-      n_eff_list <-
-        parallel::mclapply(
-          mc.cores = cores,
-          X = seq_len(dim(x)[3]),
-          FUN = function(i) posterior::ess_mean(x[, , i, drop = TRUE])
-        )
-    } else {
-      cl <- parallel::makePSOCKcluster(cores)
-      on.exit(parallel::stopCluster(cl))
-      n_eff_list <-
-        parallel::parLapply(
-          cl = cl,
-          X = seq_len(dim(x)[3]),
-          fun = function(i) posterior::ess_mean(x[, , i, drop = TRUE])
-        )
-    }
-    n_eff_vec <- unlist(n_eff_list, use.names = FALSE)
-  }
+  # The full draws array is reused across observations, so it is broadcast via
+  # shared memory on a local pool. Each worker reads only its slice `x[, , i]`.
+  n_eff_list <- with_loo_daemons(
+    cores,
+    loo_map(
+      seq_len(dim(x)[3]),
+      relative_eff_i_array,
+      cores = cores,
+      broadcast = list(x = x)
+    )
+  )
+  n_eff_vec <- unlist(n_eff_list, use.names = FALSE)
 
   return(n_eff_vec / S)
 }
 
+#' Worker computing `posterior::ess_mean()` for slice `x[, , i]` of a 3-D array
+#' @noRd
+#' @keywords internal
+relative_eff_i_array <- function(i, x) {
+  posterior::ess_mean(x[, , i, drop = TRUE])
+}
+
 #' @export
 #' @templateVar fn relative_eff
 #' @template function
@@ -104,46 +101,36 @@ relative_eff.function <-
     f_i <- validate_llfun(x) # not really an llfun, should return exp(ll) or exp(-ll)
     N <- dim(data)[1]
 
-    if (cores == 1) {
-      n_eff_list <-
-        lapply(
-          X = seq_len(N),
-          FUN = function(i) {
-            val_i <- f_i(data_i = data[i, , drop = FALSE], draws = draws, ...)
-            relative_eff.default(as.vector(val_i), chain_id = chain_id, cores = 1)
-          }
-        )
-    } else {
-      if (!os_is_windows()) {
-        n_eff_list <-
-          parallel::mclapply(
-            X = seq_len(N),
-            FUN = function(i) {
-              val_i <- f_i(data_i = data[i, , drop = FALSE], draws = draws, ...)
-              relative_eff.default(as.vector(val_i), chain_id = chain_id, cores = 1)
-            },
-            mc.cores = cores
-          )
-      } else {
-        cl <- parallel::makePSOCKcluster(cores)
-        parallel::clusterExport(cl=cl, varlist=c("draws", "chain_id", "data"), envir=environment())
-        on.exit(parallel::stopCluster(cl))
-        n_eff_list <-
-          parallel::parLapply(
-            cl = cl,
-            X = seq_len(N),
-            fun = function(i) {
-              val_i <- f_i(data_i = data[i, , drop = FALSE], draws = draws, ...)
-              relative_eff.default(as.vector(val_i), chain_id = chain_id, cores = 1)
-            }
-          )
-      }
-    }
+    # `data` and `draws` are reused for every observation, so they are broadcast
+    # via shared memory on a local pool and serialized on a remote pool.
+    n_eff_list <- with_loo_daemons(
+      cores,
+      loo_map(
+        seq_len(N),
+        relative_eff_i_function,
+        f_i = f_i,
+        chain_id = chain_id,
+        re_dots = list(...),
+        cores = cores,
+        broadcast = list(data = data, draws = draws)
+      )
+    )
 
     n_eff_vec <- unlist(n_eff_list, use.names = FALSE)
     return(n_eff_vec)
   }
 
+#' Worker computing the relative effective sample size for row `i` of `data`
+#' @noRd
+#' @keywords internal
+relative_eff_i_function <- function(i, f_i, data, draws, chain_id, re_dots) {
+  val_i <- do.call(
+    f_i,
+    c(list(data_i = data[i, , drop = FALSE], draws = draws), re_dots)
+  )
+  relative_eff.default(as.vector(val_i), chain_id = chain_id, cores = 1)
+}
+
 #' @export
 #' @describeIn relative_eff
 #'   If `x` is an object of class `"psis"`, `relative_eff()` simply returns
diff --git a/R/importance_sampling.R b/R/importance_sampling.R
index 59562f7e..59d776c3 100644
--- a/R/importance_sampling.R
+++ b/R/importance_sampling.R
@@ -198,20 +198,23 @@ do_importance_sampling <- function(log_ratios, r_eff, cores, method) {
     stop("Incorrect IS method.")
   }
 
-  if (cores == 1) {
-    lw_list <- lapply(seq_len(N), do_is_i, is_fun, log_ratios, tail_len)
-  } else {
-    shared_lr <- mori::share(log_ratios)
-    lw_list <- with(
-      mirai::daemons(cores),
-      mirai::mirai_map(
-        seq_len(N),
-        do_is_i,
-        .args = list(is_fun = is_fun, log_ratios = shared_lr,
-        tail_len = tail_len)
-      )[]
+  # Each observation needs a different column of `log_ratios`, but the whole
+  # matrix is reused across the map, so it is a broadcast object: `loo_map()`
+  # shares it via shared memory on a local pool (zero-copy column access) and
+  # falls back to serialization on a remote pool. Serial work runs as a plain
+  # lapply(). `with_loo_daemons()` provides a pool when this is the top-level
+  # call (e.g. psis()) and reuses an outer pool when called from loo().
+  lw_list <- with_loo_daemons(
+    cores,
+    loo_map(
+      seq_len(N),
+      do_is_i,
+      is_fun = is_fun,
+      tail_len = tail_len,
+      cores = cores,
+      broadcast = list(log_ratios = log_ratios)
     )
-  }
+  )
 
   log_weights <- psis_apply(lw_list, "log_weights", fun_val = numeric(S))
   pareto_k <- psis_apply(lw_list, "pareto_k")
diff --git a/R/loo.R b/R/loo.R
index 10b1bdc7..6830951e 100644
--- a/R/loo.R
+++ b/R/loo.R
@@ -665,52 +665,23 @@ parallel_importance_sampling_list <- function(N, .loo_i, .llfun,
                                               data, draws, r_eff,
                                               save_psis, cores,
                                               method, ...){
-  if (cores == 1) {
-    psis_list <-
-      lapply(
-        X = seq_len(N),
-        FUN = .loo_i,
-        llfun = .llfun,
-        data = data,
-        draws = draws,
-        r_eff = r_eff,
-        save_psis = save_psis,
-        is_method = method,
-        ...
-      )
-  } else {
-    if (!os_is_windows()) {
-      # On Mac or Linux use mclapply() for multiple cores
-      psis_list <-
-        parallel::mclapply(
-          mc.cores = cores,
-          X = seq_len(N),
-          FUN = .loo_i,
-          llfun = .llfun,
-          data = data,
-          draws = draws,
-          r_eff = r_eff,
-          save_psis = save_psis,
-          is_method = method,
-          ...
-        )
-    } else {
-      # On Windows use makePSOCKcluster() and parLapply() for multiple cores
-      cl <- parallel::makePSOCKcluster(cores)
-      on.exit(parallel::stopCluster(cl))
-      psis_list <-
-        parallel::parLapply(
-          cl = cl,
-          X = seq_len(N),
-          fun = .loo_i,
-          llfun = .llfun,
-          data = data,
-          draws = draws,
-          r_eff = r_eff,
-          save_psis = save_psis,
-          is_method = method,
-          ...
-        )
-    }
-  }
+  # `draws` (and `data`) are reused identically for every observation, so they
+  # are broadcast objects: shared once via shared memory on a local pool
+  # (recovering the copy-on-write benefit fork gave the old mclapply() path)
+  # and serialized on a remote pool. A single cross-platform code path replaces
+  # the previous mclapply()/parLapply() branching.
+  with_loo_daemons(
+    cores,
+    loo_map(
+      seq_len(N),
+      .loo_i,
+      llfun = .llfun,
+      r_eff = r_eff,
+      save_psis = save_psis,
+      is_method = method,
+      ...,
+      cores = cores,
+      broadcast = list(data = data, draws = draws)
+    )
+  )
 }
diff --git a/R/loo_model_weights.R b/R/loo_model_weights.R
index 946dc7c3..3eb8d63c 100644
--- a/R/loo_model_weights.R
+++ b/R/loo_model_weights.R
@@ -188,15 +188,24 @@ loo_model_weights.default <-
       N <- ncol(x[[1]]) # number of data points
       validate_log_lik_list(x)
       validate_r_eff_list(r_eff_list, K, N)
-      lpd_point <- matrix(NA, N, K)
-      elpd_loo <- rep(NA, K)
-      for (k in 1:K) {
-        r_eff_k <- r_eff_list[[k]] # possibly NULL
-        log_likelihood <- x[[k]]
-        loo_object <- loo(log_likelihood, r_eff = r_eff_k, cores = cores)
-        lpd_point[, k] <- loo_object$pointwise[, "elpd_loo"]    #calculate log(p_k (y_i | y_-i))
-        elpd_loo[k] <- loo_object$estimates["elpd_loo", "Estimate"]
-      }
+      # Establish a single daemon pool for all K models so each inner loo()
+      # reuses it instead of spinning a pool up and down K times.
+      loo_objects <- with_loo_daemons(
+        cores,
+        lapply(seq_len(K), function(k) {
+          loo(x[[k]], r_eff = r_eff_list[[k]], cores = cores)
+        })
+      )
+      lpd_point <- vapply(
+        loo_objects,
+        function(o) o$pointwise[, "elpd_loo"], #calculate log(p_k (y_i | y_-i))
+        FUN.VALUE = numeric(N)
+      )
+      elpd_loo <- vapply(
+        loo_objects,
+        function(o) o$estimates["elpd_loo", "Estimate"],
+        FUN.VALUE = numeric(1)
+      )
     } else if (is.psis_loo(x[[1]])) {
       validate_psis_loo_list(x)
       lpd_point <- do.call(cbind, lapply(x, function(obj) obj$pointwise[, "elpd_loo"]))
diff --git a/R/loo_moment_matching.R b/R/loo_moment_matching.R
index 110eff93..c37f9102 100644
--- a/R/loo_moment_matching.R
+++ b/R/loo_moment_matching.R
@@ -111,32 +111,27 @@ loo_moment_match.default <- function(x, loo, post_draws, log_lik_i,
   kfs <- rep(0,N)
   I <- which(ks > k_threshold)
 
-  loo_moment_match_i_fun <- function(i) {
-    loo_moment_match_i(i = i, x = x, log_lik_i = log_lik_i,
-                       unconstrain_pars = unconstrain_pars,
-                       log_prob_upars = log_prob_upars,
-                       log_lik_i_upars = log_lik_i_upars,
-                       max_iters = max_iters, k_threshold = k_threshold,
-                       split = split, cov = cov, N = N, S = S, upars = upars,
-                       orig_log_prob = orig_log_prob, k = ks[i],
-                       is_method = is_method, npars = npars, ...)
-  }
-
-  if (cores == 1) {
-    mm_list <- lapply(X = I, FUN = function(i) loo_moment_match_i_fun(i))
-  }
-  else {
-    if (!os_is_windows()) {
-      mm_list <- parallel::mclapply(X = I, mc.cores = cores,
-                                    FUN = function(i) loo_moment_match_i_fun(i))
-    }
-    else {
-      cl <- parallel::makePSOCKcluster(cores)
-      on.exit(parallel::stopCluster(cl))
-      mm_list <- parallel::parLapply(cl = cl, X = I,
-                                    fun = function(i) loo_moment_match_i_fun(i))
-    }
-  }
+  # The large unconstrained-draws matrix `upars` and the `orig_log_prob` vector
+  # are reused for every high-Pareto-k observation, so they are broadcast via
+  # shared memory on a local pool. The worker is the namespace-level
+  # `loo_moment_match_i_worker()` (rather than a closure over this frame) so the
+  # broadcast objects are not also dragged along inside a captured environment.
+  mm_list <- with_loo_daemons(
+    cores,
+    loo_map(
+      I,
+      loo_moment_match_i_worker,
+      x = x, ks = ks, log_lik_i = log_lik_i,
+      unconstrain_pars = unconstrain_pars,
+      log_prob_upars = log_prob_upars,
+      log_lik_i_upars = log_lik_i_upars,
+      max_iters = max_iters, k_threshold = k_threshold,
+      split = split, cov = cov, N = N, S = S,
+      is_method = is_method, npars = npars, mm_dots = list(...),
+      cores = cores,
+      broadcast = list(upars = upars, orig_log_prob = orig_log_prob)
+    )
+  )
 
   # update results
   for (ii in seq_along(I)) {
@@ -230,6 +225,46 @@ loo_moment_match.default <- function(x, loo, post_draws, log_lik_i,
 #' @param ... Further arguments passed to the custom functions documented above.
 #' @return List with the updated elpd values and diagnostics
 #'
+#' Worker wrapper around [loo_moment_match_i()] for parallel mapping
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' A namespace-level (non-closure) adapter mapped over high-Pareto-k
+#' observation indices by [loo_map()]. Because it does not capture the calling
+#' frame, large broadcast objects (`upars`, `orig_log_prob`) are not duplicated
+#' inside a serialized closure environment. `ks[i]` is passed on as `k`,
+#' `mm_dots` is appended, and every other argument is forwarded unchanged.
+#' NOTE(review): this block directly follows the roxygen comments written for
+#' [loo_moment_match_i()]; roxygen2 may merge them onto this function -- confirm.
+#' @param i Integer observation index.
+#' @param ks Full vector of Pareto k estimates; only `ks[i]` is used.
+#' @param mm_dots A list of additional arguments forwarded to
+#'   [loo_moment_match_i()] (the `...` from [loo_moment_match()]).
+#' @return The result of [loo_moment_match_i()] for observation `i`.
+loo_moment_match_i_worker <- function(i, x, ks, log_lik_i, unconstrain_pars,
+                                      log_prob_upars, log_lik_i_upars,
+                                      max_iters, k_threshold, split, cov,
+                                      N, S, upars, orig_log_prob, is_method,
+                                      npars, mm_dots) {
+  do.call(
+    loo_moment_match_i,
+    c(
+      list(
+        i = i, x = x, log_lik_i = log_lik_i,
+        unconstrain_pars = unconstrain_pars,
+        log_prob_upars = log_prob_upars,
+        log_lik_i_upars = log_lik_i_upars,
+        max_iters = max_iters, k_threshold = k_threshold,
+        split = split, cov = cov, N = N, S = S, upars = upars,
+        orig_log_prob = orig_log_prob, k = ks[i],
+        is_method = is_method, npars = npars
+      ),
+      mm_dots
+    )
+  )
+}
+
 loo_moment_match_i <- function(i,
                                x,
                                log_lik_i,
diff --git a/R/loo_subsample.R b/R/loo_subsample.R
index bcac4b17..5912d993 100644
--- a/R/loo_subsample.R
+++ b/R/loo_subsample.R
@@ -494,17 +494,18 @@ lpd_i <- function(i, llfun, data, draws) {
 #' @noRd
 #' @return a vector of computed log probability densities
 compute_lpds <- function(N, data, draws, llfun, cores) {
-  if (cores == 1) {
-    lpds <- lapply(X = seq_len(N), FUN = lpd_i, llfun, data, draws)
-  } else {
-    if (.Platform$OS.type != "windows") {
-      lpds <- mclapply(X = seq_len(N), mc.cores = cores, FUN = lpd_i, llfun, data, draws)
-    } else {
-      cl <- makePSOCKcluster(cores)
-      on.exit(stopCluster(cl))
-      lpds <- parLapply(cl, X = seq_len(N), fun = lpd_i, llfun, data, draws)
-    }
-  }
+  # `draws` (and `data`) are reused for every observation, so they are shared
+  # once via shared memory on a local pool and serialized on a remote pool.
+  lpds <- with_loo_daemons(
+    cores,
+    loo_map(
+      seq_len(N),
+      lpd_i,
+      llfun = llfun,
+      cores = cores,
+      broadcast = list(data = data, draws = draws)
+    )
+  )
 
   unlist(lpds)
 }
diff --git a/R/parallel.R b/R/parallel.R
new file mode 100644
index 00000000..4c8fbe3c
--- /dev/null
+++ b/R/parallel.R
@@ -0,0 +1,181 @@
+#' Evaluate parallel work with an appropriate mirai daemon pool
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Central entry point used by loo's parallel code paths to ensure a
+#' [mirai::daemons()] pool exists for the duration of a computation. It is
+#' deliberately a good citizen of the user's session:
+#'
+#' * `cores <= 1`: runs `code` serially without touching daemons.
+#' * A daemon pool is already connected, per [loo_has_pool()] (e.g. the user
+#'   called [mirai::daemons()], possibly with remote/HPC daemons): `code`
+#'   runs on the existing pool, which is left untouched.
+#' * Otherwise: a pool of `cores` local daemons is created for the duration of
+#'   `code` and automatically reset afterwards (via the scoped
+#'   `with(mirai::daemons(), ...)` method), so no daemon processes are left
+#'   running once the call returns.
+#'
+#' This keeps a single pool alive across the whole top-level computation
+#' (rather than spinning daemons up and down for each unit of work) while
+#' respecting any pool the user has already declared. Because it reuses an
+#' existing pool, it is safe to nest: an inner call made while an outer call
+#' already established a pool simply reuses it instead of creating another.
+#'
+#' @param cores Number of cores requested by the user (one non-`NA` number).
+#' @param code Expression to evaluate. Lazily evaluated in the calling
+#'   environment, after any daemon pool has been set up.
+#' @return The value of `code`.
+with_loo_daemons <- function(cores, code) {
+  if (cores <= 1 || loo_has_pool()) {
+    # Serial work, or reuse the daemon pool the user (or an outer loo call)
+    # already configured.
+    return(code)
+  }
+  # No pool configured: create one scoped to this computation and reset it on
+  # exit. `code` is still an unforced promise here, so it is evaluated inside
+  # `with()`, i.e. while the pool exists and before the daemons are torn down.
+  with(mirai::daemons(cores), code)
+}
+
+#' Is a mirai daemon pool currently connected?
+#'
+#' @noRd
+#' @keywords internal
+#' @return `TRUE` if `mirai::status()` reports at least one daemon connection
+#'   for the active compute profile; `FALSE` if none or if the query errors.
+loo_has_pool <- function() {
+  conns <- tryCatch(mirai::status()$connections, error = function(e) 0L)
+  isTRUE(as.integer(conns) > 0L)
+}
+
+#' Number of workers available for chunking decisions
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Chunking should match the real worker count (including user-supplied or
+#' remote pools), so the connected-daemon count takes precedence over `cores`.
+#' @param cores Integer number of cores requested by the user (the fallback).
+#' @return The connected-daemon count if >= 1, else `as.integer(cores)`.
+loo_n_workers <- function(cores) {
+  conns <- tryCatch(mirai::status()$connections, error = function(e) 0L)
+  conns <- as.integer(conns)
+  if (length(conns) != 1L || is.na(conns) || conns < 1L) {
+    return(as.integer(cores))
+  }
+  conns
+}
+
+#' Is the active daemon pool on the local machine?
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Determines whether shared memory ([mori::share()]) can be used safely with
+#' the active pool. Shared memory only works when workers run on the same
+#' physical machine, so we only treat same-host transports as local:
+#'
+#' * `abstract://` and `ipc://` are same-machine inter-process transports used
+#'   by local [mirai::daemons()] pools, so these are treated as local.
+#' * `tcp://` (and anything else) may be a remote pool, or the host URL that
+#'   remote SSH/HPC daemons dial back to, so it is treated as **not** local.
+#'   loo then falls back to ordinary serialization instead of shared memory.
+#'
+#' This is intentionally conservative: an incorrect "local" classification
+#' would produce wrong results on a remote pool, whereas an incorrect "remote"
+#' classification merely forgoes the zero-copy optimisation.
+#' @return `TRUE` only if all pool URLs are `abstract://`/`ipc://`, else `FALSE`.
+loo_pool_is_local <- function() {
+  urls <- tryCatch(mirai::status()$daemons, error = function(e) NULL)
+  if (!is.character(urls) || length(urls) == 0L) {
+    return(FALSE)
+  }
+  all(grepl("^(abstract|ipc)://", urls))
+}
+
+#' Map a worker over elements, serially or across a mirai daemon pool
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Single cross-platform entry point for loo's per-observation parallelism.
+#' Replaces the previous platform-branching
+#' [parallel::mclapply()] / [parallel::parLapply()] code paths with a single
+#' [mirai::mirai_map()] path, while preserving the serial [lapply()] behaviour
+#' when no parallelism is requested or available.
+#'
+#' Object transport is chosen automatically:
+#'
+#' * `broadcast` objects are reused identically by every element (e.g. the
+#'   posterior `draws` matrix). On a local pool they are written once into
+#'   shared memory with [mori::share()] so each daemon maps the same physical
+#'   pages (zero-copy). On a remote pool, where shared memory is unavailable,
+#'   they are serialized instead; chunking bounds the number of copies sent to
+#'   roughly one per worker.
+#' * Small per-call arguments are passed through `...`.
+#'
+#' @param X A vector or list to iterate over. Each element is passed as the
+#'   first argument to `FUN`.
+#' @param FUN Worker function. Called as `FUN(x, <broadcast>, <...>)`; the
+#'   names in `broadcast` and `...` must match `FUN`'s formals.
+#' @param ... Small constant arguments forwarded to `FUN` for every element.
+#' @param cores Integer number of cores requested by the user. Parallelism is
+#'   only used when `cores > 1`, a pool is connected, and `X` is non-empty.
+#' @param broadcast Named list of large objects reused by every element. See
+#'   Description for how these are transported.
+#' @param chunk Chunking strategy. `"auto"` (default) splits `X` into roughly
+#'   one chunk per worker to amortise per-task overhead -- best for cheap
+#'   per-element work over many elements. `"never"` dispatches one task per
+#'   element for finer load balancing -- best for expensive, uneven per-element
+#'   work. `"never"` is automatically promoted to `"auto"` on a remote pool
+#'   that carries `broadcast` objects, to avoid re-sending them per task.
+#' @return A list of `FUN` results in the same order as `X`; `list()` if empty.
+loo_map <- function(X, FUN, ..., cores = 1L, broadcast = list(),
+                    chunk = c("auto", "never")) {
+  chunk <- match.arg(chunk)
+  dots <- list(...)
+
+  if (length(X) == 0L || !(cores > 1L && loo_has_pool())) {
+    # Serial path (also taken for empty `X`, so the result is always a list):
+    # a plain lapply() with the broadcast and constant arguments by name.
+    return(do.call(lapply, c(list(X, FUN), broadcast, dots)))
+  }
+
+  local_pool <- loo_pool_is_local()
+  if (length(broadcast) > 0L) {
+    if (local_pool) {
+      # Zero-copy: write once to shared memory, ship tiny references.
+      broadcast <- lapply(broadcast, mori::share)
+    } else if (chunk == "never") {
+      # Remote pool: avoid re-serializing large broadcast objects once per
+      # task by collapsing to one chunk per worker instead.
+      chunk <- "auto"
+    }
+  }
+  const_args <- c(broadcast, dots)
+
+  if (chunk == "never") {
+    return(
+      mirai::mirai_map(
+        X,
+        function(.x, .FUN, .const) do.call(.FUN, c(list(.x), .const)),
+        .args = list(.FUN = FUN, .const = const_args)
+      )[mirai::.stop]
+    )
+  }
+
+  n_chunks <- min(loo_n_workers(cores), length(X))
+  positions <- parallel::splitIndices(length(X), n_chunks)
+  chunks <- lapply(positions, function(p) X[p])
+  chunk_results <- mirai::mirai_map(
+    chunks,
+    function(.chunk, .FUN, .const) {
+      lapply(.chunk, function(.x) do.call(.FUN, c(list(.x), .const)))
+    },
+    .args = list(.FUN = FUN, .const = const_args)
+  )[mirai::.stop]
+  # splitIndices() returns contiguous ascending groups, so concatenating the
+  # per-chunk lists restores the original order of X.
+  do.call(c, chunk_results)
+}
diff --git a/tests/testthat/test_parallel.R b/tests/testthat/test_parallel.R
new file mode 100644
index 00000000..dc887c04
--- /dev/null
+++ b/tests/testthat/test_parallel.R
@@ -0,0 +1,246 @@
+options(mc.cores = 1)
+set.seed(123)
+
+# Make sure no daemon pool leaks in from another test file.
+mirai::daemons(0)
+
+LLarr <- example_loglik_array()
+LLmat <- example_loglik_matrix()
+chain_id <- rep(1:2, each = dim(LLarr)[1])
+r_eff <- relative_eff(exp(LLarr))
+
+# Shared data for the function-method end-to-end checks.
+set.seed(1)
+S_fn <- 200
+N_fn <- 30
+draws_fn <- cbind(mu = rnorm(S_fn), sigma = abs(rnorm(S_fn)) + 0.5)
+data_fn <- data.frame(y = rnorm(N_fn))
+llfun_test <- function(data_i, draws, ...) {
+  dnorm(data_i$y, mean = draws[, "mu"], sd = draws[, "sigma"], log = TRUE)
+}
+
+
+# Pool-introspection helpers -------------------------------------------------
+
+test_that("loo_has_pool() and loo_pool_is_local() detect a local pool", {
+  mirai::daemons(0)
+  expect_false(loo:::loo_has_pool())
+  expect_false(loo:::loo_pool_is_local())
+
+  skip_on_cran()
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  expect_true(loo:::loo_has_pool())
+  expect_true(loo:::loo_pool_is_local())
+  # Chunking uses the connected daemon count, not the requested cores.
+  expect_equal(loo:::loo_n_workers(1), 2L)
+})
+
+test_that("loo_pool_is_local() is FALSE for a tcp pool (remote-safety gate)", {
+  skip_on_cran()
+  mirai::daemons(0)
+  mirai::daemons(n = 2, url = mirai::local_url(tcp = TRUE))
+  on.exit(mirai::daemons(0), add = TRUE)
+  # The locality gate reads the configured transport URL (available
+  # immediately, regardless of connection timing). tcp:// may be a remote/SSH
+  # pool, so shared memory must not be assumed.
+  expect_false(loo:::loo_pool_is_local())
+})
+
+
+# loo_map() ------------------------------------------------------------------
+
+test_that("loo_map() runs serially when no pool is available", {
+  mirai::daemons(0)
+  res <- loo:::loo_map(1:5, function(x, m) x * m, m = 2, cores = 4)
+  expect_identical(res, as.list((1:5) * 2))
+})
+
+test_that("loo_map() runs serially when cores <= 1 even with a pool", {
+  skip_on_cran()
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  res <- loo:::loo_map(1:5, function(x, m) x * m, m = 3, cores = 1)
+  expect_identical(res, as.list((1:5) * 3))
+})
+
+test_that("loo_map() parallel matches serial and preserves order", {
+  skip_on_cran()
+  worker <- function(i, mat, add) sum(mat[, i]) + add
+  mat <- matrix(as.numeric(1:60), nrow = 6) # 6 x 10
+  N <- ncol(mat)
+  expected <- lapply(seq_len(N), worker, mat = mat, add = 100)
+
+  mirai::daemons(3)
+  on.exit(mirai::daemons(0), add = TRUE)
+
+  # broadcast object shared via mori on a local pool; both chunk strategies
+  res_auto <- loo:::loo_map(
+    seq_len(N), worker, add = 100, cores = 3,
+    broadcast = list(mat = mat), chunk = "auto"
+  )
+  res_never <- loo:::loo_map(
+    seq_len(N), worker, add = 100, cores = 3,
+    broadcast = list(mat = mat), chunk = "never"
+  )
+  expect_identical(res_auto, expected)
+  expect_identical(res_never, expected)
+})
+
+test_that("loo_map() works when there are more workers than elements", {
+  skip_on_cran()
+  mirai::daemons(4)
+  on.exit(mirai::daemons(0), add = TRUE)
+  res <- loo:::loo_map(1:2, function(x) x + 1L, cores = 4)
+  expect_identical(res, list(2L, 3L))
+})
+
+test_that("loo_map() propagates worker errors", {
+  skip_on_cran()
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  expect_error(
+    loo:::loo_map(1:4, function(x) if (x == 3L) stop("boom") else x, cores = 2),
+    "boom"
+  )
+})
+
+
+# End-to-end: importance sampling --------------------------------------------
+
+test_that("psis() parallel equals serial", {
+  skip_on_cran()
+  ps_serial <- suppressWarnings(psis(-LLmat, r_eff = r_eff, cores = 1))
+
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  ps_parallel <- suppressWarnings(psis(-LLmat, r_eff = r_eff, cores = 2))
+
+  expect_equal(ps_serial$log_weights, ps_parallel$log_weights)
+  expect_equal(ps_serial$diagnostics, ps_parallel$diagnostics)
+})
+
+test_that("tis() and sis() parallel equal serial", {
+  skip_on_cran()
+  tis_serial <- suppressWarnings(tis(-LLmat, r_eff = r_eff, cores = 1))
+  sis_serial <- suppressWarnings(sis(-LLmat, r_eff = r_eff, cores = 1))
+
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  tis_parallel <- suppressWarnings(tis(-LLmat, r_eff = r_eff, cores = 2))
+  sis_parallel <- suppressWarnings(sis(-LLmat, r_eff = r_eff, cores = 2))
+
+  expect_equal(tis_serial$log_weights, tis_parallel$log_weights)
+  expect_equal(sis_serial$log_weights, sis_parallel$log_weights)
+})
+
+
+# End-to-end: loo() function method (broadcast draws/data) -------------------
+
+test_that("loo.function parallel equals serial", {
+  skip_on_cran()
+  loo_serial <- suppressWarnings(
+    loo(llfun_test, data = data_fn, draws = draws_fn, cores = 1)
+  )
+
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  loo_parallel <- suppressWarnings(
+    loo(llfun_test, data = data_fn, draws = draws_fn, cores = 2)
+  )
+
+  expect_equal(loo_serial$pointwise, loo_parallel$pointwise)
+  expect_equal(loo_serial$estimates, loo_parallel$estimates)
+})
+
+test_that("loo.function reuses an existing (user-configured) pool", {
+  skip_on_cran()
+  loo_serial <- suppressWarnings(
+    loo(llfun_test, data = data_fn, draws = draws_fn, cores = 1)
+  )
+
+  # User sets up the pool themselves; loo should reuse it untouched.
+  mirai::daemons(3)
+  on.exit(mirai::daemons(0), add = TRUE)
+  loo_reuse <- suppressWarnings(
+    loo(llfun_test, data = data_fn, draws = draws_fn, cores = 2)
+  )
+  # Pool is still alive after the call (loo did not tear it down).
+  expect_true(loo:::loo_has_pool())
+  expect_equal(loo_serial$pointwise, loo_reuse$pointwise)
+})
+
+
+# End-to-end: relative_eff ---------------------------------------------------
+
+test_that("relative_eff() array and function methods are parallel-invariant", {
+  skip_on_cran()
+  re_arr_serial <- relative_eff(exp(LLarr), cores = 1)
+  re_fn_serial <- relative_eff(
+    llfun_test, chain_id = rep(1, S_fn),
+    data = data_fn, draws = draws_fn, cores = 1
+  )
+
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  re_arr_parallel <- relative_eff(exp(LLarr), cores = 2)
+  re_fn_parallel <- relative_eff(
+    llfun_test, chain_id = rep(1, S_fn),
+    data = data_fn, draws = draws_fn, cores = 2
+  )
+
+  expect_equal(re_arr_serial, re_arr_parallel)
+  expect_equal(re_fn_serial, re_fn_parallel)
+})
+
+
+# End-to-end: loo_subsample --------------------------------------------------
+
+test_that("loo_subsample() parallel equals serial", {
+  skip_on_cran()
+  # Reset RNG before each call so the same subsample is drawn.
+  set.seed(4242)
+  ss_serial <- suppressWarnings(loo_subsample(
+    llfun_test, data = data_fn, draws = draws_fn,
+    observations = 20, loo_approximation = "plpd", cores = 1
+  ))
+
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  set.seed(4242)
+  ss_parallel <- suppressWarnings(loo_subsample(
+    llfun_test, data = data_fn, draws = draws_fn,
+    observations = 20, loo_approximation = "plpd", cores = 2
+  ))
+
+  expect_equal(ss_serial$estimates, ss_parallel$estimates)
+  expect_equal(ss_serial$pointwise, ss_parallel$pointwise)
+})
+
+
+# End-to-end: loo_model_weights (single pool across K models) ----------------
+
+test_that("loo_model_weights() parallel equals serial", {
+  skip_on_cran()
+  set.seed(11)
+  ll_list <- list(
+    matrix(rnorm(200 * 25), nrow = 200),
+    matrix(rnorm(200 * 25), nrow = 200),
+    matrix(rnorm(200 * 25), nrow = 200)
+  )
+  wts_serial <- suppressWarnings(
+    loo_model_weights(ll_list, method = "stacking", cores = 1)
+  )
+
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+  wts_parallel <- suppressWarnings(
+    loo_model_weights(ll_list, method = "stacking", cores = 2)
+  )
+
+  expect_equal(as.numeric(wts_serial), as.numeric(wts_parallel),
+               tolerance = 1e-6)
+})
+
+# Final safety net in case any test above exited early with a live pool.
+mirai::daemons(0)

From d7ef7bc32236c7b9b4f9da5c0f4d46f99f89a919 Mon Sep 17 00:00:00 2001
From: Florence Bockting <florence.bockting@aalto.fi>
Date: Tue, 30 Jun 2026 16:50:20 +0300
Subject: [PATCH 3/6] update: docs, benchmark, vignette

---
 .Rbuildignore                     |   1 +
 .gitignore                        |   3 +-
 DESCRIPTION                       |   1 +
 R/parallel.R                      | 124 +++++++++++++++-
 _pkgdown.yml                      |   1 +
 benchmark/README.md               | 136 ++++++++++++++++++
 benchmark/bench-comparison.md     |  52 +++++++
 benchmark/benchmark-parallel.R    | 123 ++++++++++++++++
 benchmark/compare.R               | 218 ++++++++++++++++++++++++++++
 benchmark/peak-mem-run.R          |  47 ++++++
 benchmark/peak-mem.sh             |  52 +++++++
 man-roxygen/cores.R               |  18 +++
 man/ap_psis.Rd                    |  20 +++
 man/importance_sampling.Rd        |  19 +++
 man/loo.Rd                        |  20 +++
 man/loo_approximate_posterior.Rd  |  20 +++
 man/loo_model_weights.Rd          |  20 +++
 man/loo_moment_match.Rd           |  20 +++
 man/loo_moment_match_split.Rd     |  20 +++
 man/loo_subsample.Rd              |  20 +++
 man/parallel_psis_list.Rd         |  20 +++
 man/psis.Rd                       |  22 ++-
 man/psis_approximate_posterior.Rd |  20 +++
 man/sis.Rd                        |  22 ++-
 man/tis.Rd                        |  22 ++-
 man/update.psis_loo_ss.Rd         |  20 +++
 tests/testthat/test_parallel.R    | 113 +++++++++++++++
 vignettes/loo2-parallel.Rmd       | 230 ++++++++++++++++++++++++++++++
 28 files changed, 1397 insertions(+), 7 deletions(-)
 create mode 100644 benchmark/README.md
 create mode 100644 benchmark/bench-comparison.md
 create mode 100644 benchmark/benchmark-parallel.R
 create mode 100644 benchmark/compare.R
 create mode 100644 benchmark/peak-mem-run.R
 create mode 100755 benchmark/peak-mem.sh
 create mode 100644 vignettes/loo2-parallel.Rmd

diff --git a/.Rbuildignore b/.Rbuildignore
index 63463781..f23c768d 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,4 @@
+^benchmark$
 ^CRAN-RELEASE$
 ^.*\.Rproj$
 ^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
index f070106a..55337eba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,4 +29,5 @@ release-prep.R
 
 agent/*
 data/*
-scratch-files/*
\ No newline at end of file
+scratch-files/*
+notes/*
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index f673389a..0b33f259 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -64,3 +64,4 @@ LazyData: TRUE
 Roxygen: list(markdown = TRUE)
 SystemRequirements: pandoc (>= 1.12.3), pandoc-citeproc
 Config/roxygen2/version: 8.0.0
+RoxygenNote: 7.3.3
diff --git a/R/parallel.R b/R/parallel.R
index 4c8fbe3c..e7ff0f0b 100644
--- a/R/parallel.R
+++ b/R/parallel.R
@@ -1,3 +1,104 @@
+#' Package-internal state for the parallel backend
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Holds small bits of session-scoped state used by the parallel helpers:
+#'
+#' * `cleanup_registered`: guards `loo_register_daemon_cleanup()` so the
+#'   session-exit finalizer is only registered once.
+#' * `warned_bad_daemons`: guards the malformed-config warning in
+#'   `loo_persist_config()` so it is only emitted once per session.
+#'
+#' It also serves as the object the daemon-cleanup finalizer is attached to.
+.loo_internal <- new.env(parent = emptyenv())
+
+#' Resolve the persistent local daemon pool size from user configuration
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Reads the opt-in "persistent local pool" size from, in precedence order:
+#'
+#' 1. the R option `loo.daemons`,
+#' 2. the environment variable `LOO_DAEMONS`,
+#' 3. otherwise the feature is off.
+#'
+#' This knob enables a local [mirai::daemons()] pool that is created lazily on
+#' first parallel use and kept warm for the rest of the session (see
+#' `with_loo_daemons()`), which avoids paying pool spawn/teardown overhead on
+#' every top-level `loo()`/`psis()` call (useful for simulations, benchmarks
+#' and batch/HPC scripts).
+#'
+#' @return A single integer `>= 2` giving the persistent pool size, or
+#'   `NA_integer_` when the feature is off (unset, `0`/`1`, or a non-integer
+#'   value). Genuinely malformed (non-coercible) values warn once per session
+#'   and then disable the feature.
+loo_persist_config <- function() {
+  raw <- getOption("loo.daemons")
+  if (is.null(raw)) {
+    raw <- Sys.getenv("LOO_DAEMONS", unset = NA_character_)
+  }
+  if (length(raw) != 1L) {
+    return(NA_integer_)
+  }
+  if (is.na(raw) || (is.character(raw) && !nzchar(trimws(raw)))) {
+    # Unset / empty -> feature off.
+    return(NA_integer_)
+  }
+  n <- suppressWarnings(as.numeric(raw))
+  if (is.na(n) || !is.finite(n)) {
+    # Non-numeric garbage -> off, but tell the user once that it was ignored.
+    loo_warn_bad_daemons(raw)
+    return(NA_integer_)
+  }
+  if (n < 2 || n != trunc(n)) {
+    # 0/1 (serial) or a non-integer value -> feature off, silently.
+    return(NA_integer_)
+  }
+  as.integer(n)
+}
+
+#' Warn (once per session) about a malformed persistent-pool configuration
+#'
+#' @noRd
+#' @keywords internal
+loo_warn_bad_daemons <- function(value) {
+  if (isTRUE(.loo_internal$warned_bad_daemons)) {
+    return(invisible(NULL))
+  }
+  .loo_internal$warned_bad_daemons <- TRUE
+  warning(
+    "Ignoring invalid persistent-pool size ", encodeString(value, quote = "'"),
+    " from 'loo.daemons'/'LOO_DAEMONS'; expected a single integer >= 2.",
+    call. = FALSE
+  )
+  invisible(NULL)
+}
+
+#' Register a one-time session-exit cleanup for the persistent daemon pool
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Attaches a finalizer (only once per session) that resets any local daemon
+#' pool with `mirai::daemons(0)` when the R session exits. mirai already
+#' terminates local daemons when the host session ends; this is a
+#' belt-and-suspenders guard so the lazily created persistent pool never leaves
+#' orphan processes behind in batch/HPC scripts.
+loo_register_daemon_cleanup <- function() {
+  if (isTRUE(.loo_internal$cleanup_registered)) {
+    return(invisible(NULL))
+  }
+  .loo_internal$cleanup_registered <- TRUE
+  reg.finalizer(
+    .loo_internal,
+    function(e) try(mirai::daemons(0), silent = TRUE),
+    onexit = TRUE
+  )
+  invisible(NULL)
+}
+
 #' Evaluate parallel work with an appropriate mirai daemon pool
 #'
 #' @noRd
@@ -10,7 +111,14 @@
 #' * `cores <= 1`: runs `code` serially without touching daemons.
 #' * A daemon pool is already configured (e.g. the user called
 #'   [mirai::daemons()] themselves, possibly with remote/HPC daemons): `code`
-#'   runs on the existing pool, which is left untouched.
+#'   runs on the existing pool, which is left untouched. This always takes
+#'   precedence over the options below.
+#' * Otherwise, if the user opted in to a persistent session pool via the
+#'   `loo.daemons` option or `LOO_DAEMONS` environment variable (see
+#'   `loo_persist_config()`): a local pool of that size is created lazily on
+#'   this first parallel call and left warm for the rest of the session, with a
+#'   session-exit finalizer registered for cleanup. Subsequent calls reuse it
+#'   via the existing-pool branch above.
 #' * Otherwise: a pool of `cores` local daemons is created for the duration of
 #'   `code` and automatically reset afterwards (via the scoped
 #'   `with(mirai::daemons(), ...)` method), so no daemon processes are left
@@ -22,14 +130,24 @@
 #' existing pool, it is safe to nest: an inner call made while an outer call
 #' already established a pool simply reuses it instead of creating another.
 #'
-#' @param cores Integer number of cores requested by the user.
+#' @param cores Integer number of cores requested by the user. Acts as the
+#'   per-call "enable parallel" switch; the persistent pool size, when enabled,
+#'   comes from `loo_persist_config()` rather than from `cores`.
 #' @param code Expression to evaluate. Lazily evaluated in the calling
 #'   environment, after any daemon pool has been set up.
 #' @return The value of `code`.
 with_loo_daemons <- function(cores, code) {
   if (cores <= 1 || loo_has_pool()) {
     # Serial work, or reuse the daemon pool the user (or an outer loo call)
-    # already configured.
+    # already configured. This always wins over the persistent-pool option.
+    return(code)
+  }
+  persist <- loo_persist_config()
+  if (!is.na(persist)) {
+    # Opt-in persistent pool: create once, leave warm for the session, and
+    # register a finalizer to tidy up at session exit. No per-call teardown.
+    mirai::daemons(persist)
+    loo_register_daemon_cleanup()
     return(code)
   }
   # No pool configured: create one scoped to this computation and reset it on
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 0a216a02..4878e8c2 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -52,6 +52,7 @@ articles:
       - loo2-non-factorized
       - loo2-lfo
       - loo2-large-data
+      - loo2-parallel
       - loo2-moment-matching
       - loo2-mixis
   - title: Frequently asked questions
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..40174ff0
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,136 @@
+# loo parallel benchmarks
+
+These scripts measure the performance of loo's parallel code paths and compare
+two installed versions of the package side by side:
+
+- **`baseline`** — a pre-`mirai` version (the old `mclapply`/`parLapply`
+  backend), e.g. the released version from CRAN.
+- **`new`** — the current working tree (the `mirai` + `mori` backend, including
+  the persistent session pool controlled by `options(loo.daemons = k)` /
+  `LOO_DAEMONS`).
+
+The same user-facing calls (`psis()`, `loo()`) are timed for every version; only
+the internal parallel backend differs. For the `new` version we additionally
+time a **persist** mode that opts in to the persistent session pool, so we can
+separate per-call daemon spawn/teardown overhead from the steady-state cost.
+
+## Files
+
+| File | Purpose |
+|---|---|
+| `benchmark-parallel.R` | Times `psis()`/`loo()` across cores for one installed version; writes `/tmp/bench-<label>.rds`. |
+| `compare.R` | Reads the two `.rds` files and prints a wall-clock + memory comparison table. |
+| `peak-mem-run.R` | A single large-`draws` `loo()` run used for peak-memory measurement. |
+| `peak-mem.sh` | Linux-only driver that samples the RSS of the whole process tree during `peak-mem-run.R`. |
+
+## Prerequisites
+
+The benchmark scripts need the `bench` package (in addition to whatever loo
+needs):
+
+```bash
+Rscript -e 'install.packages("bench")'
+```
+
+## Step 1 — Install each version into its own library
+
+Each run reads the library path from the `LOO_LIB` environment variable, so put
+each version in its own directory:
+
+```bash
+mkdir -p /tmp/loo-base-lib /tmp/loo-new-lib
+
+# "new" = this working tree
+R CMD INSTALL --library=/tmp/loo-new-lib .
+
+# "baseline" = a pre-mirai version to compare against (CRAN release shown here;
+# alternatively check out an older git ref elsewhere and install that)
+Rscript -e 'install.packages("loo", lib = "/tmp/loo-base-lib")'
+```
+
+## Step 2 — Run the timing benchmark once per version
+
+Run from the package root. `BENCH_LABEL` tags the output file
+(`/tmp/bench-<label>.rds`):
+
+```bash
+LOO_LIB=/tmp/loo-base-lib BENCH_LABEL=baseline Rscript benchmark/benchmark-parallel.R
+LOO_LIB=/tmp/loo-new-lib  BENCH_LABEL=new      Rscript benchmark/benchmark-parallel.R
+```
+
+The `persist` mode is only measured for `BENCH_LABEL=new`, because
+`options(loo.daemons)` is a no-op in the baseline version.
+
+## Step 3 — Aggregate and compare
+
+```bash
+Rscript benchmark/compare.R
+```
+
+This prints two tables (median wall-clock time with speedups, and main-process
+memory) with these columns:
+
+- `base` — baseline version.
+- `new/call` — new version, default per-call pool (created and torn down each
+  call).
+- `new/per` — new version, persistent session pool (`options(loo.daemons = k)`),
+  reused across calls.
+
+A speedup `> 1` means the new version is faster than the baseline. The report
+also states how many iterations each median is based on (recorded from
+`benchmark-parallel.R`'s `iters` setting).
+
+In addition to the console output, `compare.R` writes a Markdown report with the
+same tables (handy for pasting into issues/PRs). By default it is written next to
+the script as `benchmark/bench-comparison.md`; override the path with the
+`BENCH_MD` environment variable:
+
+```bash
+BENCH_MD=/tmp/benchmark-results.md Rscript benchmark/compare.R
+```
+
+## Optional — Peak memory (Linux only)
+
+`peak-mem.sh` samples the total RSS of the R process and all of its workers
+during one large-`draws` `loo()` run. It takes positional arguments
+`LOO_LIB BENCH_LABEL MODE CORES`, where `MODE` is `per-call` or `persist`:
+
+```bash
+benchmark/peak-mem.sh /tmp/loo-base-lib baseline per-call 8
+benchmark/peak-mem.sh /tmp/loo-new-lib  new      per-call 8
+benchmark/peak-mem.sh /tmp/loo-new-lib  new      persist  8
+```
+
+For the `new` version it also prints the `mori` transport size of the broadcast
+`draws` object (raw serialized MB vs the shared-memory reference in bytes),
+which shows the zero-copy benefit on a local pool.
+
+Each run also appends its peak-RSS result to a tab-separated file (default
+`/tmp/bench-peakmem.tsv`, override with `PEAK_OUT`). If that file is present when
+you run `compare.R`, the peak-RSS numbers are folded into the report as an extra
+"Peak RSS of the whole process tree" table. Delete the file between fresh runs so
+stale rows aren't mixed in (only the most recent row per label/mode/cores is
+kept).
+
+The Markdown report also ends with a **"How to read these numbers"** section
+explaining the difference between median time/speedup, main-process heap
+allocation (churn, not peak, workers excluded), and peak RSS (the whole process
+tree's real footprint).
+
+## Tuning
+
+Edit the top of `benchmark-parallel.R` to match your machine / problem sizes:
+
+- `cores_grid` (default `c(1, 4, 8)`) — the core counts to sweep.
+- `iters` (default `10`) — iterations per `bench::mark()` measurement.
+- `psis_sizes` and the `loo.function` `draws` dimensions — the problem sizes.
+
+For `peak-mem-run.R`, adjust `S`, `P`, and `Nf` to change the size of the
+broadcast `draws` matrix.
+
+## Notes
+
+- Raw results go to `/tmp` (the Markdown report defaults to `benchmark/`). Remove
+  `/tmp/bench-*.rds` if you change the scenarios, so stale rows aren't mixed in.
+- Results are platform-dependent; run baseline and new back to back on an idle
+  machine for a fair comparison.
diff --git a/benchmark/bench-comparison.md b/benchmark/bench-comparison.md
new file mode 100644
index 00000000..487b38f5
--- /dev/null
+++ b/benchmark/bench-comparison.md
@@ -0,0 +1,52 @@
+# loo parallel benchmark comparison
+
+_Generated 2026-06-30 16:29:48._
+
+**Columns / modes.** `base` is the baseline version. `new/call` is the new version's default per-call pool (created and torn down each call). `new/persist` is the new version's persistent session pool (`options(loo.daemons = k)`), reused across calls. `cores = 1` rows are fully serial (the parallel backend is never used).
+
+## Median wall-clock time (s) and speedup vs baseline
+
+Median over 10 iterations of one `psis()`/`loo()` call (a warm-up run is excluded). `speedup = base / new`, so a value `> 1` means the new version is faster. Expect `new/persist` to win when many calls reuse the pool, and `new/call` to look slower than `base` for cheap problems because it pays pool start-up/teardown on every call. The `cores = 1` `base` and `new/call` numbers should be roughly equal (both serial); sizeable gaps there are run-to-run noise, not real differences.
+
+| scenario | cores | base | new/call | new/persist | speedup (call) | speedup (persist) |
+|:---|---:|---:|---:|---:|---:|---:|
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 1 | 0.458 | 0.406 | — | 1.13x | — |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 4 | 0.670 | 1.315 | 0.175 | 0.51x | 3.83x |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 8 | 0.536 | 1.433 | 0.133 | 0.37x | 4.02x |
+| psis S=2000 N=1000 | 1 | 0.416 | 0.403 | — | 1.03x | — |
+| psis S=2000 N=1000 | 4 | 0.237 | 1.262 | 0.158 | 0.19x | 1.51x |
+| psis S=2000 N=1000 | 8 | 0.205 | 1.276 | 0.136 | 0.16x | 1.51x |
+| psis S=4000 N=4000 | 1 | 2.998 | 2.628 | — | 1.14x | — |
+| psis S=4000 N=4000 | 4 | 2.253 | 2.385 | 1.455 | 0.94x | 1.55x |
+| psis S=4000 N=4000 | 8 | 1.707 | 2.258 | 1.050 | 0.76x | 1.63x |
+
+## Main-process memory allocation (MB)
+
+Total bytes allocated on the R heap by the *coordinator* process during the call (cumulative churn, **not** peak and **not** net), as measured by `bench`'s allocation profiler. It does **not** include memory used by worker processes, nor off-heap memory such as the `mori` shared-memory segment for the broadcast `draws`. That is why parallel rows are tiny: the heavy allocation happens in the workers, out of the profiler's view. Use it to gauge allocation pressure on the main process, not total footprint.
+
+| scenario | cores | base | new/call | new/persist |
+|:---|---:|---:|---:|---:|
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 1 | 803.292 | 803.292 | — |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 4 | — | 0.720 | 0.700 |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 8 | — | 1.297 | 1.276 |
+| psis S=2000 N=1000 | 1 | 304.130 | 304.130 | — |
+| psis S=2000 N=1000 | 4 | — | 120.805 | 120.785 |
+| psis S=2000 N=1000 | 8 | — | 121.322 | 121.302 |
+| psis S=4000 N=4000 | 1 | 2306.371 | 2306.371 | — |
+| psis S=4000 N=4000 | 4 | — | 961.519 | 961.498 |
+| psis S=4000 N=4000 | 8 | — | 962.035 | 962.015 |
+
+## Peak RSS of the whole process tree (MB)
+
+From `peak-mem.sh` (single large-`draws` `loo()` run; Linux only). Maximum summed resident memory of the *entire* process tree (main process plus all workers), sampled during the run. This is the metric for the job's real memory footprint. On a local pool, `mori` shares the `draws` matrix across workers (zero-copy), so peak RSS stays close to a single copy rather than growing with the number of workers.
+
+| label | mode | cores | peak RSS (MB) |
+|:---|:---|---:|---:|
+| baseline | per-call | 8 | 2418 |
+| new | per-call | 8 | 343 |
+| new | persist | 8 | 343 |
+
+## Caveats
+
+Results are platform-dependent and sensitive to machine load; run the baseline and new versions back to back on an idle machine, and ignore differences smaller than the run-to-run noise.
+
diff --git a/benchmark/benchmark-parallel.R b/benchmark/benchmark-parallel.R
new file mode 100644
index 00000000..4b45d68c
--- /dev/null
+++ b/benchmark/benchmark-parallel.R
@@ -0,0 +1,123 @@
+# Benchmark loo's parallel code paths for a single installed version.
+#
+# Run once per version, pointing at an isolated library and tagging the output:
+#   LOO_LIB=/tmp/loo-base-lib BENCH_LABEL=baseline Rscript benchmark/benchmark-parallel.R
+#   LOO_LIB=/tmp/loo-new-lib  BENCH_LABEL=new      Rscript benchmark/benchmark-parallel.R
+#
+# Results are written to /tmp/bench-<label>.rds; compare.R aggregates them.
+#
+# The same user-facing calls (psis(), loo()) are timed for every version; the
+# parallel backend (mclapply/parLapply vs mirai+mori) differs internally. For
+# the new version we additionally time a "persist" mode that opts in to loo's
+# persistent session pool via `options(loo.daemons = k)` (equivalently the
+# `LOO_DAEMONS` environment variable). loo then creates the local mirai pool
+# lazily on the first (warm-up) call and reuses it for every later call, so the
+# timed iterations measure steady-state cost with no per-call daemon
+# spawn/teardown overhead.
+
+lib <- Sys.getenv("LOO_LIB")
+label <- Sys.getenv("BENCH_LABEL", unset = "unknown")
+stopifnot(nzchar(lib))
+
+suppressMessages({
+  library(loo, lib.loc = lib)
+  library(bench)
+})
+
+is_new <- identical(label, "new")
+cores_grid <- c(1L, 4L, 8L)
+iters <- 10L
+
+rows <- list()
+record <- function(scenario, mode, cores, expr) {
+  expr <- substitute(expr)
+  pf <- parent.frame()
+  eval(expr, pf) # warm up (process spawn / pool / JIT)
+  b <- tryCatch(
+    bench::mark(
+      eval(expr, pf),
+      iterations = iters, check = FALSE, memory = TRUE, filter_gc = FALSE
+    ),
+    error = function(e) {
+      bench::mark(
+        eval(expr, pf),
+        iterations = iters, check = FALSE, memory = FALSE, filter_gc = FALSE
+      )
+    }
+  )
+  rows[[length(rows) + 1L]] <<- data.frame(
+    label = label, scenario = scenario, mode = mode, cores = cores,
+    iters = iters,
+    median_s = as.numeric(b$median),
+    mem_mb = if ("mem_alloc" %in% names(b) && !is.na(b$mem_alloc[1])) {
+      as.numeric(b$mem_alloc) / 1e6
+    } else {
+      NA_real_
+    },
+    stringsAsFactors = FALSE
+  )
+  cat(sprintf(
+    "  [%s] %-22s %-10s cores=%d  median=%.3fs\n",
+    label, scenario, mode, cores, as.numeric(b$median)
+  ))
+}
+
+# ---------------------------------------------------------------------------
+# Scenario 1: standalone PSIS over a log-ratio matrix (partitioned columns).
+# ---------------------------------------------------------------------------
+psis_sizes <- list(c(S = 2000, N = 1000), c(S = 4000, N = 4000))
+for (sz in psis_sizes) {
+  set.seed(2024) # identical inputs across versions
+  S <- sz[["S"]]
+  N <- sz[["N"]]
+  LL <- matrix(rnorm(S * N), nrow = S)
+  re <- rep(1, N)
+  scen <- sprintf("psis S=%d N=%d", S, N)
+  for (k in cores_grid) {
+    mode <- if (k == 1L) "serial" else "per-call"
+    record(scen, mode, k, suppressWarnings(psis(-LL, r_eff = re, cores = k)))
+    if (is_new && k > 1L) {
+      # Opt in to loo's persistent session pool; the warm-up call inside
+      # record() creates it and the timed iterations reuse it.
+      old_opt <- options(loo.daemons = k)
+      record(scen, "persist", k, suppressWarnings(psis(-LL, r_eff = re, cores = k)))
+      mirai::daemons(0)
+      options(old_opt)
+    }
+  }
+}
+
+# ---------------------------------------------------------------------------
+# Scenario 2: loo.function with a large broadcast `draws` matrix (the case
+# where fork shares memory for free and mori must recover that benefit).
+# ---------------------------------------------------------------------------
+set.seed(7)
+S2 <- 8000L
+P <- 150L
+Nf <- 400L
+draws_big <- matrix(rnorm(S2 * P), nrow = S2, dimnames = list(NULL, paste0("p", seq_len(P))))
+data_f <- data.frame(y = rnorm(Nf))
+llfun_b <- function(data_i, draws, ...) {
+  dnorm(data_i$y, mean = draws[, "p1"], sd = abs(draws[, "p2"]) + 0.5, log = TRUE)
+}
+scen <- sprintf("loo.function S=%d P=%d N=%d (draws=%.1fMB)", S2, P, Nf, S2 * P * 8 / 1e6)
+for (k in cores_grid) {
+  mode <- if (k == 1L) "serial" else "per-call"
+  record(scen, mode, k, suppressWarnings(
+    loo(llfun_b, data = data_f, draws = draws_big, cores = k)
+  ))
+  if (is_new && k > 1L) {
+    # Opt in to loo's persistent session pool; the warm-up call inside
+    # record() creates it and the timed iterations reuse it.
+    old_opt <- options(loo.daemons = k)
+    record(scen, "persist", k, suppressWarnings(
+      loo(llfun_b, data = data_f, draws = draws_big, cores = k)
+    ))
+    mirai::daemons(0)
+    options(old_opt)
+  }
+}
+
+out <- do.call(rbind, rows)
+saveRDS(out, sprintf("/tmp/bench-%s.rds", label))
+cat(sprintf("\nSaved %d rows to /tmp/bench-%s.rds\n", nrow(out), label))
diff --git a/benchmark/compare.R b/benchmark/compare.R
new file mode 100644
index 00000000..135cf4fb
--- /dev/null
+++ b/benchmark/compare.R
@@ -0,0 +1,218 @@
+# Aggregate and compare benchmark results from benchmark-parallel.R.
+#   Rscript benchmark/compare.R
+#
+# Prints the comparison to the console and also writes a Markdown report
+# (default bench-comparison.md next to this script, override with the
+# BENCH_MD env var) containing the same tables, for easy sharing.
+
+# Directory containing this script, so the report lands next to it regardless
+# of the working directory the script is launched from.
+.this_file <- sub("^--file=", "", grep("^--file=", commandArgs(FALSE), value = TRUE))
+script_dir <- if (length(.this_file) == 1L) {
+  dirname(normalizePath(.this_file))
+} else {
+  "benchmark"
+}
+
+md_out <- Sys.getenv("BENCH_MD", unset = file.path(script_dir, "bench-comparison.md"))
+peak_out <- Sys.getenv("PEAK_OUT", unset = "/tmp/bench-peakmem.tsv")
+
+base <- readRDS("/tmp/bench-baseline.rds")
+new <- readRDS("/tmp/bench-new.rds")
+all <- rbind(base, new)
+
+key <- function(d) paste(d$scenario, d$cores, sep = " ")
+
+# Lookup vectors for one label/mode; names come from key(): "<scenario> <cores>".
+med <- function(df, lab, mode) {
+  picked <- df[df$mode == mode & df$label == lab, ]
+  stats::setNames(picked$median_s, key(picked))
+}
+mem <- function(df, lab, mode) {
+  picked <- df[df$mode == mode & df$label == lab, ]
+  stats::setNames(picked$mem_mb, key(picked))
+}
+
+scen_cores <- unique(all[, c("scenario", "cores")])
+scen_cores <- scen_cores[order(scen_cores$scenario, scen_cores$cores), ]
+
+# Iterations per measurement (recorded by benchmark-parallel.R). Older result
+# files may predate this column.
+iters_used <- if ("iters" %in% names(all)) {
+  iv <- sort(unique(all$iters[!is.na(all$iters)]))
+  if (length(iv) == 1L) as.character(iv) else paste(range(iv), collapse = "-")
+} else {
+  NA_character_
+}
+iters_label <- if (is.na(iters_used)) "a few" else iters_used
+
+b_serial <- med(base, "baseline", "serial")
+b_par <- med(base, "baseline", "per-call")
+n_serial <- med(new, "new", "serial")
+n_call <- med(new, "new", "per-call")
+n_persist <- med(new, "new", "persist")
+
+base_med <- c(b_serial, b_par)
+new_call_med <- c(n_serial, n_call)
+
+get1 <- function(vec, k) {  # value stored under name `k`, or NA when absent
+  if (is.null(vec) || !(k %in% names(vec))) NA_real_ else vec[[k]]
+}
+fmt <- function(x) ifelse(!is.na(x), sprintf("%7.3f", x), "      -")
+spd <- function(num, den) ifelse(!is.na(num) & !is.na(den), sprintf("%4.2fx", num / den), "    -")
+
+cat(sprintf("\n=== Median wall-clock time (s) and speedup vs baseline (%s iterations) ===\n", iters_label))
+cat(sprintf(
+  "%-42s %5s | %8s %8s %8s | %7s %7s\n",
+  "scenario", "cores", "base", "new/call", "new/per",
+  "call", "persist"
+))
+cat(strrep("-", 96), "\n")
+for (i in seq_len(nrow(scen_cores))) {
+  k <- paste(scen_cores$scenario[i], scen_cores$cores[i])
+  bm <- get1(base_med, k)
+  nc <- get1(new_call_med, k)
+  np <- get1(n_persist, k)
+  cat(sprintf(
+    "%-42s %5d | %8s %8s %8s | %7s %7s\n",
+    scen_cores$scenario[i], scen_cores$cores[i],
+    fmt(bm), fmt(nc), fmt(np),
+    spd(bm, nc), spd(bm, np)
+  ))
+}
+
+cat("\n=== Main-process memory allocation (MB) ===\n")
+b_mem <- c(mem(base, "baseline", "serial"), mem(base, "baseline", "per-call"))
+n_mem <- c(mem(new, "new", "serial"), mem(new, "new", "per-call"))
+p_mem <- mem(new, "new", "persist")
+cat(sprintf("%-42s %5s | %8s %8s %8s\n", "scenario", "cores", "base", "new/call", "new/per"))
+cat(strrep("-", 80), "\n")
+for (i in seq_len(nrow(scen_cores))) {
+  k <- paste(scen_cores$scenario[i], scen_cores$cores[i])
+  cat(sprintf(
+    "%-42s %5d | %8s %8s %8s\n",
+    scen_cores$scenario[i], scen_cores$cores[i],
+    fmt(get1(b_mem, k)), fmt(get1(n_mem, k)), fmt(get1(p_mem, k))
+  ))
+}
+cat("\nspeedup > 1 means new is faster than baseline.\n")
+
+
+# Peak-memory results (optional) ---------------------------------------------
+# peak-mem.sh appends "<label>\t<mode>\t<cores>\t<peak_mb>" rows to peak_out.
+
+peak <- NULL
+if (file.exists(peak_out) && file.info(peak_out)$size > 0) {
+  peak <- utils::read.delim(
+    peak_out, header = FALSE, stringsAsFactors = FALSE,
+    col.names = c("label", "mode", "cores", "peak_mb")
+  )
+  # Keep only the most recent record for each label/mode/cores combination.
+  peak$.k <- paste(peak$label, peak$mode, peak$cores)
+  peak <- peak[!duplicated(peak$.k, fromLast = TRUE), ]
+  peak <- peak[order(peak$cores, peak$label, peak$mode), ]
+
+  cat("\n=== Peak RSS of whole process tree (MB) ===\n")
+  cat(sprintf("%-10s %-10s %5s | %10s\n", "label", "mode", "cores", "peak_mb"))
+  cat(strrep("-", 44), "\n")
+  for (i in seq_len(nrow(peak))) {
+    cat(sprintf(
+      "%-10s %-10s %5s | %10.0f\n",
+      peak$label[i], peak$mode[i], peak$cores[i], peak$peak_mb[i]
+    ))
+  }
+} else {
+  cat(sprintf(
+    "\n(no peak-memory results found at %s; run benchmark/peak-mem.sh to add them)\n",
+    peak_out
+  ))
+}
+
+
+# Markdown report ------------------------------------------------------------
+# Same numbers as above, formatted as Markdown tables for easy sharing.
+
+md_num <- function(x) ifelse(!is.na(x), sprintf("%.3f", x), "—")  # 3 decimals
+md_int <- function(x) ifelse(!is.na(x), sprintf("%.0f", x), "—")  # whole number
+md_spd <- function(num, den) {  # ratio num/den, e.g. "1.50x"
+  ifelse(!is.na(num) & !is.na(den), sprintf("%.2fx", num / den), "—")
+}
+
+md <- c(
+  "# loo parallel benchmark comparison",
+  "",
+  sprintf("_Generated %s._", format(Sys.time(), "%Y-%m-%d %H:%M:%S")),
+  "",
+  "**Columns / modes.** `base` is the baseline version. `new/call` is the new version's default per-call pool (created and torn down each call). `new/persist` is the new version's persistent session pool (`options(loo.daemons = k)`), reused across calls. `cores = 1` rows are fully serial (the parallel backend is never used).",
+  "",
+  "## Median wall-clock time (s) and speedup vs baseline",
+  "",
+  sprintf("Median over %s iterations of one `psis()`/`loo()` call (a warm-up run is excluded). `speedup = base / new`, so a value `> 1` means the new version is faster. Expect `new/persist` to win when many calls reuse the pool, and `new/call` to look slower than `base` for cheap problems because it pays pool start-up/teardown on every call. The `cores = 1` `base` and `new/call` numbers should be roughly equal (both serial); sizeable gaps there are run-to-run noise, not real differences.", iters_label),
+  "",
+  "| scenario | cores | base | new/call | new/persist | speedup (call) | speedup (persist) |",
+  "|:---|---:|---:|---:|---:|---:|---:|"
+)
+for (i in seq_len(nrow(scen_cores))) {
+  k <- paste(scen_cores$scenario[i], scen_cores$cores[i])
+  bm <- get1(base_med, k)
+  nc <- get1(new_call_med, k)
+  np <- get1(n_persist, k)
+  md <- c(md, sprintf(
+    "| %s | %d | %s | %s | %s | %s | %s |",
+    scen_cores$scenario[i], scen_cores$cores[i],
+    md_num(bm), md_num(nc), md_num(np),
+    md_spd(bm, nc), md_spd(bm, np)
+  ))
+}
+
+md <- c(
+  md,
+  "",
+  "## Main-process memory allocation (MB)",
+  "",
+  "Total bytes allocated on the R heap by the *coordinator* process during the call (cumulative churn, **not** peak and **not** net), as measured by `bench`'s allocation profiler. It does **not** include memory used by worker processes, nor off-heap memory such as the `mori` shared-memory segment for the broadcast `draws`. That is why parallel rows are tiny: the heavy allocation happens in the workers, out of the profiler's view. Use it to gauge allocation pressure on the main process, not total footprint.",
+  "",
+  "| scenario | cores | base | new/call | new/persist |",
+  "|:---|---:|---:|---:|---:|"
+)
+for (i in seq_len(nrow(scen_cores))) {
+  k <- paste(scen_cores$scenario[i], scen_cores$cores[i])
+  md <- c(md, sprintf(
+    "| %s | %d | %s | %s | %s |",
+    scen_cores$scenario[i], scen_cores$cores[i],
+    md_num(get1(b_mem, k)), md_num(get1(n_mem, k)), md_num(get1(p_mem, k))
+  ))
+}
+
+# Peak-RSS table (only when peak-mem.sh results are available).
+if (!is.null(peak)) {
+  md <- c(
+    md,
+    "",
+    "## Peak RSS of the whole process tree (MB)",
+    "",
+    "From `peak-mem.sh` (single large-`draws` `loo()` run; Linux only). Maximum summed resident memory of the *entire* process tree (main process plus all workers), sampled during the run. This is the metric for the job's real memory footprint. On a local pool, `mori` shares the `draws` matrix across workers (zero-copy), so peak RSS stays close to a single copy rather than growing with the number of workers.",
+    "",
+    "| label | mode | cores | peak RSS (MB) |",
+    "|:---|:---|---:|---:|"
+  )
+  for (i in seq_len(nrow(peak))) {
+    md <- c(md, sprintf(
+      "| %s | %s | %s | %s |",
+      peak$label[i], peak$mode[i], peak$cores[i], md_int(peak$peak_mb[i])
+    ))
+  }
+}
+
+# Caveats footer.
+md <- c(
+  md,
+  "",
+  "## Caveats",
+  "",
+  "Results are platform-dependent and sensitive to machine load; run the baseline and new versions back to back on an idle machine, and ignore differences smaller than the run-to-run noise.",
+  ""
+)
+
+writeLines(md, md_out)
+cat(sprintf("\nMarkdown report written to %s\n", md_out))
diff --git a/benchmark/peak-mem-run.R b/benchmark/peak-mem-run.R
new file mode 100644
index 00000000..971f9046
--- /dev/null
+++ b/benchmark/peak-mem-run.R
@@ -0,0 +1,47 @@
+# Single large-draws loo.function run for peak-memory measurement.
+# Driven by peak-mem.sh, which samples the total RSS of this process tree.
+#   LOO_LIB=... BENCH_LABEL=baseline|new MODE=per-call|persist CORES=8 Rscript peak-mem-run.R
+#
+# In "persist" mode (new version only) we opt in to loo's persistent session
+# pool via `options(loo.daemons = cores)`; loo creates the local mirai pool
+# lazily during the loo() call and would keep it warm for the session.
+
+lib <- Sys.getenv("LOO_LIB")
+label <- Sys.getenv("BENCH_LABEL", unset = "unknown")
+mode <- Sys.getenv("MODE", unset = "per-call")
+cores <- as.integer(Sys.getenv("CORES", unset = "8"))
+stopifnot(nzchar(lib))
+suppressMessages(library(loo, lib.loc = lib))
+
+set.seed(7)
+S <- 30000L
+P <- 400L # draws ~ 30000 * 400 * 8 = 96 MB
+Nf <- 300L
+draws_big <- matrix(rnorm(S * P), nrow = S, dimnames = list(NULL, paste0("p", seq_len(P))))
+data_f <- data.frame(y = rnorm(Nf))
+llfun_b <- function(data_i, draws, ...) {
+  dnorm(data_i$y, mean = draws[, "p1"], sd = abs(draws[, "p2"]) + 0.5, log = TRUE)
+}
+
+# Per-worker transport size of the broadcast object (mori metric).
+if (label == "new") {
+  raw_bytes <- length(serialize(draws_big, NULL))
+  shared <- mori::share(draws_big)
+  ref_bytes <- length(serialize(shared, NULL))
+  cat(sprintf(
+    "TRANSPORT draws raw=%.1fMB  mori_ref=%d bytes  (%.0fx smaller)\n",
+    raw_bytes / 1e6, ref_bytes, raw_bytes / ref_bytes
+  ))
+}
+
+if (label == "new" && mode == "persist") {
+  options(loo.daemons = cores)
+}
+
+invisible(suppressWarnings(loo(llfun_b, data = data_f, draws = draws_big, cores = cores)))
+
+if (label == "new" && mode == "persist") {
+  mirai::daemons(0)
+  options(loo.daemons = NULL)
+}
+cat("RUN COMPLETE\n")
diff --git a/benchmark/peak-mem.sh b/benchmark/peak-mem.sh
new file mode 100755
index 00000000..0973fece
--- /dev/null
+++ b/benchmark/peak-mem.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Measure peak total RSS (MB) of an R run and its entire process tree
+# (mclapply forks or mirai daemons), by sampling /proc every 20 ms.
+#
+# Usage: peak-mem.sh LOO_LIB BENCH_LABEL MODE CORES   (MODE = per-call|persist)
+set -u
+LOO_LIB="$1"; BENCH_LABEL="$2"; MODE="$3"; CORES="$4"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+LOO_LIB="$LOO_LIB" BENCH_LABEL="$BENCH_LABEL" MODE="$MODE" CORES="$CORES" \
+  Rscript "$SCRIPT_DIR/peak-mem-run.R" &
+RPID=$!
+
+tree_rss_kb() {
+  # Print the summed VmRSS (KB) of pid $1 and every process descended from it.
+  local top=$1
+  local all_pids=$top
+  local level=$top
+  while [ -n "$level" ]; do
+    local children=""
+    for p in $level; do
+      local found
+      found=$(pgrep -P "$p" 2>/dev/null | tr '\n' ' ')
+      children="$children $found"
+    done
+    level=$(echo "$children" | xargs)
+    all_pids="$all_pids $level"
+  done
+  local sum=0 kb
+  for p in $all_pids; do
+    kb=$(awk '/^VmRSS:/{print $2}' "/proc/$p/status" 2>/dev/null)
+    if [ -n "${kb:-}" ]; then sum=$((sum + kb)); fi
+  done
+  echo "$sum"
+}
+
+PEAK=0
+# Sample the tree's RSS until the R run exits, keeping the maximum seen.
+while kill -0 "$RPID" 2>/dev/null; do
+  CUR=$(tree_rss_kb "$RPID")
+  [ "$CUR" -gt "$PEAK" ] && PEAK=$CUR
+  sleep 0.02
+done
+# A failed run must not leave a bogus peak in the results: pass its status on.
+wait "$RPID" || { rc=$?; echo "peak-mem-run.R failed (exit $rc)" >&2; exit "$rc"; }
+PEAK_MB=$(echo "$PEAK/1024" | bc -l)
+# Append a parseable record so compare.R can fold the peak-RSS results into its
+# report. Override the destination with PEAK_OUT; delete it between fresh runs.
+PEAK_OUT="${PEAK_OUT:-/tmp/bench-peakmem.tsv}"
+printf "%s\t%s\t%s\t%.0f\n" "$BENCH_LABEL" "$MODE" "$CORES" "$PEAK_MB" >> "$PEAK_OUT"
+printf "PEAK_RSS %s/%s/cores=%s : %.0f MB  (appended to %s)\n" \
+  "$BENCH_LABEL" "$MODE" "$CORES" "$PEAK_MB" "$PEAK_OUT"
diff --git a/man-roxygen/cores.R b/man-roxygen/cores.R
index 6fd28d3f..5422612e 100644
--- a/man-roxygen/cores.R
+++ b/man-roxygen/cores.R
@@ -10,3 +10,21 @@
 #'     the `.Rprofile` file to set `mc.cores` (using the `cores` argument or
 #'     setting `mc.cores` interactively or in a script is fine).
 #'
+#'   Parallelism is implemented with the \pkg{mirai} package. There are three
+#'   ways to control the backend, in increasing order of precedence:
+#'   * `cores > 1` (the default behaviour): a local daemon pool is created for
+#'     the duration of the call and torn down automatically when it returns, so
+#'     no worker processes are left running. This is convenient for one-off
+#'     calls but pays a small pool start-up/teardown cost on every call.
+#'   * `options(loo.daemons = k)` or the environment variable `LOO_DAEMONS=k`
+#'     (with `k >= 2`): opt in to a *persistent* local pool of `k` daemons that
+#'     is created lazily on the first parallel call and kept warm for the rest
+#'     of the R session, then cleaned up automatically at session exit. This
+#'     avoids repeated pool start-up/teardown and is ideal for simulations,
+#'     benchmarks, and batch/HPC scripts that call `loo()`/`psis()` many times.
+#'     Local pools automatically use zero-copy shared memory (via \pkg{mori})
+#'     for the shared posterior draws.
+#'   * A pool you configure yourself with [mirai::daemons()] (including
+#'     remote/SSH/HPC daemons via `mirai::daemons(url = ...)`) always takes
+#'     precedence: loo reuses it and never tears it down.
+#'
diff --git a/man/ap_psis.Rd b/man/ap_psis.Rd
index 271718c8..28d56fbb 100644
--- a/man/ap_psis.Rd
+++ b/man/ap_psis.Rd
@@ -39,6 +39,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 }
 \description{
diff --git a/man/importance_sampling.Rd b/man/importance_sampling.Rd
index b9cdf75d..3d3bd72a 100644
--- a/man/importance_sampling.Rd
+++ b/man/importance_sampling.Rd
@@ -68,6 +68,25 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 }
 \description{
diff --git a/man/loo.Rd b/man/loo.Rd
index e6b48075..13ca47f5 100644
--- a/man/loo.Rd
+++ b/man/loo.Rd
@@ -85,6 +85,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{is_method}{The importance sampling method to use. The following methods
diff --git a/man/loo_approximate_posterior.Rd b/man/loo_approximate_posterior.Rd
index a17d59a2..8f3e88ea 100644
--- a/man/loo_approximate_posterior.Rd
+++ b/man/loo_approximate_posterior.Rd
@@ -66,6 +66,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{data, draws, ...}{For the \code{loo_approximate_posterior.function()} method,
diff --git a/man/loo_model_weights.Rd b/man/loo_model_weights.Rd
index e7323912..091382c5 100644
--- a/man/loo_model_weights.Rd
+++ b/man/loo_model_weights.Rd
@@ -85,6 +85,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{lpd_point}{If calling \code{stacking_weights()} or \code{pseudobma_weights()}
diff --git a/man/loo_moment_match.Rd b/man/loo_moment_match.Rd
index 6b1f648d..6d552387 100644
--- a/man/loo_moment_match.Rd
+++ b/man/loo_moment_match.Rd
@@ -78,6 +78,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 }
 \value{
diff --git a/man/loo_moment_match_split.Rd b/man/loo_moment_match_split.Rd
index 91a6d826..1ebe2600 100644
--- a/man/loo_moment_match_split.Rd
+++ b/man/loo_moment_match_split.Rd
@@ -64,6 +64,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{is_method}{The importance sampling method to use. The following methods
diff --git a/man/loo_subsample.Rd b/man/loo_subsample.Rd
index 6f381db6..7454bde2 100644
--- a/man/loo_subsample.Rd
+++ b/man/loo_subsample.Rd
@@ -80,6 +80,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{loo_approximation}{What type of approximation of the loo_i's should be used?
diff --git a/man/parallel_psis_list.Rd b/man/parallel_psis_list.Rd
index f9c0224c..f822311a 100644
--- a/man/parallel_psis_list.Rd
+++ b/man/parallel_psis_list.Rd
@@ -75,6 +75,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{method}{See \code{is_method} for \code{\link[=loo]{loo()}}}
diff --git a/man/psis.Rd b/man/psis.Rd
index 534792d8..21dfe0cc 100644
--- a/man/psis.Rd
+++ b/man/psis.Rd
@@ -56,6 +56,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{x}{For \code{is.psis()}, an object to check.}
@@ -82,7 +102,7 @@ page for details.
 }
 }
 
-Objects of class \code{"psis"} also have the following \link{attributes}:
+Objects of class \code{"psis"} also have the following \link[=attributes]{attributes}:
 \describe{
 \item{\code{norm_const_log}}{
 Vector of precomputed values of \code{colLogSumExps(log_weights)} that are
diff --git a/man/psis_approximate_posterior.Rd b/man/psis_approximate_posterior.Rd
index a86300ed..57afb12f 100644
--- a/man/psis_approximate_posterior.Rd
+++ b/man/psis_approximate_posterior.Rd
@@ -38,6 +38,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{save_psis}{Should the \code{psis} object created internally by \code{loo()} be
diff --git a/man/sis.Rd b/man/sis.Rd
index d39af007..6b8c26a4 100644
--- a/man/sis.Rd
+++ b/man/sis.Rd
@@ -45,6 +45,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 }
 \value{
@@ -67,7 +87,7 @@ A named list containing one vector:
 }
 }
 
-Objects of class \code{"sis"} also have the following \link{attributes}:
+Objects of class \code{"sis"} also have the following \link[=attributes]{attributes}:
 \describe{
 \item{\code{norm_const_log}}{
 Vector of precomputed values of \code{colLogSumExps(log_weights)} that are
diff --git a/man/tis.Rd b/man/tis.Rd
index 1747a64f..0c452f05 100644
--- a/man/tis.Rd
+++ b/man/tis.Rd
@@ -47,6 +47,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 }
 \value{
@@ -69,7 +89,7 @@ A named list containing one vector:
 }
 }
 
-Objects of class \code{"tis"} also have the following \link{attributes}:
+Objects of class \code{"tis"} also have the following \link[=attributes]{attributes}:
 \describe{
 \item{\code{norm_const_log}}{
 Vector of precomputed values of \code{colLogSumExps(log_weights)} that are
diff --git a/man/update.psis_loo_ss.Rd b/man/update.psis_loo_ss.Rd
index eed2eb34..8b82436a 100644
--- a/man/update.psis_loo_ss.Rd
+++ b/man/update.psis_loo_ss.Rd
@@ -62,6 +62,26 @@ recommend using as many (or close to as many) cores as possible.
 \href{https://github.com/stan-dev/loo/issues/94}{recommended} to avoid using
 the \code{.Rprofile} file to set \code{mc.cores} (using the \code{cores} argument or
 setting \code{mc.cores} interactively or in a script is fine).
+}
+
+Parallelism is implemented with the \pkg{mirai} package. There are three
+ways to control the backend, in increasing order of precedence:
+\itemize{
+\item \code{cores > 1} (the default behaviour): a local daemon pool is created for
+the duration of the call and torn down automatically when it returns, so
+no worker processes are left running. This is convenient for one-off
+calls but pays a small pool start-up/teardown cost on every call.
+\item \code{options(loo.daemons = k)} or the environment variable \code{LOO_DAEMONS=k}
+(with \code{k >= 2}): opt in to a \emph{persistent} local pool of \code{k} daemons that
+is created lazily on the first parallel call and kept warm for the rest
+of the R session, then cleaned up automatically at session exit. This
+avoids repeated pool start-up/teardown and is ideal for simulations,
+benchmarks, and batch/HPC scripts that call \code{loo()}/\code{psis()} many times.
+Local pools automatically use zero-copy shared memory (via \pkg{mori})
+for the shared posterior draws.
+\item A pool you configure yourself with \code{\link[mirai:daemons]{mirai::daemons()}} (including
+remote/SSH/HPC daemons via \code{mirai::daemons(url = ...)}) always takes
+precedence: loo reuses it and never tears it down.
 }}
 
 \item{loo_approximation}{What type of approximation of the loo_i's should be used?
diff --git a/tests/testthat/test_parallel.R b/tests/testthat/test_parallel.R
index dc887c04..3c0eb874 100644
--- a/tests/testthat/test_parallel.R
+++ b/tests/testthat/test_parallel.R
@@ -242,5 +242,118 @@ test_that("loo_model_weights() parallel equals serial", {
                tolerance = 1e-6)
 })
 
+# Persistent session pool (loo.daemons / LOO_DAEMONS) -----------------------
+
+test_that("loo_persist_config() resolves option/env var and rejects bad values", {
+  # Start from a known-clean configuration and restore it afterwards.
+  old_opt <- options(loo.daemons = NULL)
+  on.exit(options(old_opt), add = TRUE)
+  old_env <- Sys.getenv("LOO_DAEMONS", unset = NA)
+  Sys.unsetenv("LOO_DAEMONS")
+  on.exit(
+    if (!is.na(old_env)) {
+      Sys.setenv(LOO_DAEMONS = old_env)
+    } else {
+      Sys.unsetenv("LOO_DAEMONS")
+    },
+    add = TRUE
+  )
+
+  # Unset -> feature off.
+  expect_identical(loo:::loo_persist_config(), NA_integer_)
+
+  # Environment variable is parsed when the option is unset.
+  Sys.setenv(LOO_DAEMONS = "3")
+  expect_identical(loo:::loo_persist_config(), 3L)
+
+  # Option takes precedence over the environment variable.
+  options(loo.daemons = 4)
+  expect_identical(loo:::loo_persist_config(), 4L)
+
+  # 0/1, non-integer, and garbage values all disable the feature (no error).
+  options(loo.daemons = 1)
+  expect_identical(loo:::loo_persist_config(), NA_integer_)
+  options(loo.daemons = 0)
+  expect_identical(loo:::loo_persist_config(), NA_integer_)
+  options(loo.daemons = 2.5)
+  expect_identical(loo:::loo_persist_config(), NA_integer_)
+  options(loo.daemons = "garbage")
+  expect_identical(suppressWarnings(loo:::loo_persist_config()), NA_integer_)
+})
+
+test_that("persistent pool is created lazily and reused across calls", {
+  skip_on_cran()
+  mirai::daemons(0)
+  expect_false(loo:::loo_has_pool())
+
+  old_opt <- options(loo.daemons = 2)
+  on.exit(options(old_opt), add = TRUE)
+  on.exit(mirai::daemons(0), add = TRUE)
+
+  ps_serial <- suppressWarnings(psis(-LLmat, r_eff = r_eff, cores = 1))
+  # cores = 1 work must not spin up the persistent pool.
+  expect_false(loo:::loo_has_pool())
+
+  # First parallel call creates the pool and leaves it warm.
+  ps1 <- suppressWarnings(psis(-LLmat, r_eff = r_eff, cores = 2))
+  expect_true(loo:::loo_has_pool())
+  expect_equal(loo:::loo_n_workers(2), 2L)
+
+  # Second call reuses the same pool (still 2 connected daemons).
+  ps2 <- suppressWarnings(psis(-LLmat, r_eff = r_eff, cores = 2))
+  expect_true(loo:::loo_has_pool())
+  expect_equal(loo:::loo_n_workers(2), 2L)
+
+  # Results match the serial computation.
+  expect_equal(ps1$log_weights, ps_serial$log_weights)
+  expect_equal(ps2$log_weights, ps_serial$log_weights)
+})
+
+test_that("a user-configured pool takes precedence over loo.daemons", {
+  skip_on_cran()
+  mirai::daemons(0)
+  old_opt <- options(loo.daemons = 4)
+  on.exit(options(old_opt), add = TRUE)
+
+  # User sets up their own pool; loo must reuse it untouched and not replace
+  # it with a persistent pool of the configured size.
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+
+  ps <- suppressWarnings(psis(-LLmat, r_eff = r_eff, cores = 2))
+  expect_true(loo:::loo_has_pool())
+  expect_equal(loo:::loo_n_workers(2), 2L)
+})
+
+test_that("a persistent-pool child process exits cleanly (no orphans)", {
+  skip_on_cran()
+  skip_on_os("windows")
+  rscript <- file.path(R.home("bin"), "Rscript")
+  skip_if_not(file.exists(rscript))
+
+  script <- tempfile(fileext = ".R")
+  on.exit(unlink(script), add = TRUE)
+  writeLines(
+    c(
+      "library(loo)",
+      "LLarr <- example_loglik_array()",
+      "r_eff <- relative_eff(exp(LLarr))",
+      "invisible(suppressWarnings(loo(LLarr, r_eff = r_eff, cores = 2)))",
+      "cat('LOO_CHILD_OK\\n')"
+    ),
+    script
+  )
+  out <- suppressWarnings(system2(
+    rscript,
+    c("--vanilla", shQuote(script)),
+    stdout = TRUE,
+    stderr = TRUE,
+    env = "LOO_DAEMONS=2"
+  ))
+  status <- attr(out, "status")
+  expect_true(is.null(status) || identical(as.integer(status), 0L))
+  expect_true(any(grepl("LOO_CHILD_OK", out)))
+})
+
 # Final safety net in case any test above exited early with a live pool.
 mirai::daemons(0)
diff --git a/vignettes/loo2-parallel.Rmd b/vignettes/loo2-parallel.Rmd
new file mode 100644
index 00000000..443569e3
--- /dev/null
+++ b/vignettes/loo2-parallel.Rmd
@@ -0,0 +1,230 @@
+---
+title: "Parallel and distributed loo"
+author: "The loo developers"
+date: "`r Sys.Date()`"
+output:
+  html_vignette:
+    toc: yes
+params:
+  EVAL: !r identical(Sys.getenv("NOT_CRAN"), "true")
+---
+<!--
+%\VignetteEngine{knitr::rmarkdown}
+%\VignetteIndexEntry{Parallel and distributed loo}
+-->
+```{r, child="children/SETTINGS-knitr.txt"}
+```
+
+```{r, child="children/SEE-ONLINE.txt", eval = if (isTRUE(exists("params"))) !params$EVAL else TRUE}
+```
+
+# Introduction
+
+Computing PSIS-LOO, WAIC, and related quantities is *embarrassingly parallel*:
+the expensive work is done once per observation (or per fold), and these
+computations are independent of one another. The __loo__ package can therefore
+spread this work across multiple cores or even multiple machines.
+
+As of __loo__ 2.10 the parallel backend is built on the
+[__mirai__](https://mirai.r-lib.org/) package, with
+[__mori__](https://mirai.r-lib.org/) providing zero-copy sharing of large
+objects (such as the posterior draws) between local workers. You do not need to
+learn either package to benefit from parallelism, but understanding the three
+ways to control the backend lets you pick the right one for your workflow:
+
+1. **Per-call parallelism** with the `cores` argument --- the simplest option,
+   good for one-off calls.
+2. **A persistent session pool** via `options(loo.daemons = k)` or the
+   `LOO_DAEMONS` environment variable --- best when you call __loo__ functions
+   *many* times in one session (simulations, benchmarks, batch/HPC scripts).
+3. **A pool you manage yourself** with `mirai::daemons()` --- needed for
+   remote/SSH/HPC workers, and always takes precedence over the options above.
+
+Every __loo__ function that does per-observation work accepts a `cores`
+argument and respects these settings, including `loo()`, `psis()`, `waic()`,
+`relative_eff()`, `loo_subsample()`, `loo_moment_match()`, and
+`loo_model_weights()`.
+
+# Setup
+
+The examples below only need the __loo__ package and its built-in example
+log-likelihood objects, so they are fast and self-contained.
+
+```{r setup, message=FALSE}
+library("loo")
+
+# A 500 (draws) x 2 (chains) x 32 (observations) log-likelihood array.
+LLarr <- example_loglik_array()
+r_eff <- relative_eff(exp(LLarr))
+```
+
+# 1. Per-call parallelism with `cores`
+
+The `cores` argument is the simplest way to parallelize. Passing `cores > 1`
+tells __loo__ to create a local pool of worker processes *for the duration of
+that single call* and to shut it down automatically when the call returns:
+
+```{r cores-basic}
+# Serial (the default)
+loo_serial <- loo(LLarr, r_eff = r_eff, cores = 1)
+
+# Parallel across 2 local workers, then automatically cleaned up
+loo_parallel <- loo(LLarr, r_eff = r_eff, cores = 2)
+```
+
+The results are identical --- parallelism only changes *how* the work is
+scheduled, never the answer:
+
+```{r cores-equal}
+all.equal(loo_serial$estimates, loo_parallel$estimates)
+```
+
+You can set the default number of cores for a whole session with the standard
+`mc.cores` option, so you don't have to pass `cores` to every call:
+
+```{r mc-cores, eval=FALSE}
+options(mc.cores = 4)
+loo(LLarr, r_eff = r_eff) # uses 4 cores
+```
+
+Because the pool is created and destroyed on every call, this mode pays a small
+start-up/teardown cost each time. For a single analysis that cost is
+negligible, but if you call __loo__ functions repeatedly it can add up --- which
+is exactly what the next section addresses.
+
+# 2. A persistent session pool: `loo.daemons` / `LOO_DAEMONS`
+
+When you call __loo__ functions many times in one session --- for example in a
+simulation study, a benchmark, or a script that loops over many models --- you
+can avoid repeatedly spinning workers up and down by keeping a pool **warm for
+the whole session**.
+
+Opt in by setting either the `loo.daemons` R option or the `LOO_DAEMONS`
+environment variable to the desired number of workers (an integer `>= 2`):
+
+```{r persistent-option, eval=FALSE}
+# Either set the R option ...
+options(loo.daemons = 4)
+
+# ... or, equivalently, set the environment variable (useful for batch/HPC
+# scripts and `Rscript`):
+Sys.setenv(LOO_DAEMONS = 4)
+```
+
+With this enabled, __loo__:
+
+* creates a local pool **lazily**, on the first call that actually needs
+  parallelism (so nothing is started if you only do serial work),
+* **reuses** that same warm pool for every later call, and
+* **cleans it up automatically** when the R session ends (so no worker
+  processes are left behind, even in non-interactive scripts).
+
+This is ideal for loops like the following, where the pool is created once on
+the first iteration and reused thereafter:
+
+```{r persistent-loop, eval=FALSE}
+options(loo.daemons = 4)
+
+# Imagine `model_loglik` returns a log-likelihood matrix for each model.
+results <- lapply(seq_len(100), function(i) {
+  ll <- model_loglik(i)
+  loo(ll, r_eff = relative_eff(exp(ll)), cores = 4)
+})
+```
+
+A couple of details worth knowing:
+
+* `cores > 1` is still the per-call switch that *enables* parallelism; the size
+  of the persistent pool comes from `loo.daemons`/`LOO_DAEMONS`, not from
+  `cores`.
+* Invalid values disable the feature: anything below `2`, a non-integer, or
+  unset all mean "no persistent pool" (and __loo__ falls back to the per-call
+  behavior from Section 1).
+
+From a script, the most convenient pattern is to set the environment variable
+outside R so you never touch __loo__-specific code:
+
+```bash
+LOO_DAEMONS=8 Rscript my_simulation.R
+```
+
+# 3. User-managed pools, including remote/HPC workers
+
+If you configure a pool yourself with `mirai::daemons()`, __loo__ will **reuse
+it and never tear it down**. A user-managed pool always takes precedence over
+both the `cores` per-call behavior and the `loo.daemons` option, which makes it
+the right tool for advanced setups --- in particular **remote, SSH, or HPC
+cluster** workers.
+
+A local pool you manage by hand behaves just like the persistent pool from
+Section 2:
+
+```{r user-local, eval=FALSE}
+library(mirai)
+
+daemons(4)                                  # start 4 local workers
+loo1 <- loo(LLarr, r_eff = r_eff, cores = 2) # reuses your pool
+loo2 <- loo(LLarr, r_eff = r_eff, cores = 2) # reuses it again
+daemons(0)                                  # you decide when to stop them
+```
+
+For distributed computing, point __mirai__ at remote machines or a cluster
+scheduler. __loo__ does not need to know any of the details --- it simply uses
+whatever pool is connected:
+
+```{r user-remote, eval=FALSE}
+library(mirai)
+
+# Launch workers on remote hosts over SSH (see ?mirai::daemons and
+# ?mirai::ssh_config for the full set of options, including HPC launchers).
+daemons(
+  n = 8,
+  url = host_url(),
+  remote = ssh_config(c("ssh://node1", "ssh://node2"))
+)
+
+loo_big <- loo(large_loglik, r_eff = r_eff_big, cores = 8)
+
+daemons(0)
+```
+
+# Large objects and shared memory
+
+When __loo__ uses the function method (e.g. `loo(llfun, data = ..., draws =
+...)`), the posterior `draws` are reused identically by every observation. On a
+**local** pool these large objects are written once into shared memory with
+__mori__ so each worker maps the same physical pages (zero-copy), which keeps
+memory use low even with many workers.
+
+On a **remote** pool shared memory is not available, so __loo__ falls back to
+ordinary serialization and chunks the work to send each large object roughly
+once per worker rather than once per observation. You don't need to do anything
+to get this behavior --- __loo__ detects whether the active pool is local and
+chooses the appropriate transport automatically.
+
+# Choosing a strategy
+
+| Situation | Recommended approach |
+|---|---|
+| A single `loo()`/`psis()` call | `cores = k` (Section 1) |
+| Default cores for a session | `options(mc.cores = k)` |
+| Many calls in one session (simulations, benchmarks, batch jobs) | `options(loo.daemons = k)` or `LOO_DAEMONS=k` (Section 2) |
+| Remote / SSH / HPC workers | `mirai::daemons(url = ..., remote = ...)` (Section 3) |
+
+A few general notes:
+
+* All approaches give identical results; they differ only in scheduling and
+  set-up cost.
+* These mechanisms compose safely. A pool you set up yourself is always reused
+  and never removed by __loo__, and nested __loo__ calls reuse an outer pool
+  rather than creating a new one. For example, `loo_model_weights()` sets up a
+  single pool and reuses it across all of the models it evaluates.
+* On Windows, avoid setting `mc.cores` in your `.Rprofile`
+  (see [this issue](https://github.com/stan-dev/loo/issues/94)); pass `cores`
+  explicitly or set the option in your script instead.
+
+# See also
+
+* `help("loo", package = "loo")` for the `cores` argument documentation.
+* The [__mirai__](https://mirai.r-lib.org/) documentation, especially
+  `?mirai::daemons`, for configuring local, remote, and HPC pools.

From 89187e1ef605b7727c82739242cd2be559bb791c Mon Sep 17 00:00:00 2001
From: Florence Bockting <florence.bockting@aalto.fi>
Date: Tue, 30 Jun 2026 17:39:17 +0300
Subject: [PATCH 4/6] ignore cores if daemons are set

---
 R/parallel.R                   | 84 +++++++++++++++++++++++++++-------
 tests/testthat/test_parallel.R | 32 ++++++++++++-
 vignettes/loo2-parallel.Rmd    | 74 +++++++++++++++++++++++++-----
 3 files changed, 162 insertions(+), 28 deletions(-)

diff --git a/R/parallel.R b/R/parallel.R
index e7ff0f0b..ca2d7808 100644
--- a/R/parallel.R
+++ b/R/parallel.R
@@ -9,6 +9,9 @@
 #'   session-exit finalizer is only registered once.
 #' * `warned_bad_daemons`: guards the malformed-config warning in
 #'   `loo_persist_config()` so it is only emitted once per session.
+#' * `informed_cores_ignored`: guards the "a pool is connected so `cores` is
+#'   ignored" message in `with_loo_daemons()` so it is only emitted once per
+#'   session.
 #'
 #' It also serves as the object the daemon-cleanup finalizer is attached to.
 .loo_internal <- new.env(parent = emptyenv())
@@ -76,6 +79,32 @@ loo_warn_bad_daemons <- function(value) {
   invisible(NULL)
 }
 
+#' Inform (once per session) that `cores` is ignored while a pool is connected
+#'
+#' @noRd
+#' @keywords internal
+#' @description
+#' Emitted by `with_loo_daemons()` when a mirai daemon pool is connected (a
+#' user-managed pool, or a persistent pool left warm by an earlier call) and
+#' the current call passed `cores <= 1`. When a pool is connected loo always
+#' uses it, so the `cores` argument is ignored and the work runs in parallel
+#' regardless of its value. This message makes that behaviour visible the
+#' first time a call looks like it asked for serial execution; the usual cause
+#' is relying on the default `cores = getOption("mc.cores", 1)` after setting
+#' up daemons.
+loo_inform_cores_ignored <- function() {
+  if (isTRUE(.loo_internal$informed_cores_ignored)) {
+    return(invisible(NULL))
+  }
+  .loo_internal$informed_cores_ignored <- TRUE
+  message(
+    "A mirai daemon pool is connected, so 'cores' is ignored and this call ",
+    "runs in parallel on the existing pool. Call mirai::daemons(0) to stop ",
+    "the pool if you want serial execution."
+  )
+  invisible(NULL)
+}
+
 #' Register a one-time session-exit cleanup for the persistent daemon pool
 #'
 #' @noRd
@@ -108,11 +137,17 @@ loo_register_daemon_cleanup <- function() {
 #' [mirai::daemons()] pool exists for the duration of a computation. It is
 #' deliberately a good citizen of the user's session:
 #'
-#' * `cores <= 1`: runs `code` serially without touching daemons.
-#' * A daemon pool is already configured (e.g. the user called
-#'   [mirai::daemons()] themselves, possibly with remote/HPC daemons): `code`
-#'   runs on the existing pool, which is left untouched. This always takes
-#'   precedence over the options below.
+#' * A daemon pool is already connected (e.g. the user called
+#'   [mirai::daemons()] themselves, possibly with remote/HPC daemons, or a
+#'   persistent pool was left warm by an earlier call): `code` runs on the
+#'   existing pool, which is left untouched. **A connected pool always wins**,
+#'   so the `cores` argument is ignored entirely in this case (loo uses the
+#'   pool regardless of `cores`). If `cores <= 1` here -- i.e. the call looked
+#'   like it requested serial execution -- a one-time-per-session message
+#'   notes that `cores` is being ignored (via `loo_inform_cores_ignored()`).
+#' * `cores <= 1` with no pool connected: runs `code` serially without
+#'   touching daemons. (A configured `loo.daemons` pool is created lazily only
+#'   when `cores > 1`, so serial-only work never starts workers.)
 #' * Otherwise, if the user opted in to a persistent session pool via the
 #'   `loo.daemons` option or `LOO_DAEMONS` environment variable (see
 #'   `loo_persist_config()`): a local pool of that size is created lazily on
@@ -130,16 +165,29 @@ loo_register_daemon_cleanup <- function() {
 #' existing pool, it is safe to nest: an inner call made while an outer call
 #' already established a pool simply reuses it instead of creating another.
 #'
-#' @param cores Integer number of cores requested by the user. Acts as the
-#'   per-call "enable parallel" switch; the persistent pool size, when enabled,
-#'   comes from `loo_persist_config()` rather than from `cores`.
+#' @param cores Integer number of cores requested by the user. When no pool is
+#'   connected this is the per-call "enable parallel" switch (and the size of
+#'   the ephemeral pool). When a pool is already connected it is ignored --
+#'   loo always uses the connected pool. The persistent pool size, when
+#'   enabled, comes from `loo_persist_config()` rather than from `cores`.
 #' @param code Expression to evaluate. Lazily evaluated in the calling
 #'   environment, after any daemon pool has been set up.
 #' @return The value of `code`.
 with_loo_daemons <- function(cores, code) {
-  if (cores <= 1 || loo_has_pool()) {
-    # Serial work, or reuse the daemon pool the user (or an outer loo call)
-    # already configured. This always wins over the persistent-pool option.
+  if (loo_has_pool()) {
+    # A pool is connected (user-managed, or a warm persistent pool): use it
+    # regardless of `cores`. This always wins over the options below. If the
+    # call looked like it asked for serial work, note once that `cores` is
+    # being ignored.
+    if (cores <= 1) {
+      loo_inform_cores_ignored()
+    }
+    return(code)
+  }
+  if (cores <= 1) {
+    # No pool connected and no parallelism requested: run serially. A
+    # configured loo.daemons pool is created lazily only when cores > 1, so
+    # serial-only work never starts workers.
     return(code)
   }
   persist <- loo_persist_config()
@@ -221,7 +269,7 @@ loo_pool_is_local <- function() {
 #' Replaces the previous platform-branching
 #' [parallel::mclapply()] / [parallel::parLapply()] code paths with a single
 #' [mirai::mirai_map()] path, while preserving the serial [lapply()] behaviour
-#' when no parallelism is requested or available.
+#' when no daemon pool is connected.
 #'
 #' Object transport is chosen automatically:
 #'
@@ -238,8 +286,10 @@ loo_pool_is_local <- function() {
 #' @param FUN Worker function. Called as `FUN(x, <broadcast>, <...>)`; the
 #'   names in `broadcast` and `...` must match `FUN`'s formals.
 #' @param ... Small constant arguments forwarded to `FUN` for every element.
-#' @param cores Integer number of cores requested by the user. Parallelism is
-#'   only used when `cores > 1` and a daemon pool is connected.
+#' @param cores Integer number of cores requested by the user. Used only as a
+#'   fallback worker count for chunking when (unexpectedly) no pool is
+#'   connected; it does not gate execution. Parallelism is used whenever a
+#'   daemon pool is connected (the connected pool always wins over `cores`).
 #' @param broadcast Named list of large objects reused by every element. See
 #'   Description for how these are transported.
 #' @param chunk Chunking strategy. `"auto"` (default) splits `X` into roughly
@@ -254,9 +304,11 @@ loo_map <- function(X, FUN, ..., cores = 1L, broadcast = list(),
   chunk <- match.arg(chunk)
   dots <- list(...)
 
-  if (!(cores > 1L && loo_has_pool())) {
+  if (!loo_has_pool()) {
     # Serial path: identical behaviour to a plain lapply() with the broadcast
-    # and constant arguments supplied by name.
+    # and constant arguments supplied by name. When a pool is connected loo
+    # uses it regardless of `cores`; the decision to create one lives upstream
+    # in with_loo_daemons().
     return(do.call(lapply, c(list(X, FUN), broadcast, dots)))
   }
 
diff --git a/tests/testthat/test_parallel.R b/tests/testthat/test_parallel.R
index 3c0eb874..106596a2 100644
--- a/tests/testthat/test_parallel.R
+++ b/tests/testthat/test_parallel.R
@@ -56,10 +56,11 @@ test_that("loo_map() runs serially when no pool is available", {
   expect_identical(res, as.list((1:5) * 2))
 })
 
-test_that("loo_map() runs serially when cores <= 1 even with a pool", {
+test_that("loo_map() uses a connected pool regardless of cores", {
   skip_on_cran()
   mirai::daemons(2)
   on.exit(mirai::daemons(0), add = TRUE)
+  # cores = 1 no longer forces serial: a connected pool always wins.
   res <- loo:::loo_map(1:5, function(x, m) x * m, m = 3, cores = 1)
   expect_identical(res, as.list((1:5) * 3))
 })
@@ -325,6 +326,35 @@ test_that("a user-configured pool takes precedence over loo.daemons", {
   expect_equal(loo:::loo_n_workers(2), 2L)
 })
 
+test_that("with_loo_daemons() informs (once) that cores is ignored with a pool", {
+  skip_on_cran()
+  mirai::daemons(0)
+  # Reset the once-per-session guard so this test is order-independent.
+  loo:::.loo_internal$informed_cores_ignored <- NULL
+  on.exit(loo:::.loo_internal$informed_cores_ignored <- NULL, add = TRUE)
+
+  # No pool connected: cores = 1 runs serially without a message.
+  expect_silent(loo:::with_loo_daemons(1, 42))
+
+  mirai::daemons(2)
+  on.exit(mirai::daemons(0), add = TRUE)
+
+  # Pool connected and cores = 1: informs once that cores is ignored, and the
+  # value still comes back (work runs on the pool).
+  expect_message(
+    out <- loo:::with_loo_daemons(1, 42),
+    "'cores' is ignored"
+  )
+  expect_identical(out, 42)
+
+  # Message is emitted at most once per session.
+  expect_silent(loo:::with_loo_daemons(1, 42))
+
+  # cores > 1 with a pool never triggers the message (parallel was requested).
+  loo:::.loo_internal$informed_cores_ignored <- NULL
+  expect_silent(loo:::with_loo_daemons(2, 42))
+})
+
 test_that("a persistent-pool child process exits cleanly (no orphans)", {
   skip_on_cran()
   skip_on_os("windows")
diff --git a/vignettes/loo2-parallel.Rmd b/vignettes/loo2-parallel.Rmd
index 443569e3..2738ae94 100644
--- a/vignettes/loo2-parallel.Rmd
+++ b/vignettes/loo2-parallel.Rmd
@@ -45,6 +45,12 @@ argument and respects these settings, including `loo()`, `psis()`, `waic()`,
 `relative_eff()`, `loo_subsample()`, `loo_moment_match()`, and
 `loo_model_weights()`.
 
+One rule ties all three together: **when a daemon pool is already connected
+(options 2 or 3), __loo__ always uses it and the `cores` argument is ignored.**
+`cores` only takes effect while no pool is connected (option 1, or option 2
+before its pool has been created). See
+[How `cores` interacts with a pool](#how-cores-interacts-with-a-pool).
+
 # Setup
 
 The examples below only need the __loo__ package and its built-in example
@@ -113,9 +119,10 @@ Sys.setenv(LOO_DAEMONS = 4)
 
 With this enabled, __loo__:
 
-* creates a local pool **lazily**, on the first call that actually needs
-  parallelism (so nothing is started if you only do serial work),
-* **reuses** that same warm pool for every later call, and
+* creates a local pool **lazily**, on the first call made with `cores > 1`
+  (so nothing is started if you only do serial work),
+* **reuses** that same warm pool for every later call --- which, because a
+  connected pool always wins, then runs in parallel *regardless* of `cores`, and
 * **cleans it up automatically** when the R session ends (so no worker
   processes are left behind, even in non-interactive scripts).
 
@@ -126,6 +133,9 @@ the first iteration and reused thereafter:
 options(loo.daemons = 4)
 
 # Imagine `model_loglik` returns a log-likelihood matrix for each model.
+# `cores = 4` on the first iteration triggers creation of the warm pool; on
+# later iterations the pool already exists, so `cores` is ignored and the work
+# simply runs on it.
 results <- lapply(seq_len(100), function(i) {
   ll <- model_loglik(i)
   loo(ll, r_eff = relative_eff(exp(ll)), cores = 4)
@@ -134,9 +144,10 @@ results <- lapply(seq_len(100), function(i) {
 
 A couple of details worth knowing:
 
-* `cores > 1` is still the per-call switch that *enables* parallelism; the size
-  of the persistent pool comes from `loo.daemons`/`LOO_DAEMONS`, not from
-  `cores`.
+* `cores > 1` is what *triggers* the lazy creation of the pool on the first
+  call; the size of the persistent pool comes from `loo.daemons`/`LOO_DAEMONS`,
+  not from `cores`. Once the pool is warm, every later call uses it regardless
+  of `cores` (see "How `cores` interacts with a pool" below).
 * Invalid values disable the feature: anything below `2`, a non-integer, or
   unset all mean "no persistent pool" (and __loo__ falls back to the per-call
   behavior from Section 1).
@@ -157,15 +168,16 @@ the right tool for advanced setups --- in particular **remote, SSH, or HPC
 cluster** workers.
 
 A local pool you manage by hand behaves just like the persistent pool from
-Section 2:
+Section 2. Because a connected pool always wins, you do not even need to pass
+`cores` --- __loo__ uses the pool whatever `cores` is:
 
 ```{r user-local, eval=FALSE}
 library(mirai)
 
-daemons(4)                                  # start 4 local workers
-loo1 <- loo(LLarr, r_eff = r_eff, cores = 2) # reuses your pool
-loo2 <- loo(LLarr, r_eff = r_eff, cores = 2) # reuses it again
-daemons(0)                                  # you decide when to stop them
+daemons(4)                       # start 4 local workers
+loo1 <- loo(LLarr, r_eff = r_eff) # reuses your pool (cores not needed)
+loo2 <- loo(LLarr, r_eff = r_eff) # reuses it again
+daemons(0)                       # you decide when to stop them
 ```
 
 For distributed computing, point __mirai__ at remote machines or a cluster
@@ -188,6 +200,42 @@ loo_big <- loo(large_loglik, r_eff = r_eff_big, cores = 8)
 daemons(0)
 ```
 
+# How `cores` interacts with a pool
+
+The rule is simple: **whenever a daemon pool is connected, __loo__ uses it and
+ignores `cores`.** This applies both to a pool you set up yourself (Section 3)
+and to a `loo.daemons` pool once it has been created (Section 2). In that state
+the *value* of `cores` does not matter --- `cores = 2` and `cores = 6` behave
+identically, and even `cores = 1` (the default when `mc.cores` is unset) still
+runs in parallel on the pool. The first time a call would otherwise have looked
+like a request for serial execution (`cores <= 1`) while a pool is connected,
+__loo__ prints a one-time message noting that `cores` is being ignored.
+
+`cores` only controls things when **no** pool is connected: there `cores > 1`
+both enables parallelism and sets the size of the temporary per-call pool
+(Section 1), and it is what triggers lazy creation of a `loo.daemons` pool
+(Section 2). If you genuinely want a single call to run serially while a pool
+is up, stop the pool first with `mirai::daemons(0)`.
+
+## When is the number of worker daemons equal to `cores`?
+
+It follows directly from the rule above. The number of workers actually doing
+the computation equals `cores` **only in the per-call case**; in every other
+case the worker count comes from the pool and `cores` is irrelevant:
+
+| How parallelism is set up | Number of worker daemons | Equal to `cores`? |
+|---|---|---|
+| Per-call, no pool connected (`cores = k`, Section 1) | `k` | **Yes** --- loo spawns exactly `cores` daemons for the call |
+| Persistent pool (`options(loo.daemons = k)`, Section 2) | `k` (from `loo.daemons`) | Only if you happen to pass the same `cores` |
+| User-managed pool (`mirai::daemons(n)`, Section 3) | `n` (whatever you started) | Only if you happen to pass the same `cores` |
+
+In other words, `cores` *is* the daemon count in the per-call path, but for a
+persistent or user-managed pool the daemon count is fixed by `loo.daemons` or
+by your `mirai::daemons()` call, and passing a different `cores` (including the
+default `1`) does **not** change how many workers run. Internally __loo__ always
+splits the work according to the number of daemons that are actually connected,
+not the requested `cores`.
+
 # Large objects and shared memory
 
 When __loo__ uses the function method (e.g. `loo(llfun, data = ..., draws =
@@ -219,6 +267,10 @@ A few general notes:
   and never removed by __loo__, and nested __loo__ calls reuse an outer pool
   rather than creating a new one. For example, `loo_model_weights()` sets up a
   single pool and reuses it across all of the models it evaluates.
+* A connected pool always wins: when one is active, `cores` is ignored and the
+  work runs on the pool (the first time a `cores <= 1` call is overridden this
+  way, __loo__ prints a one-time message). Stop the pool with
+  `mirai::daemons(0)` if you need a call to run serially.
 * On Windows, avoid setting `mc.cores` in your `.Rprofile`
   (see [this issue](https://github.com/stan-dev/loo/issues/94)); pass `cores`
   explicitly or set the option in your script instead.

From 5dd4b52cfa4950289ad75320fbadc1344882db03 Mon Sep 17 00:00:00 2001
From: Florence Bockting <florence.bockting@aalto.fi>
Date: Wed, 1 Jul 2026 10:52:19 +0300
Subject: [PATCH 5/6] update benchmark scenarios

---
 benchmark/README.md            | 24 +++++++--
 benchmark/benchmark-parallel.R | 92 +++++++++++++++++++++++-----------
 2 files changed, 84 insertions(+), 32 deletions(-)

diff --git a/benchmark/README.md b/benchmark/README.md
index 40174ff0..28e618bf 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -121,9 +121,27 @@ tree's real footprint).
 
 Edit the top of `benchmark-parallel.R` to match your machine / problem sizes:
 
-- `cores_grid` (default `c(1, 4, 8)`) — the core counts to sweep.
-- `iters` (default `5`) — iterations per `bench::mark()` measurement.
-- `psis_sizes` and the `loo.function` `draws` dimensions — the problem sizes.
+- `cores_grid` (default `c(1, 4, 6)`) — the core counts to sweep.
+- `iters` (default `10`) — iterations per `bench::mark()` measurement for the
+  small/cheap scenarios.
+- `big_iters` (default `5`) — iterations for the large, slow scenarios that are
+  actually worth parallelizing (fewer iterations keep total runtime sane).
+- `psis_sizes` and `loo_sizes` — the problem sizes. Each entry carries its own
+  `iters`.
+
+Which scenarios are actually worth parallelizing:
+
+- **`loo.function` (`loo_sizes`)** is the path that parallelizes well. `draws`
+  is shared zero-copy across local workers via `mori`, so only tiny
+  per-observation data/results move. The two larger configs (~9s and ~18s
+  serial) reach roughly 2.3x at 4 cores and 3x at 8 cores in local testing —
+  even with the per-call pool, so a single one-off call already benefits.
+- **Matrix `psis` (`psis_sizes`)** is *not* worth parallelizing, even when
+  large: it must ship a big log-ratio matrix out to the workers and return an
+  equally large weighted matrix, so it is communication-bound and stays near 1x
+  regardless of size. The large `S=6000 N=10000` entry (~480 MB) is included
+  deliberately to show this — make sure the machine has enough RAM before adding
+  bigger ones.
 
 For `peak-mem-run.R`, adjust `S`, `P`, and `Nf` to change the size of the
 broadcast `draws` matrix.
diff --git a/benchmark/benchmark-parallel.R b/benchmark/benchmark-parallel.R
index 4b45d68c..83c1f11b 100644
--- a/benchmark/benchmark-parallel.R
+++ b/benchmark/benchmark-parallel.R
@@ -25,29 +25,30 @@ suppressMessages({
 })
 
 is_new <- identical(label, "new")
-cores_grid <- c(1L, 4L, 8L)
-iters <- 10L
+cores_grid <- c(1L, 4L, 6L)
+iters <- 10L     # iterations for the small/cheap scenarios
+big_iters <- 5L  # fewer iterations for the large, slow "worth parallelizing" scenarios
 
 rows <- list()
-record <- function(scenario, mode, cores, expr) {
+record <- function(scenario, mode, cores, expr, n_iter = iters) {
   expr <- substitute(expr)
   pf <- parent.frame()
   eval(expr, pf) # warm up (process spawn / pool / JIT)
   b <- tryCatch(
     bench::mark(
       eval(expr, pf),
-      iterations = iters, check = FALSE, memory = TRUE, filter_gc = FALSE
+      iterations = n_iter, check = FALSE, memory = TRUE, filter_gc = FALSE
     ),
     error = function(e) {
       bench::mark(
         eval(expr, pf),
-        iterations = iters, check = FALSE, memory = FALSE, filter_gc = FALSE
+        iterations = n_iter, check = FALSE, memory = FALSE, filter_gc = FALSE
       )
     }
   )
   rows[[length(rows) + 1L]] <<- data.frame(
     label = label, scenario = scenario, mode = mode, cores = cores,
-    iters = iters,
+    iters = n_iter,
     median_s = as.numeric(b$median),
     mem_mb = if ("mem_alloc" %in% names(b) && !is.na(b$mem_alloc[1])) {
       as.numeric(b$mem_alloc) / 1e6
@@ -64,58 +65,91 @@ record <- function(scenario, mode, cores, expr) {
 
 # ---------------------------------------------------------------------------
 # Scenario 1: standalone PSIS over a log-ratio matrix (partitioned columns).
+#
+# Counter-intuitively, the matrix `psis()` path is NOT worth parallelizing even
+# when it is large: it has to ship a big log-ratio matrix out to the workers and
+# return an equally large weighted matrix, so it is communication-bound. The
+# parallel speedup stays ~1x no matter how big the problem is. The first two
+# sizes are cheap exercisers; the last is large (~11s serial) and is included
+# precisely to *demonstrate* that size alone does not make this path worth
+# parallelizing -- compare its ~1x speedup with the loo.function scenarios below.
 # ---------------------------------------------------------------------------
-psis_sizes <- list(c(S = 2000, N = 1000), c(S = 4000, N = 4000))
+psis_sizes <- list(
+  list(S = 2000, N = 1000, iters = iters),       # cheap exerciser
+  list(S = 4000, N = 4000, iters = iters),       # cheap exerciser
+  list(S = 6000, N = 10000, iters = big_iters)   # large but communication-bound (~1x)
+)
 for (sz in psis_sizes) {
   set.seed(2024) # identical inputs across versions
   S <- sz[["S"]]
   N <- sz[["N"]]
+  n_iter <- sz[["iters"]]
   LL <- matrix(rnorm(S * N), nrow = S)
   re <- rep(1, N)
   scen <- sprintf("psis S=%d N=%d", S, N)
   for (k in cores_grid) {
     mode <- if (k == 1L) "serial" else "per-call"
-    record(scen, mode, k, suppressWarnings(psis(-LL, r_eff = re, cores = k)))
+    record(scen, mode, k, suppressWarnings(psis(-LL, r_eff = re, cores = k)), n_iter = n_iter)
     if (is_new && k > 1L) {
       # Opt in to loo's persistent session pool; the warm-up call inside
       # record() creates it and the timed iterations reuse it.
       old_opt <- options(loo.daemons = k)
-      record(scen, "persist", k, suppressWarnings(psis(-LL, r_eff = re, cores = k)))
+      record(scen, "persist", k, suppressWarnings(psis(-LL, r_eff = re, cores = k)), n_iter = n_iter)
       mirai::daemons(0)
       options(old_opt)
     }
   }
+  rm(LL)
+  gc()
 }
 
 # ---------------------------------------------------------------------------
 # Scenario 2: loo.function with a large broadcast `draws` matrix (the case
 # where fork shares memory for free and mori must recover that benefit).
+#
+# This is the path that is genuinely worth parallelizing: `draws` is shared
+# zero-copy across local workers via mori, and only tiny per-observation data
+# and results move, so the per-observation work parallelizes cleanly. The first
+# config is cheap (N=400, sub-second serial) and not worth parallelizing, but
+# the two larger ones (~9s and ~18s serial) scale well even with the per-call
+# pool -- ~2.3x at 4 cores and ~3x at 8 cores in local testing -- so a single
+# one-off call already benefits.
 # ---------------------------------------------------------------------------
-set.seed(7)
-S2 <- 8000L
-P <- 150L
-Nf <- 400L
-draws_big <- matrix(rnorm(S2 * P), nrow = S2, dimnames = list(NULL, paste0("p", seq_len(P))))
-data_f <- data.frame(y = rnorm(Nf))
 llfun_b <- function(data_i, draws, ...) {
   dnorm(data_i$y, mean = draws[, "p1"], sd = abs(draws[, "p2"]) + 0.5, log = TRUE)
 }
-scen <- sprintf("loo.function S=%d P=%d N=%d (draws=%.1fMB)", S2, P, Nf, S2 * P * 8 / 1e6)
-for (k in cores_grid) {
-  mode <- if (k == 1L) "serial" else "per-call"
-  record(scen, mode, k, suppressWarnings(
-    loo(llfun_b, data = data_f, draws = draws_big, cores = k)
-  ))
-  if (is_new && k > 1L) {
-    # Opt in to loo's persistent session pool; the warm-up call inside
-    # record() creates it and the timed iterations reuse it.
-    old_opt <- options(loo.daemons = k)
-    record(scen, "persist", k, suppressWarnings(
+loo_sizes <- list(
+  list(S = 8000, P = 150, N = 400, iters = iters),        # cheap: not worth parallelizing
+  list(S = 8000, P = 200, N = 8000, iters = big_iters),   # ~9s serial: worth parallelizing
+  list(S = 8000, P = 200, N = 10000, iters = big_iters)   # ~18s serial: worth parallelizing
+)
+for (sz in loo_sizes) {
+  set.seed(7) # identical inputs across versions
+  S2 <- sz[["S"]]
+  P <- sz[["P"]]
+  Nf <- sz[["N"]]
+  n_iter <- sz[["iters"]]
+  draws_big <- matrix(rnorm(S2 * P), nrow = S2, dimnames = list(NULL, paste0("p", seq_len(P))))
+  data_f <- data.frame(y = rnorm(Nf))
+  scen <- sprintf("loo.function S=%d P=%d N=%d (draws=%.1fMB)", S2, P, Nf, S2 * P * 8 / 1e6)
+  for (k in cores_grid) {
+    mode <- if (k == 1L) "serial" else "per-call"
+    record(scen, mode, k, suppressWarnings(
       loo(llfun_b, data = data_f, draws = draws_big, cores = k)
-    ))
-    mirai::daemons(0)
-    options(old_opt)
+    ), n_iter = n_iter)
+    if (is_new && k > 1L) {
+      # Opt in to loo's persistent session pool; the warm-up call inside
+      # record() creates it and the timed iterations reuse it.
+      old_opt <- options(loo.daemons = k)
+      record(scen, "persist", k, suppressWarnings(
+        loo(llfun_b, data = data_f, draws = draws_big, cores = k)
+      ), n_iter = n_iter)
+      mirai::daemons(0)
+      options(old_opt)
+    }
   }
+  rm(draws_big)
+  gc()
 }
 
 out <- do.call(rbind, rows)

From 2bee14f8682c5ff1f30c5b6bd5b337b312c38970 Mon Sep 17 00:00:00 2001
From: Florence Bockting <florence.bockting@aalto.fi>
Date: Wed, 1 Jul 2026 11:28:11 +0300
Subject: [PATCH 6/6] update benchmark results

---
 benchmark/bench-comparison.md | 62 ++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/benchmark/bench-comparison.md b/benchmark/bench-comparison.md
index 487b38f5..3707e39a 100644
--- a/benchmark/bench-comparison.md
+++ b/benchmark/bench-comparison.md
@@ -1,24 +1,33 @@
 # loo parallel benchmark comparison
 
-_Generated 2026-06-30 16:29:48._
+_Generated 2026-07-01 11:23:20._
 
 **Columns / modes.** `base` is the baseline version. `new/call` is the new version's default per-call pool (created and torn down each call). `new/persist` is the new version's persistent session pool (`options(loo.daemons = k)`), reused across calls. `cores = 1` rows are fully serial (the parallel backend is never used).
 
 ## Median wall-clock time (s) and speedup vs baseline
 
-Median over 10 iterations of one `psis()`/`loo()` call (a warm-up run is excluded). `speedup = base / new`, so a value `> 1` means the new version is faster. Expect `new/persist` to win when many calls reuse the pool, and `new/call` to look slower than `base` for cheap problems because it pays pool start-up/teardown on every call. The `cores = 1` `base` and `new/call` numbers should be roughly equal (both serial); sizeable gaps there are run-to-run noise, not real differences.
+Median over 10 iterations (small scenarios) or 5 iterations (large scenarios) of one `psis()`/`loo()` call (a warm-up run is excluded). `speedup = base / new`, so a value `> 1` means the new version is faster. Expect `new/persist` to win when many calls reuse the pool, and `new/call` to look slower than `base` for cheap problems because it pays pool start-up/teardown on every call. The `cores = 1` `base` and `new/call` numbers should be roughly equal (both serial); sizeable gaps there are run-to-run noise, not real differences.
 
 | scenario | cores | base | new/call | new/persist | speedup (call) | speedup (persist) |
 |:---|---:|---:|---:|---:|---:|---:|
-| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 1 | 0.458 | 0.406 | — | 1.13x | — |
-| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 4 | 0.670 | 1.315 | 0.175 | 0.51x | 3.83x |
-| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 8 | 0.536 | 1.433 | 0.133 | 0.37x | 4.02x |
-| psis S=2000 N=1000 | 1 | 0.416 | 0.403 | — | 1.03x | — |
-| psis S=2000 N=1000 | 4 | 0.237 | 1.262 | 0.158 | 0.19x | 1.51x |
-| psis S=2000 N=1000 | 8 | 0.205 | 1.276 | 0.136 | 0.16x | 1.51x |
-| psis S=4000 N=4000 | 1 | 2.998 | 2.628 | — | 1.14x | — |
-| psis S=4000 N=4000 | 4 | 2.253 | 2.385 | 1.455 | 0.94x | 1.55x |
-| psis S=4000 N=4000 | 8 | 1.707 | 2.258 | 1.050 | 0.76x | 1.63x |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 1 | 0.677 | 0.678 | — | 1.00x | — |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 4 | 0.468 | 1.161 | 0.231 | 0.40x | 2.02x |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 6 | 0.379 | 1.200 | 0.147 | 0.32x | 2.58x |
+| loo.function S=8000 P=200 N=10000 (draws=12.8MB) | 1 | 15.150 | 17.807 | — | 0.85x | — |
+| loo.function S=8000 P=200 N=10000 (draws=12.8MB) | 4 | 4.716 | 5.691 | 5.013 | 0.83x | 0.94x |
+| loo.function S=8000 P=200 N=10000 (draws=12.8MB) | 6 | 2.598 | 4.036 | 3.258 | 0.64x | 0.80x |
+| loo.function S=8000 P=200 N=8000 (draws=12.8MB) | 1 | 13.527 | 13.847 | — | 0.98x | — |
+| loo.function S=8000 P=200 N=8000 (draws=12.8MB) | 4 | 4.325 | 4.944 | 4.086 | 0.87x | 1.06x |
+| loo.function S=8000 P=200 N=8000 (draws=12.8MB) | 6 | 2.597 | 3.437 | 2.650 | 0.76x | 0.98x |
+| psis S=2000 N=1000 | 1 | 0.388 | 0.371 | — | 1.05x | — |
+| psis S=2000 N=1000 | 4 | 0.242 | 1.153 | 0.214 | 0.21x | 1.13x |
+| psis S=2000 N=1000 | 6 | 0.252 | 1.190 | 0.176 | 0.21x | 1.43x |
+| psis S=4000 N=4000 | 1 | 2.892 | 2.764 | — | 1.05x | — |
+| psis S=4000 N=4000 | 4 | 1.704 | 2.331 | 1.465 | 0.73x | 1.16x |
+| psis S=4000 N=4000 | 6 | 1.367 | 2.547 | 1.786 | 0.54x | 0.77x |
+| psis S=6000 N=10000 | 1 | 9.641 | 9.195 | — | 1.05x | — |
+| psis S=6000 N=10000 | 4 | 6.203 | 7.735 | 7.242 | 0.80x | 0.86x |
+| psis S=6000 N=10000 | 6 | 5.384 | 7.408 | 6.801 | 0.73x | 0.79x |
 
 ## Main-process memory allocation (MB)
 
@@ -27,24 +36,23 @@ Total bytes allocated on the R heap by the *coordinator* process during the call
 | scenario | cores | base | new/call | new/persist |
 |:---|---:|---:|---:|---:|
 | loo.function S=8000 P=150 N=400 (draws=9.6MB) | 1 | 803.292 | 803.292 | — |
-| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 4 | — | 0.720 | 0.700 |
-| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 8 | — | 1.297 | 1.276 |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 4 | — | 1.112 | 1.092 |
+| loo.function S=8000 P=150 N=400 (draws=9.6MB) | 6 | — | 1.596 | 1.576 |
+| loo.function S=8000 P=200 N=10000 (draws=12.8MB) | 1 | 20082.281 | 20082.281 | — |
+| loo.function S=8000 P=200 N=10000 (draws=12.8MB) | 4 | — | 2.917 | 2.897 |
+| loo.function S=8000 P=200 N=10000 (draws=12.8MB) | 6 | — | 3.401 | 3.381 |
+| loo.function S=8000 P=200 N=8000 (draws=12.8MB) | 1 | 16065.825 | 16065.825 | — |
+| loo.function S=8000 P=200 N=8000 (draws=12.8MB) | 4 | — | 2.541 | 2.521 |
+| loo.function S=8000 P=200 N=8000 (draws=12.8MB) | 6 | — | 3.025 | 3.005 |
 | psis S=2000 N=1000 | 1 | 304.130 | 304.130 | — |
-| psis S=2000 N=1000 | 4 | — | 120.805 | 120.785 |
-| psis S=2000 N=1000 | 8 | — | 121.322 | 121.302 |
+| psis S=2000 N=1000 | 4 | — | 120.986 | 120.966 |
+| psis S=2000 N=1000 | 6 | — | 121.335 | 121.315 |
 | psis S=4000 N=4000 | 1 | 2306.371 | 2306.371 | — |
-| psis S=4000 N=4000 | 4 | — | 961.519 | 961.498 |
-| psis S=4000 N=4000 | 8 | — | 962.035 | 962.015 |
-
-## Peak RSS of the whole process tree (MB)
-
-From `peak-mem.sh` (single large-`draws` `loo()` run; Linux only). Maximum summed resident memory of the *entire* process tree (main process plus all workers), sampled during the run. This is the metric for the job's real memory footprint. On a local pool, `mori` shares the `draws` matrix across workers (zero-copy), so peak RSS stays close to a single copy rather than growing with the number of workers.
-
-| label | mode | cores | peak RSS (MB) |
-|:---|:---|---:|---:|
-| baseline | per-call | 8 | 2418 |
-| new | per-call | 8 | 343 |
-| new | persist | 8 | 343 |
+| psis S=4000 N=4000 | 4 | — | 961.700 | 961.679 |
+| psis S=4000 N=4000 | 6 | — | 962.048 | 962.028 |
+| psis S=6000 N=10000 | 1 | 8458.151 | 8458.151 | — |
+| psis S=6000 N=10000 | 4 | — | 3603.208 | 3603.188 |
+| psis S=6000 N=10000 | 6 | — | 3603.557 | 3603.537 |
 
 ## Caveats