The goal of this vignette is to explain how to use ResamplingSameOtherSizesCV for various kinds of cross-validation.

Simulations

We begin with a simple simulated data set.

Comparing training on Same/Other/All subsets

## Simulate the input: N values drawn uniformly between -abs.x and abs.x.
N <- 2100
abs.x <- 70
## Set the random seed before drawing the simulated values.
set.seed(2)
x.vec <- runif(N, -abs.x, abs.x)
## Display a compact summary of the simulated vector.
str(x.vec)
#> num [1:2100] -44.1 28.3 10.3 -46.5 62.1 ...
library(data.table)
#>
#>Attaching package: ‘data.table’
#>
#>The following object is masked from ‘package:base’:
#>
#>    %notin%
#>
## Build the data table: y is sin(x) plus Gaussian noise with standard
## deviation 0.5. The outer parentheses print the result.
(task.dt <- data.table(
  x=x.vec,
  y = sin(x.vec)+rnorm(N,sd=0.5)))

x	y
-44.116	-0.408
28.332	-0.085
10.266	-1.233
-46.473	-1.362
62.138	-1.338
⋮	⋮
60.838	-0.107
55.715	-0.924
14.310	1.045
27.180	1.678
23.672	-0.269

## Draw a scatterplot of the simulated data, if ggplot2 can be attached.
## text.size and my_theme are also defined here, because they are re-used
## by later figures in this vignette.
if (require(ggplot2)) {
  text.size <- 6
  my_theme <- theme_bw(base_size = 20)
  theme_set(my_theme)
  ggplot() +
    geom_point(
      mapping = aes(x = x, y = y),
      data = task.dt,
      shape = 1
    )
}
#>Loading required package: ggplot2

Above we see a scatterplot of the simulated data. The goal of the learning algorithm will be to predict y from x.

The code below assigns three test groups to the randomly simulated data.

## Each consecutive run of atomic.group.size rows gets the same agroup
## value; agroup is later assigned the group column role of the task.
atomic.group.size <- 2
task.dt[, agroup := rep(seq(1, N/atomic.group.size), each=atomic.group.size)][]

x	y	agroup
-44.116	-0.408	1
28.332	-0.085	1
10.266	-1.233	2
-46.473	-1.362	2
62.138	-1.338	3
⋮	⋮	⋮
60.838	-0.107	1048
55.715	-0.924	1049
14.310	1.045	1049
27.180	1.678	1050
23.672	-0.269	1050

## Assign every row to one of three subsets (A, B, C). Out of every 7 atomic
## groups, the pattern gives 1 to A, 2 to B, and 4 to C; the pattern is
## recycled to the number of rows. The length.out argument name is written
## in full, instead of relying on partial matching of an abbreviation.
task.dt[, random_group := rep(
  rep(c("A","B","B","C","C","C","C"), each=atomic.group.size),
  length.out=.N
)][]

x	y	agroup	random_group
-44.116	-0.408	1	A
28.332	-0.085	1	A
10.266	-1.233	2	B
-46.473	-1.362	2	B
62.138	-1.338	3	B
⋮	⋮	⋮	⋮
60.838	-0.107	1048	C
55.715	-0.924	1049	C
14.310	1.045	1049	C
27.180	1.678	1050	C
23.672	-0.269	1050	C

## Count the rows in each random group, and keep the table (rather than
## the raw column) in group.tab. The outer parentheses print the counts.
(group.tab <- table(task.dt$random_group))
#>
#>   A    B    C 
#> 300  600 1200

The output above shows the number of rows in each random group. Before we create a task from these data, we need to load the mlr3resampling package, or we will get an error like below, because this is not a standard mlr3 role.

bad.task <- mlr3::TaskRegr$new("bad", task.dt, target="y")
bad.task$col_roles$subset <- "random_group"
#>Error in .__Task__col_roles(self = self, private = private, super = super, : Assertion on 'names(rhs)' failed: Names must be a permutation of set {'feature','target','name','order','stratum','group','offset','weights_learner','weights_measure'}, but has extra elements {'subset'}.

Below we define the cross-validation object, which loads the mlr3resampling package,

soak_default <- mlr3resampling::ResamplingSameOtherSizesCV$new()

Below we define a task,

## Regression task with target y.
reg.task <- mlr3::TaskRegr$new(
  "sin", task.dt, target="y")
## Rows with the same agroup value must stay together when splitting.
reg.task$col_roles$group <- "agroup"
## random_group is used as the stratification variable.
reg.task$col_roles$stratum <- "random_group"
## x is the only feature (agroup and random_group are not inputs).
reg.task$col_roles$feature <- "x"

Below we assign the random group column to be used as the subset role.

reg.task$col_roles$subset <- "random_group"

Below we instantiate a clone of the resampler, in order to show details about how it works (but normally you should not instantiate it yourself, as this will be done automatically inside the call to mlr3::benchmark).

soak_default$clone()$instantiate(reg.task)$instance$iteration.dt

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets
A	all	700	1	43,44,57,58,71,72,…[100]	1, 2, 7, 8, 9,10,…[1400]	1	700	1	all
B	all	700	1	3, 4, 5, 6,17,18,…[200]	1, 2, 7, 8, 9,10,…[1400]	1	700	2	all
C	all	700	1	23,24,25,26,37,38,…[400]	1, 2, 7, 8, 9,10,…[1400]	1	700	3	all
A	all	700	2	1, 2,15,16,29,30,…[100]	3,4,5,6,7,8,…[1400]	1	700	4	all
B	all	700	2	33,34,47,48,61,62,…[200]	3,4,5,6,7,8,…[1400]	1	700	5	all
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮
B	same	200	2	33,34,47,48,61,62,…[200]	3, 4, 5, 6,17,18,…[400]	1	200	23	same
C	same	400	2	13,14,21,22,35,36,…[400]	7, 8, 9,10,11,12,…[800]	1	400	24	same
A	same	100	3	99,100,155,156,169,170,…[100]	1, 2,15,16,29,30,…[200]	1	100	25	same
B	same	200	3	19,20,45,46,75,76,…[200]	3, 4, 5, 6,17,18,…[400]	1	200	26	same
C	same	400	3	7, 8, 9,10,11,12,…[400]	13,14,21,22,23,24,…[800]	1	400	27	same

So using the K-fold cross-validation, we will do one train/test split for each row of the table above. There is one row for each combination of test subset (A, B, C), train subset (same, other, all), and test fold (1, 2, 3).

We compute and plot the results using the code below,

(reg.learner.list <- list(
  mlr3::LearnerRegrFeatureless$new()))
#>[[1]]
#>
#>── <LearnerRegrFeatureless> (regr.featureless): Featureless Regression Learner ─
#>• Model: -
#>• Parameters: robust=FALSE
#>• Packages: mlr3 and stats
#>• Predict Types: [response], se, and quantiles
#>• Feature Types: logical, integer, numeric, character, factor, ordered,
#>POSIXct, and Date
#>• Encapsulation: none (fallback: -)
#>• Properties: featureless, importance, missings, selected_features, and weights
#>• Other settings: use_weights = 'use', predict_raw = 'FALSE'
#>
if(requireNamespace("rpart")){
  reg.learner.list$rpart <- mlr3::LearnerRegrRpart$new()
}
#>Loading required namespace: rpart
set.seed(3)
(soak_default_grid <- mlr3::benchmark_grid(
  reg.task,
  reg.learner.list,
  soak_default))

task	learner	resampling
TaskRegr:sin	LearnerRegrFeatureless:regr.featureless
TaskRegr:sin	LearnerRegrRpart:regr.rpart

##if(require(future))plan("multisession")
lgr::get_logger("mlr3")$set_threshold("warn")
(soak_default_result <- mlr3::benchmark(
  soak_default_grid, store_models = TRUE))
#>
#>── <BenchmarkResult> of 54 rows with 2 resampling run ──────────────────────────
#> nr task_id       learner_id       resampling_id iters warnings errors
#>  1     sin regr.featureless same_other_sizes_cv    27        0      0
#>  2     sin       regr.rpart same_other_sizes_cv    27        0      0
soak_default_score <- mlr3resampling::score(
  soak_default_result, mlr3::msr("regr.rmse"))
plot(soak_default_score)+my_theme

The plot method above shows a multi-panel figure (vertical facet for each algorithm), whereas below we make a custom ggplot with no vertical facets, and color for algorithm.

## Add the number of rows in each train set. lengths() always returns an
## integer vector (as needed by the %d format used in the plot labels),
## whereas the type returned by sapply() depends on its input.
soak_default_score[, n.train := lengths(train)]
## Show the first row of the score table.
soak_default_score[1]

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets	uhash	nr	task	task_id	learner	learner_id	resampling	resampling_id	prediction_test	regr.rmse	algorithm	n.train
A	all	700	1	57, 58,141,142,211,212,…[100]	1,2,3,4,5,6,…[1400]	1	700	1	all	3cff8fc3-23ce-4f40-8bd2-f42d80f22305	1	TaskRegr:sin	sin	LearnerRegrFeatureless:regr.featureless	regr.featureless		same_other_sizes_cv		0.912	featureless	1400

if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      regr.rmse, train.subsets, color=algorithm),
      shape=1,
      data=soak_default_score)+
    geom_text(aes(
      Inf, train.subsets,
      label=sprintf("n.train=%d ", n.train)),
      size=text.size,
      hjust=1,
      vjust=1.5,
      data=soak_default_score[algorithm=="featureless" & test.fold==1])+
    facet_grid(. ~ test.subset, labeller=label_both, scales="free")+
    scale_x_continuous(
      "Root mean squared prediction error (test set)")
}

The figure above shows the effect of train set size on test error.

## Summarize test error: one row per combination of algorithm, test subset,
## and train subsets, with the mean and sd of RMSE in the columns
## regr.rmse_mean and regr.rmse_sd.
soak_default_wide <- dcast(
  soak_default_score,
  algorithm + test.subset + train.subsets ~ .,
  list(mean, sd),
  value.var="regr.rmse")
if(require(ggplot2)){
  ggplot()+
    ## Segment from mean+sd to mean-sd.
    geom_segment(aes(
      regr.rmse_mean+regr.rmse_sd, train.subsets,
      xend=regr.rmse_mean-regr.rmse_sd, yend=train.subsets,
      color=algorithm),
      data=soak_default_wide)+
    ## Point at the mean.
    geom_point(aes(
      regr.rmse_mean, train.subsets, color=algorithm),
      shape=1,
      data=soak_default_wide)+
    ## Train set size labels, right-aligned at the right edge of each panel.
    ## Only rows for one algorithm and one test fold are used, presumably so
    ## that each label is drawn once.
    geom_text(aes(
      Inf, train.subsets,
      label=sprintf("n.train=%d ", n.train)),
      size=text.size,
      hjust=1,
      vjust=1.5,
      data=soak_default_score[algorithm=="featureless" & test.fold==1])+
    ## One panel per test subset.
    facet_grid(. ~ test.subset, labeller=label_both, scales="free")+
    scale_x_continuous(
      "Root mean squared prediction error (test set)")
}

The figure above shows a test subset in each panel, the train subsets on the y axis, the test error on the x axis, the two different algorithms are shown in two different colors. We can clearly see that

For train.subsets=same, test error is largest, sometimes almost as large as featureless, which is the error rate when no relationship has been learned between inputs and outputs (not enough data).
For train.subsets=other, rpart test error is significantly smaller than featureless, indicating that some non-trivial relationship between inputs and outputs has been learned. Sometimes other has larger error than same, sometimes smaller (depending on sample size).
For train.subsets=all, rpart test error tends to be minimal, which indicates that combining all of the subsets is beneficial in this case (when the pattern is exactly the same in the different subsets).

Overall in the plot above, all tends to have less prediction error than same, which suggests that the subsets are similar (and indeed the subsets are i.i.d. in this simulation). Another visualization method is shown below,

plist <- mlr3resampling::pvalue(soak_default_score, digits=3)
plot(plist)+my_theme

The visualization above includes P-values (two-sided T-test) for the differences between Same and Other/All.

Below we visualize test error as a function of train size.

## Plot test error as a function of train set size: one line per algorithm
## and test fold, with a text label showing the train subsets of each point.
if(require(ggplot2)){
  ggplot()+
    ## NOTE(review): a numeric legend.position is deprecated as of ggplot2
    ## 3.5.0, in favor of legend.position="inside" together with
    ## legend.position.inside -- confirm the minimum supported ggplot2 version
    ## before changing.
    theme(legend.position=c(0.85,0.85))+
    geom_line(aes(
      n.train, regr.rmse,
      color=algorithm,
      group=paste(algorithm, test.fold)),
      data=soak_default_score)+
    geom_label(aes(
      n.train, regr.rmse,
      color=algorithm,
      label=train.subsets),
      size=text.size,
      data=soak_default_score)+
    ## One panel per test subset.
    facet_grid(. ~ test.subset, labeller=label_both, scales="free")+
    scale_y_continuous(
      "Root mean squared prediction error (test set)")
}

Downsample to see how many train data are required for good accuracy overall

In the previous section we defined a task using the subset role, which means that the different values in that column will be used to define different subsets for training/testing using same/other/all CV. In contrast, below we define a task without the subset role, which means that we will not have separate CV iterations for same/other/all (full data is treated as one subset / train subset is same).

task.no.subset <- mlr3::TaskRegr$new(
  "sin", task.dt, target="y")
task.no.subset$col_roles$group <- "agroup"
task.no.subset$col_roles$stratum <- "random_group"
task.no.subset$col_roles$feature <- "x"
str(task.no.subset$col_roles)
#>List of 11
#> $ feature        : chr "x"
#> $ target         : chr "y"
#> $ name           : chr(0) 
#> $ order          : chr(0) 
#> $ stratum        : chr "random_group"
#> $ group          : chr "agroup"
#> $ offset         : chr(0) 
#> $ weights_learner: chr(0) 
#> $ weights_measure: chr(0) 
#> $ subset         : chr(0) 
#> $ fold           : chr(0)

Below we define cross-validation, and we set sizes to 5, so we can see what happens when we have train sets of 5 different sizes that are smaller than the full train set size.

## Resampling with sizes=5, so that each test fold is evaluated with the
## full train set and also with 5 smaller (down-sampled) train sets.
five_smaller_sizes <- mlr3resampling::ResamplingSameOtherSizesCV$new()
five_smaller_sizes$param_set$values$sizes <- 5
## Instantiate a clone, in order to display the table of train/test splits.
five_smaller_sizes$clone()$instantiate(task.no.subset)$instance$iteration.dt

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets
full	same	700	1	7, 8,11,12,15,16,…[700]	25, 26,177,178,229,230,…[42]	1	21	1	same
full	same	700	1	7, 8,11,12,15,16,…[700]	23, 24, 25, 26,177,178,…[84]	1	43	2	same
full	same	700	1	7, 8,11,12,15,16,…[700]	9,10,17,18,23,24,…[170]	1	87	3	same
full	same	700	1	7, 8,11,12,15,16,…[700]	1, 2, 9,10,17,18,…[350]	1	175	4	same
full	same	700	1	7, 8,11,12,15,16,…[700]	1, 2, 9,10,17,18,…[700]	1	350	5	same
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮
full	same	700	3	3, 4, 9,10,13,14,…[700]	15, 16,169,170,253,254,…[84]	1	43	14	same
full	same	700	3	3, 4, 9,10,13,14,…[700]	15,16,17,18,33,34,…[170]	1	87	15	same
full	same	700	3	3, 4, 9,10,13,14,…[700]	1, 2,15,16,17,18,…[350]	1	175	16	same
full	same	700	3	3, 4, 9,10,13,14,…[700]	1, 2,15,16,17,18,…[700]	1	350	17	same
full	same	700	3	3, 4, 9,10,13,14,…[700]	1,2,5,6,7,8,…[1400]	1	700	18	same

So using the K-fold cross-validation, we will do one train/test split for each row of the table above. There is one row for each combination of n.train.groups (full train set size + 5 smaller sizes), and test fold (1, 2, 3).

We compute and plot the results using the code below,

(reg.learner.list <- list(
  mlr3::LearnerRegrFeatureless$new()))
#>[[1]]
#>
#>── <LearnerRegrFeatureless> (regr.featureless): Featureless Regression Learner ─
#>• Model: -
#>• Parameters: robust=FALSE
#>• Packages: mlr3 and stats
#>• Predict Types: [response], se, and quantiles
#>• Feature Types: logical, integer, numeric, character, factor, ordered,
#>POSIXct, and Date
#>• Encapsulation: none (fallback: -)
#>• Properties: featureless, importance, missings, selected_features, and weights
#>• Other settings: use_weights = 'use', predict_raw = 'FALSE'
#>
if(requireNamespace("rpart")){
  reg.learner.list$rpart <- mlr3::LearnerRegrRpart$new()
}
set.seed(1)
(five_smaller_sizes_grid <- mlr3::benchmark_grid(
  task.no.subset,
  reg.learner.list,
  five_smaller_sizes))

task	learner	resampling
TaskRegr:sin	LearnerRegrFeatureless:regr.featureless
TaskRegr:sin	LearnerRegrRpart:regr.rpart

##if(require(future))plan("multisession")
lgr::get_logger("mlr3")$set_threshold("warn")
(five_smaller_sizes_result <- mlr3::benchmark(
  five_smaller_sizes_grid, store_models = TRUE))
#>
#>── <BenchmarkResult> of 36 rows with 2 resampling run ──────────────────────────
#> nr task_id       learner_id       resampling_id iters warnings errors
#>  1     sin regr.featureless same_other_sizes_cv    18        0      0
#>  2     sin       regr.rpart same_other_sizes_cv    18        0      0
## Compute RMSE for each train/test split, and add the number of rows in
## each train set. lengths() always returns an integer vector, whereas the
## type returned by sapply() depends on its input.
five_smaller_sizes_score <- mlr3resampling::score(
  five_smaller_sizes_result, mlr3::msr("regr.rmse")
)[, n.train := lengths(train)]
## Show the first row of the score table.
five_smaller_sizes_score[1]

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets	uhash	nr	task	task_id	learner	learner_id	resampling	resampling_id	prediction_test	regr.rmse	algorithm	n.train
full	same	700	1	5, 6,17,18,21,22,…[700]	217,218,269,270,291,292,…[42]	1	21	1	same	30d902af-39d6-4dd5-8cf9-2ce632f0bcf6	1	TaskRegr:sin	sin	LearnerRegrFeatureless:regr.featureless	regr.featureless		same_other_sizes_cv		0.854	featureless	42

if(require(ggplot2)){
  ggplot()+
    geom_line(aes(
      n.train, regr.rmse,
      color=algorithm,
      group=paste(algorithm, test.fold)),
      data=five_smaller_sizes_score)+
    geom_point(aes(
      n.train, regr.rmse,
      color=algorithm),
      data=five_smaller_sizes_score)+
    facet_grid(. ~ test.subset, labeller=label_both, scales="free")+
    scale_x_log10(
      "Number of train rows",
      breaks=unique(five_smaller_sizes_score$n.train))+
    scale_y_continuous(
      "Root mean squared prediction error (test set)")
}

From the plot above, it looks like about 700 rows is enough to get minimal test error, using the rpart learner.

Reproducibility

After the benchmark is done, how can we reproduce it?

Reproducing K-fold CV for largest train size

First we create a new task with a fold column containing the fold IDs used in the previous benchmark.

(five_smaller_instantiated <- five_smaller_sizes_result$resamplings$resampling[[1]])
#>
#>── <ResamplingSameOtherSizesCV> : Compare Same/Other and Sizes Cross-Validation 
#>• Iterations: 18
#>• Instantiated: TRUE
#>• Parameters: folds=3, seeds=1, ratio=0.5, sizes=5, ignore_subset=FALSE,
#>subsets=SOA
(task.dt.fold <- five_smaller_instantiated$instance$fold.dt[
, data.table(fold, task.dt)])

fold	x	y	agroup	random_group
2	-44.116	-0.408	1	A
2	28.332	-0.085	1	A
3	10.266	-1.233	2	B
3	-46.473	-1.362	2	B
1	62.138	-1.338	3	B
⋮	⋮	⋮	⋮	⋮
1	60.838	-0.107	1048	C
1	55.715	-0.924	1049	C
1	14.310	1.045	1049	C
1	27.180	1.678	1050	C
1	23.672	-0.269	1050	C

task.with.fold <- mlr3::TaskRegr$new(
  "sin", task.dt.fold, target="y")
task.with.fold$col_roles$group <- "agroup"
task.with.fold$col_roles$stratum <- "random_group"
task.with.fold$col_roles$feature <- "x"

Then we run a new benchmark with custom CV.

## Custom CV whose folds are read from the fold column of task.with.fold.
fold_col_cv <- mlr3::ResamplingCustomCV$new()
fold_col_cv$instantiate(task.with.fold, col="fold")
## Benchmark grid which combines the instantiated custom CV with the task
## that has no fold column.
(fold_col_grid <- mlr3::benchmark_grid(
  task.no.subset, #works because same number of rows!
  reg.learner.list,
  fold_col_cv))

task	learner	resampling
TaskRegr:sin	LearnerRegrFeatureless:regr.featureless
TaskRegr:sin	LearnerRegrRpart:regr.rpart

lgr::get_logger("mlr3")$set_threshold("warn")
(fold_col_result <- mlr3::benchmark(
  fold_col_grid, store_models = TRUE))
#>
#>── <BenchmarkResult> of 6 rows with 2 resampling run ───────────────────────────
#> nr task_id       learner_id resampling_id iters warnings errors
#>  1     sin regr.featureless     custom_cv     3        0      0
#>  2     sin       regr.rpart     custom_cv     3        0      0

The code below compares the original benchmark from the previous section to the new benchmark computed in this section.

## Two score tables to compare:
## - reproduced: scores from the custom CV benchmark, with the iteration
##   number copied to test.fold.
## - original: scores from the previous benchmark, keeping only the rows
##   with the largest train set size.
rep_score_list <- list(
  reproduced=fold_col_result$score(mlr3::msr("regr.rmse"))[, test.fold := iteration],
  original=five_smaller_sizes_score[n.train==max(n.train)])
## Stack the two tables, with a data_source column to tell them apart.
rep_score_dt <- data.table(data_source=names(rep_score_list))[
, rep_score_list[[data_source]][, .(learner_id, test.fold, regr.rmse)]
, by=data_source]
## Reshape to one column per test fold, so that original and reproduced
## values appear in adjacent rows.
dcast(rep_score_dt, learner_id + data_source ~ test.fold, value.var="regr.rmse")

learner_id	data_source	1	2	3
regr.featureless	original	0.842	0.852	0.890
regr.featureless	reproduced	0.842	0.852	0.890
regr.rpart	original	0.767	0.718	0.702
regr.rpart	reproduced	0.767	0.718	0.702

The output above shows that the reproduced error rates are consistent with the original error rates.

Reproducing each split

Each train/test split in mlr3 is called an iteration. Below we use a custom resampling to reproduce the iterations created by five_smaller_sizes in the previous section.

## Custom resampling, instantiated with the train and test columns of the
## iteration table from the previously instantiated resampling.
custom_splits <- mlr3::ResamplingCustom$new()
five_smaller_instantiated$instance$iteration.dt[
, custom_splits$instantiate(task.no.subset, train, test)]
#>
#>── <ResamplingCustom> : Custom Splits ──────────────────────────────────────────
#>• Iterations: 18
#>• Instantiated: TRUE
#>• Parameters: list()
(custom_splits_grid <- mlr3::benchmark_grid(
  task.no.subset,
  reg.learner.list,
  custom_splits))

task	learner	resampling
TaskRegr:sin	LearnerRegrFeatureless:regr.featureless
TaskRegr:sin	LearnerRegrRpart:regr.rpart

lgr::get_logger("mlr3")$set_threshold("warn")
(custom_split_result <- mlr3::benchmark(
  custom_splits_grid, store_models = TRUE))
#>
#>── <BenchmarkResult> of 36 rows with 2 resampling run ──────────────────────────
#> nr task_id       learner_id resampling_id iters warnings errors
#>  1     sin regr.featureless        custom    18        0      0
#>  2     sin       regr.rpart        custom    18        0      0

The code below compares the reproduced error rates computed in this section with the original error rates computed in the previous section.

## Two score tables to compare: reproduced (custom splits) and original.
rep_custom_list <- list(
  reproduced=custom_split_result$score(mlr3::msr("regr.rmse")),
  original=five_smaller_sizes_score)
## Stack the two tables, with a data_source column to tell them apart.
rep_custom_dt <- data.table(data_source=names(rep_custom_list))[
, rep_custom_list[[data_source]][, .(learner_id, iteration, regr.rmse)]
, by=data_source]
## Reshape to one column per data source, then compute the difference
## between reproduced and original for each learner and iteration.
dcast(
  rep_custom_dt,
  learner_id + iteration ~ data_source,
  value.var="regr.rmse"
)[, diff := reproduced-original][]

learner_id	iteration	original	reproduced	diff
regr.featureless	1	0.854	0.854	0
regr.featureless	2	0.843	0.843	0
regr.featureless	3	0.844	0.844	0
regr.featureless	4	0.842	0.842	0
regr.featureless	5	0.842	0.842	0
⋮	⋮	⋮	⋮	⋮
regr.rpart	14	0.923	0.923	0
regr.rpart	15	0.904	0.904	0
regr.rpart	16	0.812	0.812	0
regr.rpart	17	0.708	0.708	0
regr.rpart	18	0.702	0.702	0

The output above shows a difference of zero, indicating that the error rates are consistent.

Downsample to sizes of other sets

To investigate down-sampling in the context of training on same/other/all subsets, we first generate some new data (smaller than previously).

N <- 600
abs.x <- 20
set.seed(1)
x.vec <- sort(runif(N, -abs.x, abs.x))
str(x.vec)
#> num [1:600] -19.9 -19.9 -19.7 -19.6 -19.6 ...
library(data.table)
(task.dt <- data.table(
  x=x.vec,
  y = sin(x.vec)+rnorm(N,sd=0.5)))

x	y
-19.927	-0.434
-19.923	-1.402
-19.675	0.251
-19.559	-0.843
-19.554	0.179
⋮	⋮
19.707	0.750
19.750	0.318
19.757	1.395
19.839	-0.209
19.843	0.575

if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      x, y),
      shape=1,
      data=task.dt)+
    coord_equal()
}

atomic.subset.size <- 2
task.dt[, agroup := rep(seq(1, N/atomic.subset.size), each=atomic.subset.size)][]

x	y	agroup
-19.927	-0.434	1
-19.923	-1.402	1
-19.675	0.251	2
-19.559	-0.843	2
-19.554	0.179	3
⋮	⋮	⋮
19.707	0.750	298
19.750	0.318	299
19.757	1.395	299
19.839	-0.209	300
19.843	0.575	300

## Assign every row to subset A or B. Out of every 4 atomic subsets, the
## pattern gives 1 to A and 3 to B; the pattern is recycled to the number
## of rows. The length.out argument name is written in full, instead of
## relying on partial matching of an abbreviation.
task.dt[, random_subset := rep(
  rep(c("A","B","B","B"), each=atomic.subset.size),
  length.out=.N
)][]

x	y	agroup	random_subset
-19.927	-0.434	1	A
-19.923	-1.402	1	A
-19.675	0.251	2	B
-19.559	-0.843	2	B
-19.554	0.179	3	B
⋮	⋮	⋮	⋮
19.707	0.750	298	B
19.750	0.318	299	B
19.757	1.395	299	B
19.839	-0.209	300	B
19.843	0.575	300	B

## Count the rows in each random subset, and keep the table (rather than
## the raw column) in subset.tab. The outer parentheses print the counts.
(subset.tab <- table(task.dt$random_subset))
#>
#>  A   B 
#>150 450 
reg.task <- mlr3::TaskRegr$new(
  "sin", task.dt, target="y")
reg.task$col_roles$subset <- "random_subset"
reg.task$col_roles$group <- "agroup"
reg.task$col_roles$stratum <- "random_subset"
reg.task$col_roles$feature <- "x"
soak_sizes <- mlr3resampling::ResamplingSameOtherSizesCV$new()

In the previous section we analyzed prediction accuracy of same/other/all, which corresponds to keeping sizes parameter at default of -1. The main difference in this section is that we change sizes to 0, which means to down-sample same/other/all, so we can see if there is an effect for sample size (there should be for iid problems with intermediate difficulty). We set sizes to 0 in the next line:

## sizes=0 means to down-sample same/other/all.
soak_sizes$param_set$values$sizes <- 0
soak_sizes$instantiate(reg.task)
## Display the table of train/test splits. The element name is written in
## full, instead of relying on partial matching by the $ operator.
soak_sizes$instance$iteration.dt

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets
A	all	200	1	1, 2,49,50,57,58,…[50]	3,4,5,6,7,8,…[400]	1	200	1	all
A	all	200	1	1, 2,49,50,57,58,…[50]	5, 6, 9,10,15,16,…[98]	1	50	2	all
B	all	200	1	19,20,31,32,37,38,…[150]	3,4,5,6,7,8,…[400]	1	200	3	all
B	all	200	1	19,20,31,32,37,38,…[150]	3, 4, 7, 8,15,16,…[98]	1	50	4	all
A	all	200	2	17,18,41,42,89,90,…[50]	1, 2, 9,10,15,16,…[400]	1	200	5	all
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮
B	same	150	2	3,4,5,6,7,8,…[150]	15,16,19,20,21,22,…[300]	1	150	26	same
B	same	150	2	3,4,5,6,7,8,…[150]	23,24,37,38,51,52,…[100]	1	50	27	same
A	same	50	3	9,10,25,26,33,34,…[50]	1, 2,17,18,41,42,…[100]	1	50	28	same
B	same	150	3	15,16,21,22,23,24,…[150]	3,4,5,6,7,8,…[300]	1	150	29	same
B	same	150	3	15,16,21,22,23,24,…[150]	11,12,19,20,45,46,…[100]	1	50	30	same

(reg.learner.list <- list(
  mlr3::LearnerRegrFeatureless$new()))
#>[[1]]
#>
#>── <LearnerRegrFeatureless> (regr.featureless): Featureless Regression Learner ─
#>• Model: -
#>• Parameters: robust=FALSE
#>• Packages: mlr3 and stats
#>• Predict Types: [response], se, and quantiles
#>• Feature Types: logical, integer, numeric, character, factor, ordered,
#>POSIXct, and Date
#>• Encapsulation: none (fallback: -)
#>• Properties: featureless, importance, missings, selected_features, and weights
#>• Other settings: use_weights = 'use', predict_raw = 'FALSE'
#>
if(requireNamespace("rpart")){
  reg.learner.list$rpart <- mlr3::LearnerRegrRpart$new()
}
(soak_sizes_grid <- mlr3::benchmark_grid(
  reg.task,
  reg.learner.list,
  soak_sizes))

task	learner	resampling
TaskRegr:sin	LearnerRegrFeatureless:regr.featureless
TaskRegr:sin	LearnerRegrRpart:regr.rpart

##if(require(future))plan("multisession")
lgr::get_logger("mlr3")$set_threshold("warn")
(soak_sizes_result <- mlr3::benchmark(
  soak_sizes_grid, store_models = TRUE))
#>
#>── <BenchmarkResult> of 60 rows with 2 resampling run ──────────────────────────
#> nr task_id       learner_id       resampling_id iters warnings errors
#>  1     sin regr.featureless same_other_sizes_cv    30        0      0
#>  2     sin       regr.rpart same_other_sizes_cv    30        0      0
soak_sizes_score <- mlr3resampling::score(
  soak_sizes_result, mlr3::msr("regr.rmse"))
soak_sizes_score[1]

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets	uhash	nr	task	task_id	learner	learner_id	resampling	resampling_id	prediction_test	regr.rmse	algorithm
A	all	200	1	1, 2,49,50,57,58,…[50]	3,4,5,6,7,8,…[400]	1	200	1	all	86086583-3597-4a17-a0eb-f3f8e4eb2fdd	1	TaskRegr:sin	sin	LearnerRegrFeatureless:regr.featureless	regr.featureless		same_other_sizes_cv		0.775	featureless

The plot below shows the same results (no down-sampling) as if we did sizes=-1 (like in the previous section).

if(require(ggplot2)){
ggplot()+
  geom_point(aes(
    regr.rmse, train.subsets, color=algorithm),
    shape=1,
    data=soak_sizes_score[groups==n.train.groups])+
  facet_grid(. ~ test.subset, labeller=label_both)
}

The plots below compare all six train subsets (including three down-sampled), and it is clear there is an effect for sample size.

soak_sizes_score[, subset.N := paste(train.subsets, n.train.groups)]
(levs <- soak_sizes_score[order(train.subsets, n.train.groups), unique(subset.N)])
#>[1] "all 50"    "all 200"   "other 50"  "other 150" "same 50"   "same 150" 
soak_sizes_score[, subset.N.fac := factor(subset.N, levs)]
if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      regr.rmse, subset.N.fac, color=algorithm),
      shape=1,
      data=soak_sizes_score)+
    facet_wrap("test.subset", labeller=label_both, scales="free", nrow=1)
}

(levs <- soak_sizes_score[order(n.train.groups, train.subsets), unique(subset.N)])
#>[1] "all 50"    "other 50"  "same 50"   "other 150" "same 150"  "all 200"  
soak_sizes_score[, N.subset.fac := factor(subset.N, levs)]
if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      regr.rmse, N.subset.fac, color=algorithm),
      shape=1,
      data=soak_sizes_score)+
    facet_wrap("test.subset", labeller=label_both, scales="free", nrow=1)
}

Another way to view the effect of sample size is to plot the test/prediction error, as a function of number of train data, as in the plots below.

if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      n.train.groups, regr.rmse,
      color=train.subsets),
      shape=1,
      data=soak_sizes_score)+
    geom_line(aes(
      n.train.groups, regr.rmse,
      group=paste(train.subsets, seed, algorithm),
      linetype=algorithm,
      color=train.subsets),
      data=soak_sizes_score)+
    facet_grid(test.fold ~ test.subset, labeller=label_both)
}

rpart.score <- soak_sizes_score[algorithm=="rpart" & train.subsets != "other"]
if(require(ggplot2)){
  ggplot()+
    geom_point(aes(
      n.train.groups, regr.rmse,
      color=train.subsets),
      shape=1,
      data=rpart.score)+
    geom_line(aes(
      n.train.groups, regr.rmse,
      group=paste(train.subsets, seed, algorithm),
      color=train.subsets),
      data=rpart.score)+
    facet_grid(test.fold ~ test.subset, labeller=label_both)
}

Use with auto_tuner on a task with stratification and grouping

In this section we show how ResamplingSameOtherSizesCV can be used on a task with stratification and grouping, for hyper-parameter learning. First we recall the previously defined task and evaluation CV.

str(reg.task$col_roles)
#>List of 11
#> $ feature        : chr "x"
#> $ target         : chr "y"
#> $ name           : chr(0) 
#> $ order          : chr(0) 
#> $ stratum        : chr "random_subset"
#> $ group          : chr "agroup"
#> $ offset         : chr(0) 
#> $ weights_learner: chr(0) 
#> $ weights_measure: chr(0) 
#> $ subset         : chr "random_subset"
#> $ fold           : chr(0)

We see in the output above that the task has column roles for both stratum and group, which normally errors when used with ResamplingCV:

mlr3::ResamplingCV$new()$instantiate(reg.task)
#>Error: 
#>✖ Cannot combine stratification with grouping
#>→ Class: Mlr3ErrorInput

Below we show how ResamplingSameOtherSizesCV can be used instead:

ignore.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
ignore.cv$param_set$values$ignore_subset <- TRUE
ignore.cv$instantiate(reg.task)
ignore.cv$instance$iteration.dt

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets
full	same	200	1	5, 6, 7, 8, 9,10,…[200]	1, 2, 3, 4,11,12,…[400]	1	200	1	same
full	same	200	2	3, 4,11,12,13,14,…[200]	1,2,5,6,7,8,…[400]	1	200	2	same
full	same	200	3	1, 2,25,26,31,32,…[200]	3,4,5,6,7,8,…[400]	1	200	3	same

To use the above CV object with a learning algorithm in a benchmark experiment, we need to use it as the resampling argument to auto_tuner, as in the code below,

## Run a benchmark which compares featureless, rpart with default
## hyper-parameters (if rpart is installed), and rpart with cp selected by
## grid search (if mlr3tuning is also installed).
##
## Arguments:
##   subtrain.valid.cv: resampling object passed as the resampling argument
##     of mlr3tuning::auto_tuner, for selecting cp.
##   task: task to use in the benchmark. Defaults to the reg.task variable
##     in the calling environment of this vignette.
##   evaluation.cv: resampling object used for the train/test splits of
##     the benchmark. Defaults to the soak_sizes variable.
##
## Returns: the value of mlr3::benchmark (a BenchmarkResult).
do_benchmark <- function(subtrain.valid.cv, task=reg.task, evaluation.cv=soak_sizes){
  reg.learner.list <- list(
    mlr3::LearnerRegrFeatureless$new())
  if(requireNamespace("rpart")){
    reg.learner.list$rpart <- mlr3::LearnerRegrRpart$new()
    if(requireNamespace("mlr3tuning")){
      rpart.learner <- mlr3::LearnerRegrRpart$new()
      ##mlr3tuningspaces::lts(rpart.learner)$param_set$values
      ## Tune cp on the log scale. logscale is the full argument name;
      ## the abbreviation log only works via partial matching.
      rpart.learner$param_set$values$cp <- paradox::to_tune(
        1e-4, 0.1, logscale=TRUE)
      reg.learner.list$rpart.tuned <- mlr3tuning::auto_tuner(
        tuner = mlr3tuning::tnr("grid_search"), #mlr3tuning::TunerBatchGridSearch$new()
        learner = rpart.learner,
        resampling = subtrain.valid.cv,
        measure = mlr3::msr("regr.rmse"))
    }
  }
  benchmark.grid <- mlr3::benchmark_grid(
    task,
    reg.learner.list,
    evaluation.cv)
  ## Only show warnings or worse from the bbotk logger.
  lgr::get_logger("bbotk")$set_threshold("warn")
  ## The benchmark result is the value of the function.
  mlr3::benchmark(
    benchmark.grid, store_models = TRUE)
}

do_benchmark(mlr3::ResamplingCV$new())
#>Loading required namespace: mlr3tuning
#>Caught Mlr3ErrorInput. Canceling all iterations ...
#>Error: 
#>✖ Cannot combine stratification with grouping
#>→ Class: Mlr3ErrorInput

The error above is because ResamplingCV does not support stratification and grouping. To fix that, we can use the code below:

ignore.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
ignore.cv$param_set$values$ignore_subset <- TRUE
(same.other.result <- do_benchmark(ignore.cv))
#>
#>── <BenchmarkResult> of 90 rows with 3 resampling run ──────────────────────────
#> nr task_id       learner_id       resampling_id iters warnings errors
#>  1     sin regr.featureless same_other_sizes_cv    30        0      0
#>  2     sin       regr.rpart same_other_sizes_cv    30        0      0
#>  3     sin regr.rpart.tuned same_other_sizes_cv    30        0      0

The output above shows that the benchmark worked. The code below plots the results.

same.other.score <- mlr3resampling::score(
  same.other.result, mlr3::msr("regr.rmse"))
same.other.score[1]

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets	uhash	nr	task	task_id	learner	learner_id	resampling	resampling_id	prediction_test	regr.rmse	algorithm
A	all	200	1	1, 2,49,50,57,58,…[50]	3,4,5,6,7,8,…[400]	1	200	1	all	db1ae541-36fd-45b0-a7bf-b702a5655793	1	TaskRegr:sin	sin	LearnerRegrFeatureless:regr.featureless	regr.featureless		same_other_sizes_cv		0.775	featureless

same.other.wide <- dcast(
  same.other.score,
  algorithm + test.subset + train.subsets ~ .,
  list(mean, sd),
  value.var="regr.rmse")
if(require(ggplot2)){
  ggplot()+
    geom_segment(aes(
      regr.rmse_mean+regr.rmse_sd, train.subsets,
      xend=regr.rmse_mean-regr.rmse_sd, yend=train.subsets),
      data=same.other.wide)+
    geom_point(aes(
      regr.rmse_mean, train.subsets),
      shape=1,
      data=same.other.wide)+
    facet_grid(algorithm ~ test.subset, labeller=label_both)
}

The plot above has different panels for rpart (without tuning) and tuned (rpart with tuning of cp).

Conclusions

mlr3resampling::ResamplingSameOtherSizesCV can be used for model evaluation (train/test split):

compare prediction accuracy of models trained on same/other/all subsets (need to set column role subset).
compare prediction accuracy of models trained on down-sampled subsets (need to set param sizes).

It can also be used for model training (subtrain/validation split):

to learn regularization hyper-parameters, on a task with both stratum and group roles (use it as the resampling argument of auto_tuner).

Arizona trees data

The goal of this section is to explain the differences between various column roles:

group is used to designate observations which should stay together when splitting. In other words, two rows in the same group should never appear in different sets.
subset designates a column whose values are each treated as a test set (the train data come from Same/Other/All subsets).

What is a group?

Below we load the data set.

## Load the AZtrees data set that ships with the mlr3resampling package.
data(AZtrees,package="mlr3resampling")
library(data.table)
## Convert to a data.table and print the first row to show the columns.
AZdt <- data.table(AZtrees)
AZdt[1]

xcoord	ycoord	region3	region4	polygon	y	SAMPLE_1	SAMPLE_2	SAMPLE_3	SAMPLE_4	SAMPLE_5	SAMPLE_6	SAMPLE_7	SAMPLE_8	SAMPLE_9	SAMPLE_10	SAMPLE_11	SAMPLE_12	SAMPLE_13	SAMPLE_14	SAMPLE_15	SAMPLE_16	SAMPLE_17	SAMPLE_18	SAMPLE_19	SAMPLE_20	SAMPLE_21
-111.664	35.237	NE	NE	1	Not tree	3331	3919	3957	4514	4700	4607	4420	4494	4139	3906	14	-40	-71	125	21	25	10	-263	-324	-362	370

Above we see one row of data. Below we see a scatterplot of the data:

Every row is a labeled pixel.
Every dot is plotted at the xcoord/ycoord (long/lat) position on a map around Flagstaff, AZ.

## Center and half-width of the square region that the next plot zooms into.
x.center <- -111.72
y.center <- 35.272
rect.size <- 0.01/2
## Offsets from the center to the min and max edges of the square.
zoom.offsets <- c(-1, 1)*rect.size
x.min.max <- x.center+zoom.offsets
y.min.max <- y.center+zoom.offsets
## One-row table with the corners of the square, for geom_rect.
rect.dt <- data.table(
  xmin=x.min.max[1],
  xmax=x.min.max[2],
  ymin=y.min.max[1],
  ymax=y.min.max[2])
if(require(ggplot2)){
  ## Fill scale shared by the plots below: black for Tree, white for Not tree.
  tree.fill.scale <- scale_fill_manual(
    values=c(Tree="black", "Not tree"="white"))
  ## Red square marking the zoom region.
  zoom.square <- geom_rect(
    aes(
      xmin=xmin,
      xmax=xmax,
      ymin=ymin,
      ymax=ymax),
    data=rect.dt,
    fill="red",
    linewidth=3,
    color="red")
  ## One dot per labeled pixel, filled according to its label.
  pixel.points <- geom_point(
    aes(
      x=xcoord,
      y=ycoord,
      fill=y),
    shape=21,
    data=AZdt)
  ggplot()+
    tree.fill.scale+
    zoom.square+
    pixel.points+
    coord_equal()
}

Note the red square in the plot above. Below we zoom into that square.

if(require(ggplot2)){
  ## Same pixel scatterplot as before.
  pixel.points <- geom_point(
    aes(
      x=xcoord,
      y=ycoord,
      fill=y),
    shape=21,
    data=AZdt)
  ## Axis limits restrict the view to the square defined by
  ## x.min.max/y.min.max; points outside the limits are removed.
  gg <- ggplot()+
    tree.fill.scale+
    pixel.points+
    coord_equal()+
    scale_x_continuous(limits=x.min.max)+
    scale_y_continuous(limits=y.min.max)
  ## Polygon labels are optional: only added when directlabels is available.
  if(require(directlabels)){
    polygon.labels <- geom_dl(
      aes(
        x=xcoord,
        y=ycoord,
        label=paste("polygon",polygon)),
      data=AZdt,
      method=list(cex=2, "smart.grid"))
    gg <- gg+polygon.labels
  }
  gg
}
#>Loading required package: directlabels
#>Removed 5927 rows containing missing values or values outside the scale range
#>(`geom_point()`).
#>Removed 5927 rows containing missing values or values outside the scale range
#>(`geom_dl()`).

In the plot above, we see that there are several groups of points, each with a black number. Each group of points comes from a single polygon (label drawn in GIS software), and the black number is the polygon ID number. So each polygon represents one label, either tree or not, and there are one or more points/pixels with that label inside each polygon.

A polygon is an example of a group. Each polygon results in one or more rows of training data (pixels), but since pixels in a given group were all labeled together, we would like to keep them together when splitting the data.

What is a subset?

Below we plot the same data, but this time colored by region.

## Region colors, hard-coded from
## dput(RColorBrewer::brewer.pal(3,"Dark2"))
region.colors <- c(
  NW="#1B9E77",
  NE="#D95F02",
  S="#7570B3")
if(require(ggplot2)){
  ## Outline color shows the region, fill shows the label.
  region.points <- geom_point(
    aes(
      x=xcoord,
      y=ycoord,
      color=region3,
      fill=y),
    shape=21,
    data=AZdt)
  ggplot()+
    tree.fill.scale+
    scale_color_manual(values=region.colors)+
    region.points+
    coord_equal()
}

We can see in the plot above that there are three values in the region3 column: NE, NW, and S (different geographical regions on the map which are well-separated). We would like to know if it is possible to train on one region, and then accurately predict on another region.

Cross-validation

First we create a task:

## Classification task on the AZtrees table, predicting the label column y.
ctask <- mlr3::TaskClassif$new(
  id="AZtrees",
  backend=AZdt,
  target="y")
## region3 values define the subsets, polygon values define the groups,
## and the label y is used as the stratum.
ctask$col_roles$subset <- "region3"
ctask$col_roles$group <- "polygon"
ctask$col_roles$stratum <- "y"
## Only columns whose name contains SAMPLE are used as features.
sample.features <- grep("SAMPLE", names(AZdt), value=TRUE)
ctask$col_roles$feature <- sample.features
## Display all column roles of the task.
str(ctask$col_roles)
#>List of 11
#> $ feature        : chr [1:21] "SAMPLE_1" "SAMPLE_2" "SAMPLE_3" "SAMPLE_4" ...
#> $ target         : chr "y"
#> $ name           : chr(0) 
#> $ order          : chr(0) 
#> $ stratum        : chr "y"
#> $ group          : chr "polygon"
#> $ offset         : chr(0) 
#> $ weights_learner: chr(0) 
#> $ weights_measure: chr(0) 
#> $ subset         : chr "region3"
#> $ fold           : chr(0)

Then we can instantiate the CV to see how it works (but usually you do not need to instantiate, if you are using benchmark it does it for you).

## Create the resampling with 3 folds and instantiate it on the task, so
## that the table of train/test splits can be inspected.
same.other.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
same.other.cv$param_set$values$folds <- 3
same.other.cv$instantiate(ctask)
## One row per train/test split. train is a list column of row indices, so
## lengths() gives the number of train rows per split as an integer vector
## (type-stable, unlike sapply(train, length)).
same.other.cv$instance$iteration.dt[, .(
  train.subsets, test.fold, test.subset, n.train.groups,
  train.rows=lengths(train))]

train.subsets	test.fold	test.subset	n.train.groups	train.rows
all	1	NE	125	3108
all	1	NW	125	3108
all	1	S	125	3108
all	2	NE	125	4325
all	2	NW	125	4325
⋮	⋮	⋮	⋮	⋮
same	2	NW	21	801
same	2	S	34	2749
same	3	NE	70	979
same	3	NW	21	869
same	3	S	34	2631

The table above has one row per train/test split for which error/accuracy metrics will be computed. The n.train.groups column is the number of polygons which are used in the train set, which is defined as the intersection of the train subsets and the train folds. To double check, below we compute the total number of groups/polygons per subset/region, and the expected number of train groups/polygons.

## Proportion of folds used for training, (folds-1)/folds.
n.folds <- same.other.cv$param_set$values$folds
train.fraction <- (n.folds-1)/n.folds
## Number of distinct polygons in each region.
polygon.counts <- AZdt[, .(polygons=uniqueN(polygon)), by=region3]
## Expected number of train polygons per region; the trailing [] prints.
polygon.counts[, train.polygons := polygons*train.fraction][]

region3	polygons	train.polygons
NE	105	70.000
NW	32	21.333
S	52	34.667

The counts in the train.polygons column above match the numbers in the n.train.groups column of the previous table, up to rounding (the number of polygons in a region is not always divisible by the number of folds). To determine the number of rows of train data, we can look at the train.rows column in the previous table.

Benchmark and test error computation

Below we define the benchmark experiment.

## Create a fresh resampling object; the folds value set on the previous
## object is not carried over.
same.other.cv <- mlr3resampling::ResamplingSameOtherSizesCV$new()
## Start the learner list with the featureless learner; the outer
## parentheses print the list.
(learner.list <- list(
  mlr3::LearnerClassifFeatureless$new()))
#>[[1]]
#>
#>── <LearnerClassifFeatureless> (classif.featureless): Featureless Classification
#>• Model: -
#>• Parameters: method=mode
#>• Packages: mlr3
#>• Predict Types: [response] and prob
#>• Feature Types: logical, integer, numeric, character, factor, ordered,
#>POSIXct, and Date
#>• Encapsulation: none (fallback: -)
#>• Properties: featureless, importance, missings, multiclass, selected_features,
#>twoclass, and weights
#>• Other settings: use_weights = 'use', predict_raw = 'FALSE'
#>
## rpart is optional: add the decision tree learner only if it is installed.
if(requireNamespace("rpart")){
  learner.list[["rpart"]] <- mlr3::LearnerClassifRpart$new()
}
## Learners are R6 objects (reference semantics), so setting the field on
## each element updates the learner stored in the list. Probability
## predictions are requested because classif.auc is computed below.
for(one.learner in learner.list){
  one.learner$predict_type <- "prob"
}
## Fix the random seed before creating the benchmark grid.
set.seed(1)
bench.grid <- mlr3::benchmark_grid(
  tasks=ctask,
  learners=learner.list,
  resamplings=same.other.cv)
bench.grid

task	learner	resampling
TaskClassif:AZtrees	LearnerClassifFeatureless:classif.featureless
TaskClassif:AZtrees	LearnerClassifRpart:classif.rpart

Above we see one row per combination of task, learner, and resampling. Below we compute the benchmark result and test accuracy.

## Run every task/learner/resampling combination in the grid.
bench.result <- mlr3::benchmark(bench.grid)
## Evaluate each train/test split with accuracy and AUC.
measure.list <- mlr3::msrs(c("classif.acc","classif.auc"))
score.dt <- mlr3resampling::score(bench.result, measure.list)
## Print the first row to show which columns the score table has.
score.dt[1]

test.subset	train.subsets	groups	test.fold	test	train	seed	n.train.groups	iteration	Train_subsets	uhash	nr	task	task_id	learner	learner_id	resampling	resampling_id	prediction_test	classif.acc	classif.auc	algorithm
NE	all	125	1	9,10,11,12,13,14,…[352]	1,2,3,4,5,6,…[4462]	1	125	1	all	e664bcb4-4e86-4e50-856c-f4016509bd9f	1	TaskClassif:AZtrees	AZtrees	LearnerClassifFeatureless:classif.featureless	classif.featureless		same_other_sizes_cv		0.784	0.5	featureless

Above we see one row of the result, for one train/test split. Below we plot the accuracy results using two different methods.

## Reshape to long format: the columns classif.acc and classif.auc become
## rows, with the measure name (acc or auc) in column variable and the
## number in column value. The dot is matched literally via [.], so only
## column names containing "classif.acc" or "classif.auc" are selected.
score.long <- melt(
  score.dt,
  measure.vars=measure(variable, pattern="classif[.](acc|auc)"))
if(require(ggplot2)){
  ## One dot per train/test split; one panel per test subset (rows) and
  ## measure (columns), each with its own axis limits.
  ggplot()+
    geom_point(aes(
      value, train.subsets, color=algorithm),
      data=score.long)+
    facet_grid(test.subset ~ variable, labeller=label_both, scales="free")
}

Above we show one dot per train/test split, and another way to do that is via the plot method, as below.

## Plot method for the score table, with the ggplot2 theme my_theme
## (defined at the start of the vignette) added.
plot(score.dt)+my_theme

Below we take the mean/SD over folds.

## Aggregate over folds: one row per algorithm, test subset, train subsets
## and measure, with columns value_mean and value_sd.
score.wide <- dcast(
  data=score.long,
  formula=algorithm + test.subset + train.subsets + variable ~ .,
  fun.aggregate=list(mean, sd),
  value.var="value")
if(require(ggplot2)){
  ## White-filled circle at the mean, outlined in the algorithm color.
  mean.points <- geom_point(
    aes(
      x=value_mean,
      y=train.subsets,
      color=algorithm),
    size=3,
    fill="white",
    shape=21,
    data=score.wide)
  ## Segment from mean+SD to mean-SD; line width depends on the algorithm
  ## (see scale_linewidth_manual below).
  sd.segments <- geom_segment(
    aes(
      x=value_mean+value_sd,
      y=train.subsets,
      color=algorithm,
      linewidth=algorithm,
      xend=value_mean-value_sd,
      yend=train.subsets),
    data=score.wide)
  ggplot()+
    mean.points+
    sd.segments+
    scale_linewidth_manual(values=c(featureless=2, rpart=1))+
    facet_grid(test.subset ~ variable, labeller=label_both, scales="free")+
    scale_x_continuous(
      "Mean +/- SD of test accuracy/AUC over folds/splits")
}

The plot above shows an interesting pattern:

For test subsets NE and NW, training on other subsets is less accurate than training on the same subset. Training on All subsets is no more accurate than training on the same subset. These results suggest that learnable patterns in other subsets are too different to be beneficial for predicting on these subsets.
For test subset S, training on other subsets is slightly more accurate than training on the same subset, and training on all subsets is slightly more accurate still. These results suggest that the learnable pattern is similar enough in the other subsets so as to be beneficial for prediction in subset S.

Another way to visualize these patterns is via the plot method for pvalue objects, as below.

## Compute P-values from the score table, using the default measure.
## NOTE(review): digits=3 presumably controls the number of digits shown
## in the plot -- confirm in the mlr3resampling::pvalue documentation.
AZ_pval <- mlr3resampling::pvalue(score.dt, digits=3)
plot(AZ_pval)+my_theme

The figure above shows P-values for classification accuracy (by default the first measure is used). If we want to compute P-values for AUC, we can use the code below:

## Same as the previous P-value computation, but for the classif.auc
## column of the score table instead of the default measure.
AZ_pval_AUC <- mlr3resampling::pvalue(score.dt, "classif.auc", digits=3)
plot(AZ_pval_AUC)+my_theme

Conclusion

Column roles group, stratum, and subset may be used together, in the same task, in order to perform a cross-validation experiment which captures the structure in the data.

Session info

## Record the R version, platform and package versions used to render
## this vignette.
sessionInfo()
#>R version 4.6.0 (2026-04-24)
#>Platform: x86_64-pc-linux-gnu
#>Running under: Ubuntu 24.04.4 LTS
#>
#>Matrix products: default
#>BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#>LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#>
#>locale:
#> [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#> [4] LC_COLLATE=C           LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#> [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#>[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#>
#>time zone: UTC
#>tzcode source: system (glibc)
#>
#>attached base packages:
#>[1] stats     graphics  grDevices utils     datasets  methods   base     
#>
#>other attached packages:
#>[1] directlabels_2026.4.23   mlr3resampling_2026.4.26 mlr3_1.6.0              
#>[4] future_1.70.0            ggplot2_4.0.3            data.table_1.18.2.1     
#>
#>loaded via a namespace (and not attached):
#> [1] gtable_0.3.6         future.apply_1.20.2  compiler_4.6.0      
#> [4] crayon_1.5.3         rpart_4.1.27         Rcpp_1.1.1-1.1      
#> [7] parallel_4.6.0       globals_0.19.1       scales_1.4.0        
#>[10] uuid_1.2-2           R6_2.6.1             commonmark_2.0.0    
#>[13] mlr3tuning_1.6.0     labeling_0.4.3       palmerpenguins_0.1.1
#>[16] backports_1.5.1      checkmate_2.3.4      paradox_1.0.1       
#>[19] RColorBrewer_1.1-3   mlr3measures_1.3.0   rlang_1.2.0         
#>[22] lgr_0.5.2            litedown_0.9         xfun_0.57           
#>[25] quadprog_1.5-8       mlr3misc_0.21.0      S7_0.2.2            
#>[28] cli_3.6.6            withr_3.0.2          digest_0.6.39       
#>[31] grid_4.6.0           bbotk_1.10.0         lifecycle_1.0.5     
#>[34] vctrs_0.7.3          glue_1.8.1           farver_2.1.2        
#>[37] listenv_0.10.1       codetools_0.2-20     parallelly_1.47.0   
#>[40] tools_4.6.0