Я пытаюсь обучить случайный лес, используя `rolling_origin()` из набора пакетов tidymodels. Мне нужно, чтобы складки (folds) формировались строго по календарным месяцам. Похоже, что в этом может помочь вложение данных (`nest()`), но `tune_grid()` не находит переменные, когда данные вложены. Как заставить это работать? Ниже приведён воспроизводимый пример.
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(yardstick))
# Toy data ----
# One row per calendar day of 2019. The three predictors (v1, v2, v3) and the
# response (y) are independent standard-normal draws.
dates <- seq(as.Date("2019-01-01"), as.Date("2019-12-31"), by = "day")
l <- length(dates)

# Seed the RNG so the draws are reproducible.
set.seed(1)

# All draws are taken in a single call; the matrix is filled column by column,
# so v1 receives the first `l` values, v2 the next `l`, and so on.
data_set <- data.frame(
  date = dates,
  matrix(
    rnorm(4L * l),
    nrow = l,
    dimnames = list(NULL, c("v1", "v2", "v3", "y"))
  )
)
# Random forest specification ----
# Regression forest fitted through the "ranger" engine. The number of trees is
# left as a tune() placeholder so it can be chosen by the grid search.
model <- set_engine(
  parsnip::rand_forest(mode = "regression", trees = tune()),
  "ranger"
)

# Parameter set holding the single tunable hyperparameter (number of trees).
params <- dials::parameters(trees())
# Tuning grid and workflow ----
# Two candidate values for the tunable parameters, chosen by a
# maximum-entropy design.
grid <- dials::grid_max_entropy(params, size = 2)

# The response y is modelled on the three numeric predictors.
form <- y ~ v1 + v2 + v3

# Bundle the model specification and the formula into one workflow.
model_workflow <- workflows::workflow() %>%
  add_model(model) %>%
  add_formula(form)
# Tuning on the daily data set ----
# Expanding-window resamples built directly on the daily rows: the first
# analysis set has 304 rows, every assessment set has 30 rows, and
# `skip = 30` thins out the resamples that would otherwise start on
# consecutive days.
data_ro_day <- data_set %>%
  rolling_origin(
    initial = 304,
    assess = 30,
    cumulative = TRUE,
    skip = 30
  )

# Evaluate every grid candidate on every resample.
results <- tune_grid(
  model_workflow,
  grid = grid,
  resamples = data_ro_day,
  param_info = params,
  metrics = metric_set(mae, mape, rmse, rsq),
  control = control_grid(verbose = TRUE)
)

# `metric` is passed by name: recent tune releases require optional arguments
# of show_best() to be named, and the named form also works in older releases.
results %>% show_best(metric = "mape", n = 2)
# Tuning with month-aligned folds ----
# Nesting the data by month and passing the nested tibble to rolling_origin()
# leaves every resample with only `year_month` and a list-column, so the
# workflow formula cannot find y, v1, v2 or v3. sliding_period() instead
# groups the daily rows by calendar month for the purpose of building the
# folds, while each split still holds the original daily rows and columns.
data_ro_month <- data_set %>%
  sliding_period(
    index = date,
    period = "month",
    lookback = Inf,  # expanding window, the analogue of cumulative = TRUE
    skip = 9,        # drop the first 9 slices: first analysis set = 10 months
    assess_stop = 1  # assess on the single month after the analysis window
  )

# Evaluate every grid candidate on every monthly resample.
results <- tune_grid(
  model_workflow,
  grid = grid,
  resamples = data_ro_month,
  param_info = params,
  metrics = metric_set(mae, mape, rmse, rsq),
  control = control_grid(verbose = TRUE)
)

# Inspect the per-resample notes to check for errors or warnings raised
# during tuning.
results$.notes