microsoft · jmoralez · May 4, 2024 · May 4, 2024 · Jun 10, 2024 · Jun 10, 2024
@@ -106,10 +106,10 @@ if [[ $OS_NAME == "macos" ]]; then
         -target / || exit 1
 fi
 
-# fix for issue where CRAN was not returning {lattice} when using R 3.6
+# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
 # "Warning: dependency ‘lattice’ is not available"
 if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
-    Rscript --vanilla -e "install.packages('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', repos = NULL, lib = '${R_LIB_PATH}')"
+    Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
 else
     # {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
     # This should be unnecessary on R >=4.4.0

@@ -170,7 +170,12 @@ Dataset <- R6::R6Class(
 
             # Check if more categorical features were output over the feature space
             data_is_not_filename <- !is.character(private$raw_data)
-            if (data_is_not_filename && max(private$categorical_feature) > ncol(private$raw_data)) {
+            if (
+              data_is_not_filename
+              && !is.null(private$raw_data)
+              && is.null(private$used_indices)
+              && max(private$categorical_feature) > ncol(private$raw_data)
+            ) {
               stop(
                 "lgb.Dataset.construct: supplied a too large value in categorical_feature: "
                 , max(private$categorical_feature)

@@ -440,6 +440,35 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l
   expect_true(methods::is(bst, "lgb.CVBooster"))
 })
 
+test_that("lgb.Dataset: should be able to be used in lgb.cv() when constructed with categorical feature indices", {
+  data("mtcars")
+  # column 1 is dropped from the feature matrix below, so positions looked up
+  # in the full data.frame have to be shifted down by one
+  cat_idx <- which(names(mtcars) %in% c("cyl", "vs", "am", "gear", "carb")) - 1L
+  dtrain <- lgb.Dataset(
+    data = as.matrix(mtcars[, -1L])
+    , label = mtcars$mpg
+    , categorical_feature = cat_idx
+    , free_raw_data = TRUE
+    , params = list(num_threads = .LGB_MAX_THREADS)
+  )
+  # constructing the Dataset frees the raw data
+  dtrain$construct()
+  # keep training minimal: one round with 2-leaf trees
+  cv_params <- list(
+    objective = "regression"
+    , num_leaves = 2L
+    , verbose = .LGB_VERBOSITY
+    , num_threads = .LGB_MAX_THREADS
+  )
+  # cv should reuse the same categorical features without checking the indices
+  cv_bst <- lgb.cv(params = cv_params, data = dtrain, stratified = FALSE, nrounds = 1L)
+  # the stored parameter is compared against the 0-based form of the indices
+  stored_idx <- unlist(cv_bst$boosters[[1L]]$booster$params$categorical_feature)
+  expect_equal(stored_idx, cat_idx - 1L)
+})
+
+
 test_that("lgb.Dataset: should be able to use and retrieve long feature names", {
   # set one feature to a value longer than the default buffer size used
   # in LGBM_DatasetGetFeatureNames_R
@@ -621,3 +650,12 @@ test_that("lgb.Dataset can be constructed with categorical features and without
     lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
   }, regexp = "supplied a too large value in categorical_feature: 2 but only 1 features")
 })
+
+test_that("lgb.Dataset.slice fails with a categorical feature index greater than the number of features", {
+  # index 3 exceeds the 2 available columns; only construct() should raise, not slice()
+  raw_mat <- matrix(runif(100L), nrow = 50L, ncol = 2L)
+  dfull <- lgb.Dataset(data = raw_mat, categorical_feature = 3L)
+  dsliced <- dfull$slice(seq_len(20L))
+  msg <- "supplied a too large value in categorical_feature: 3 but only 2 features"
+  expect_error(dsliced$construct(), regexp = msg)
+})