Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[R-package] skip integer categorical feature check when building dataset subset (fixes #6412) #6442

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions .ci/test_r_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ if [[ $OS_NAME == "macos" ]]; then
-target / || exit 1
fi

# fix for issue where CRAN was not returning {lattice} when using R 3.6
# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
# "Warning: dependency ‘lattice’ is not available"
if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
Rscript --vanilla -e "install.packages('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', repos = NULL, lib = '${R_LIB_PATH}')"
Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
else
# {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
# This should be unnecessary on R >=4.4.0
Expand Down
7 changes: 6 additions & 1 deletion R-package/R/lgb.Dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,12 @@ Dataset <- R6::R6Class(

# Check if more categorical features were output over the feature space
data_is_not_filename <- !is.character(private$raw_data)
if (data_is_not_filename && max(private$categorical_feature) > ncol(private$raw_data)) {
if (
data_is_not_filename
&& !is.null(private$raw_data)
&& is.null(private$used_indices)
&& max(private$categorical_feature) > ncol(private$raw_data)
) {
stop(
"lgb.Dataset.construct: supplied a too large value in categorical_feature: "
, max(private$categorical_feature)
Expand Down
38 changes: 38 additions & 0 deletions R-package/tests/testthat/test_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,35 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l
expect_true(methods::is(bst, "lgb.CVBooster"))
})

test_that("lgb.Dataset: should be able to be used in lgb.cv() when constructed with categorical feature indices", {
    data("mtcars")
    target <- mtcars$mpg
    # drop the first column (mpg), which is used as the label
    features <- as.matrix(mtcars[, -1L])
    # 1-based positions of the categorical columns within the feature matrix
    cat_idx <- which(colnames(features) %in% c("cyl", "vs", "am", "gear", "carb"))
    dtrain <- lgb.Dataset(
        data = features
        , label = target
        , categorical_feature = cat_idx
        , free_raw_data = TRUE
        , params = list(num_threads = .LGB_MAX_THREADS)
    )
    # with free_raw_data = TRUE, constructing the Dataset frees the raw data
    dtrain$construct()
    cv_params <- list(
        objective = "regression"
        , num_leaves = 2L
        , verbose = .LGB_VERBOSITY
        , num_threads = .LGB_MAX_THREADS
    )
    # lgb.cv() has to work on the already-constructed Dataset, i.e. reuse the
    # categorical features without re-validating the indices against raw data
    cv_result <- lgb.cv(
        params = cv_params
        , data = dtrain
        , nrounds = 1L
        , stratified = FALSE
    )
    observed <- unlist(cv_result$boosters[[1L]]$booster$params$categorical_feature)
    # the booster params hold 0-based indices
    expected_zero_based <- cat_idx - 1L
    expect_equal(observed, expected_zero_based)
})


test_that("lgb.Dataset: should be able to use and retrieve long feature names", {
# set one feature to a value longer than the default buffer size used
# in LGBM_DatasetGetFeatureNames_R
Expand Down Expand Up @@ -621,3 +650,12 @@ test_that("lgb.Dataset can be constructed with categorical features and without
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
}, regexp = "supplied a too large value in categorical_feature: 2 but only 1 features")
})

test_that("lgb.Dataset.slice fails with a categorical feature index greater than the number of features", {
    # 50 x 2 feature matrix, so a categorical index of 3 is out of range
    raw_mat <- matrix(runif(100L), nrow = 50L, ncol = 2L)
    full_ds <- lgb.Dataset(data = raw_mat, categorical_feature = 3L)
    # take the first 20 rows as a subset Dataset
    sliced_ds <- full_ds$slice(seq_len(20L))
    # constructing the subset is expected to surface the invalid index
    expect_error(
        sliced_ds$construct()
        , regexp = "supplied a too large value in categorical_feature: 3 but only 2 features"
    )
})