Problem: training completes successfully on a relatively small dataset, but fails with an error on a large dataset (40,000,000 rows, 32 numeric features, 16 categorical features):
Traceback (most recent call last):
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/recsys_3_1_ranking_model.py", line 147, in
main(customer_code, path_to_files, catboost_params, current_date, period_days, val_prcnt, test_prcnt)
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/recsys_3_1_ranking_model.py", line 103, in main
ctb_model = classifier.fit(train_pool)
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/ai.catboost_catboost-spark_3.4_2.12-1.2.2.jar/catboost_spark/core.py", line 5362, in fit
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/ai.catboost_catboost-spark_3.4_2.12-1.2.2.jar/catboost_spark/core.py", line 5359, in _fit_with_eval
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/ai.catboost_catboost-spark_3.4_2.12-1.2.2.jar/catboost_spark/core.py", line 5316, in _fit_with_eval
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in call
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/pyspark.zip/pyspark/errors/exceptions/captured.py", line 169, in deco
File "/hadoop/yarn/local/usercache/prophet/appcache/application_1709201690261_0632/container_e121_1709201690261_0632_02_000001/py4j-0.10.9.7-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o576.fit.
: java.lang.OutOfMemoryError: GC overhead limit exceeded
running with: spark-submit --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./environment/bin/python --conf spark.yarn.dist.archives=hdfs:///user/aloha/spark/share/arima_env.tar.gz#update --packages ai.catboost:catboost-spark_3.4_2.12:1.2.2 --master yarn --conf spark.executor.instances=80 --conf spark.executor.memory=10G --conf spark.driver.memory=30G
part of code:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import functions as F
import catboost_spark

# Transform categorical features to an ML column of label indices
categorical_index_cols = [name + '_index' for name in categorical_cols]
string_indexer = StringIndexer(inputCols=categorical_cols,
                               outputCols=categorical_index_cols,
                               handleInvalid='keep')
model_string_indexer = string_indexer.fit(df_ctb_train)
df_indexed_ctb_train = model_string_indexer.transform(df_ctb_train)
df_indexed_ctb_test = model_string_indexer.transform(df_ctb_test)
model_string_indexer.write().overwrite().save(path_for_string_indexer_model)

# Transform all the features into a vector
input_cols = numeric_cols + categorical_index_cols
vec_assembler = VectorAssembler(inputCols=input_cols,
                                outputCol='features')
df_vectored_ctb_train = vec_assembler \
    .transform(df_indexed_ctb_train) \
    .select(F.col('target').alias('label'), F.col('features'), F.col('weight'))
df_vectored_ctb_test = vec_assembler.transform(df_indexed_ctb_test) \
    .select(F.col('target').alias('label'), F.col('features'), F.col('weight'))
vec_assembler.write().overwrite().save(path_for_vector_assembler)

# Transform dataframe to catboost_spark.Pool
train_pool = catboost_spark.Pool(df_vectored_ctb_train) \
    .setLabelCol('label') \
    .setFeaturesCol('features') \
    .setWeightCol('weight')
test_pool = catboost_spark.Pool(df_vectored_ctb_test) \
    .setLabelCol('label') \
    .setFeaturesCol('features')

# Train CatBoostClassifier
classifier = catboost_spark.CatBoostClassifier(**catboost_params)
ctb_model = classifier.fit(train_pool)
ctb_model.write().overwrite().save(path_for_ranking_model)
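For scale, a rough back-of-envelope estimate of the dense feature matrix alone (illustrative arithmetic only; it ignores JVM object overhead and the intermediate indexed/vectorized copies of the dataframe):

rows = 40_000_000
n_features = 32 + 16                    # numeric + indexed categorical
bytes_per_value = 8                     # assuming dense double values
approx_gib = rows * n_features * bytes_per_value / 1024**3
print(f"~{approx_gib:.1f} GiB of raw feature values")  # roughly 14 GiB

Even allowing for overhead, that looks small compared to the ~800 GB of total executor memory (80 executors x 10 GB), which is why the GC overhead error is surprising.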
Changing the Spark startup configuration does not solve the problem. There are a huge number of stages, such as "foreach in DataHelpers.scala:1042" and "toArray in CtrFeatures.scala:100", which are executed sequentially; this takes a long time, and eventually everything ends with the error above. Please tell me where the problem is: the dataframe, the Spark configuration, or something else?
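For reference, the kind of submit-time memory settings that were varied looks like this (the values below are placeholders, not the exact settings tried):

spark-submit --master yarn \
  --conf spark.executor.instances=80 \
  --conf spark.executor.memory=20G \
  --conf spark.executor.memoryOverhead=4G \
  --conf spark.driver.memory=30G \
  ...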
catboost version: 1.2.2