-
Notifications
You must be signed in to change notification settings - Fork 3.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ #41494
Changes from 2 commits
a7c33c5
c340833
ec6f02b
fa91a93
f096999
726afec
1410185
63dea5a
7ad1977
c3a2136
f9b2b3c
5f2382e
0390e9e
f65dbc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -108,25 +108,6 @@ if(UNIX) | |
endif() | ||
endif() | ||
|
||
# Top level cmake dir | ||
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") | ||
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) | ||
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) | ||
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) | ||
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) | ||
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) | ||
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) | ||
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) | ||
option(PYARROW_BUILD_PARQUET_ENCRYPTION | ||
"Build the PyArrow Parquet encryption integration" OFF) | ||
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) | ||
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) | ||
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) | ||
set(PYARROW_CXXFLAGS | ||
"" | ||
CACHE STRING "Compiler flags to append when compiling Arrow") | ||
endif() | ||
|
||
find_program(CCACHE_FOUND ccache) | ||
if(CCACHE_FOUND | ||
AND NOT CMAKE_C_COMPILER_LAUNCHER | ||
|
@@ -265,11 +246,44 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") | |
|
||
include(UseCython) | ||
|
||
# PyArrow C++ | ||
# Arrow C++ and set default PyArrow build options | ||
include(GNUInstallDirs) | ||
|
||
find_package(Arrow REQUIRED) | ||
|
||
# Top level cmake dir | ||
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

@kou for my education, could you explain what this

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

It's for avoiding defining our options when this is used via

We don't need this because:
|
||
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" ${ARROW_ACERO}) | ||
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" ${ARROW_CUDA}) | ||
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" ${ARROW_DATASET}) | ||
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" ${ARROW_FLIGHT}) | ||
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" ${ARROW_GANDIVA}) | ||
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" ${ARROW_ORC}) | ||
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" ${ARROW_PARQUET}) | ||
option(PYARROW_BUILD_PARQUET_ENCRYPTION | ||
"Build the PyArrow Parquet encryption integration" ${PARQUET_REQUIRE_ENCRYPTION}) | ||
option(PYARROW_BUILD_SUBSTRAIT "Build the PyArrow Substrait integration" | ||
${ARROW_SUBSTRAIT}) | ||
option(PYARROW_BUILD_AZURE "Build the PyArrow Azure integration" ${ARROW_AZURE}) | ||
option(PYARROW_BUILD_GCS "Build the PyArrow GCS integration" ${ARROW_GCS}) | ||
option(PYARROW_BUILD_S3 "Build the PyArrow S3 integration" ${ARROW_S3}) | ||
option(PYARROW_BUILD_HDFS "Build the PyArrow HDFS integration" ${ARROW_HDFS}) | ||
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) | ||
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) | ||
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) | ||
set(PYARROW_CXXFLAGS | ||
"" | ||
CACHE STRING "Compiler flags to append when compiling Arrow") | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
endif() | ||
|
||
# enforce module dependencies | ||
if(PYARROW_BUILD_SUBSTRAIT) | ||
set(PYARROW_BUILD_DATASET ON) | ||
endif() | ||
if(PYARROW_BUILD_DATASET) | ||
set(PYARROW_BUILD_ACERO ON) | ||
endif() | ||
Comment on lines
+304
to
+310
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

This is to ensure we keep the same logic as we have now in setup.py, but we could in theory also leave this out (and ensure the scripts don't explicitly disable Acero when enabling Dataset) |
||
|
||
# PyArrow C++ | ||
set(PYARROW_CPP_ROOT_DIR pyarrow/src) | ||
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python) | ||
set(PYARROW_CPP_SRCS | ||
|
@@ -305,6 +319,7 @@ set(PYARROW_CPP_LINK_LIBS "") | |
|
||
# Check all the options from Arrow and PyArrow C++ to be in line | ||
if(PYARROW_BUILD_DATASET) | ||
message(STATUS "Building PyArrow Dataset") | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if(NOT ARROW_DATASET) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON") | ||
endif() | ||
|
@@ -317,6 +332,7 @@ if(PYARROW_BUILD_DATASET) | |
endif() | ||
|
||
if(PYARROW_BUILD_ACERO) | ||
message(STATUS "Building PyArrow Acero") | ||
if(NOT ARROW_ACERO) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON") | ||
endif() | ||
|
@@ -329,18 +345,13 @@ if(PYARROW_BUILD_ACERO) | |
endif() | ||
|
||
if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) | ||
message(STATUS "Building PyArrow Parquet") | ||
if(NOT ARROW_PARQUET) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON") | ||
endif() | ||
find_package(Parquet REQUIRED) | ||
endif() | ||
|
||
if(PYARROW_BUILD_HDFS) | ||
if(NOT ARROW_HDFS) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") | ||
endif() | ||
endif() | ||
|
||
# Check for only Arrow C++ options | ||
if(ARROW_CSV) | ||
list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc) | ||
|
@@ -400,6 +411,7 @@ endif() | |
|
||
set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) | ||
if(PYARROW_BUILD_FLIGHT) | ||
message(STATUS "Building PyArrow Flight") | ||
if(NOT ARROW_FLIGHT) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON") | ||
endif() | ||
|
@@ -555,23 +567,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE) | |
set(LINK_LIBS arrow_python) | ||
|
||
if(PYARROW_BUILD_AZURE) | ||
message(STATUS "Building PyArrow Azure") | ||
if(NOT ARROW_AZURE) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _azurefs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_GCS) | ||
message(STATUS "Building PyArrow GCS") | ||
if(NOT ARROW_GCS) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _gcsfs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_S3) | ||
message(STATUS "Building PyArrow S3") | ||
if(NOT ARROW_S3) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _s3fs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_HDFS) | ||
message(STATUS "Building PyArrow HDFS") | ||
if(NOT ARROW_HDFS) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") | ||
endif() | ||
list(APPEND CYTHON_EXTENSIONS _hdfs) | ||
endif() | ||
|
||
if(PYARROW_BUILD_CUDA) | ||
# Arrow CUDA | ||
message(STATUS "Building PyArrow CUDA") | ||
if(NOT ARROW_CUDA) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON") | ||
endif() | ||
|
@@ -646,8 +674,9 @@ if(PYARROW_BUILD_PARQUET) | |
endif() | ||
endif() | ||
|
||
# ORC | ||
if(PYARROW_BUILD_ORC) | ||
# ORC | ||
message(STATUS "Building PyArrow ORC") | ||
if(NOT ARROW_ORC) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON") | ||
endif() | ||
|
@@ -679,6 +708,7 @@ endif() | |
|
||
# Substrait | ||
if(PYARROW_BUILD_SUBSTRAIT) | ||
message(STATUS "Building PyArrow Substrait") | ||
if(NOT ARROW_SUBSTRAIT) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON") | ||
endif() | ||
|
@@ -696,6 +726,7 @@ endif() | |
|
||
# Gandiva | ||
if(PYARROW_BUILD_GANDIVA) | ||
message(STATUS "Building PyArrow Gandiva") | ||
if(NOT ARROW_GANDIVA) | ||
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON") | ||
endif() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -152,48 +152,13 @@ def initialize_options(self): | |
if not hasattr(sys, 'gettotalrefcount'): | ||
self.build_type = 'release' | ||
|
||
self.with_azure = strtobool( | ||
os.environ.get('PYARROW_WITH_AZURE', '0')) | ||
self.with_gcs = strtobool( | ||
os.environ.get('PYARROW_WITH_GCS', '0')) | ||
self.with_s3 = strtobool( | ||
os.environ.get('PYARROW_WITH_S3', '0')) | ||
self.with_hdfs = strtobool( | ||
os.environ.get('PYARROW_WITH_HDFS', '0')) | ||
self.with_cuda = strtobool( | ||
os.environ.get('PYARROW_WITH_CUDA', '0')) | ||
self.with_substrait = strtobool( | ||
os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) | ||
self.with_flight = strtobool( | ||
os.environ.get('PYARROW_WITH_FLIGHT', '0')) | ||
self.with_acero = strtobool( | ||
os.environ.get('PYARROW_WITH_ACERO', '0')) | ||
self.with_dataset = strtobool( | ||
os.environ.get('PYARROW_WITH_DATASET', '0')) | ||
self.with_parquet = strtobool( | ||
os.environ.get('PYARROW_WITH_PARQUET', '0')) | ||
self.with_parquet_encryption = strtobool( | ||
os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0')) | ||
self.with_orc = strtobool( | ||
os.environ.get('PYARROW_WITH_ORC', '0')) | ||
self.with_gandiva = strtobool( | ||
os.environ.get('PYARROW_WITH_GANDIVA', '0')) | ||
self.generate_coverage = strtobool( | ||
os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) | ||
self.bundle_arrow_cpp = strtobool( | ||
os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) | ||
self.bundle_cython_cpp = strtobool( | ||
os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) | ||
|
||
self.with_parquet_encryption = (self.with_parquet_encryption and | ||
self.with_parquet) | ||
|
||
# enforce module dependencies | ||
if self.with_substrait: | ||
self.with_dataset = True | ||
if self.with_dataset: | ||
self.with_acero = True | ||
|
||
CYTHON_MODULE_NAMES = [ | ||
'lib', | ||
'_fs', | ||
|
@@ -270,23 +235,28 @@ def append_cmake_bool(value, varname): | |
cmake_options.append('-D{0}={1}'.format( | ||
varname, 'on' if value else 'off')) | ||
|
||
def maybe_append_cmake_bool(varname): | ||
env_var = varname.replace("PYARROW_BUILD", "PYARROW_WITH") | ||
if env_var in os.environ: | ||
value = strtobool(os.environ.get(env_var)) | ||
append_cmake_bool(value, varname) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about doing this in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did that initially in a first version locally, but then moved it again to setup.py to avoid the same problem with it being cached. But with you suggestion above to use an AUTO default, then I can indeed move it back to the cmake. |
||
|
||
if self.cmake_generator: | ||
cmake_options += ['-G', self.cmake_generator] | ||
|
||
append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') | ||
append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT') | ||
append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') | ||
append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') | ||
append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO') | ||
append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') | ||
append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') | ||
append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') | ||
append_cmake_bool(self.with_parquet_encryption, | ||
'PYARROW_BUILD_PARQUET_ENCRYPTION') | ||
append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') | ||
append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') | ||
append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') | ||
append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') | ||
maybe_append_cmake_bool('PYARROW_BUILD_CUDA') | ||
maybe_append_cmake_bool('PYARROW_BUILD_SUBSTRAIT') | ||
maybe_append_cmake_bool('PYARROW_BUILD_FLIGHT') | ||
maybe_append_cmake_bool('PYARROW_BUILD_GANDIVA') | ||
maybe_append_cmake_bool('PYARROW_BUILD_ACERO') | ||
maybe_append_cmake_bool('PYARROW_BUILD_DATASET') | ||
maybe_append_cmake_bool('PYARROW_BUILD_ORC') | ||
maybe_append_cmake_bool('PYARROW_BUILD_PARQUET') | ||
maybe_append_cmake_bool('PYARROW_BUILD_PARQUET_ENCRYPTION') | ||
maybe_append_cmake_bool('PYARROW_BUILD_AZURE') | ||
maybe_append_cmake_bool('PYARROW_BUILD_GCS') | ||
maybe_append_cmake_bool('PYARROW_BUILD_S3') | ||
maybe_append_cmake_bool('PYARROW_BUILD_HDFS') | ||
append_cmake_bool(self.bundle_arrow_cpp, | ||
'PYARROW_BUNDLE_ARROW_CPP') | ||
append_cmake_bool(self.bundle_cython_cpp, | ||
|
@@ -329,54 +299,8 @@ def append_cmake_bool(value, varname): | |
self._found_names = [] | ||
for name in self.CYTHON_MODULE_NAMES: | ||
built_path = pjoin(install_prefix, name + ext_suffix) | ||
if not os.path.exists(built_path): | ||
print(f'Did not find {built_path}') | ||
if self._failure_permitted(name): | ||
print(f'Cython module {name} failure permitted') | ||
continue | ||
raise RuntimeError('PyArrow C-extension failed to build:', | ||
os.path.abspath(built_path)) | ||
|
||
self._found_names.append(name) | ||
|
||
def _failure_permitted(self, name): | ||
if name == '_parquet' and not self.with_parquet: | ||
return True | ||
if name == '_parquet_encryption' and not self.with_parquet_encryption: | ||
return True | ||
if name == '_orc' and not self.with_orc: | ||
return True | ||
if name == '_flight' and not self.with_flight: | ||
return True | ||
if name == '_substrait' and not self.with_substrait: | ||
return True | ||
if name == '_azurefs' and not self.with_azure: | ||
return True | ||
if name == '_gcsfs' and not self.with_gcs: | ||
return True | ||
if name == '_s3fs' and not self.with_s3: | ||
return True | ||
if name == '_hdfs' and not self.with_hdfs: | ||
return True | ||
if name == '_dataset' and not self.with_dataset: | ||
return True | ||
if name == '_acero' and not self.with_acero: | ||
return True | ||
if name == '_exec_plan' and not self.with_acero: | ||
return True | ||
if name == '_dataset_orc' and not ( | ||
self.with_orc and self.with_dataset | ||
): | ||
return True | ||
if name == '_dataset_parquet' and not ( | ||
self.with_parquet and self.with_dataset | ||
): | ||
return True | ||
if name == '_cuda' and not self.with_cuda: | ||
return True | ||
if name == 'gandiva' and not self.with_gandiva: | ||
return True | ||
return False | ||
if os.path.exists(built_path): | ||
self._found_names.append(name) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Do we need to prepare

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

It's used in

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Hmm. It seems that the setuptools

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

It does have

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Ah, sorry. I checked wrong name... |
||
|
||
def _get_build_dir(self): | ||
# Get the package directory from build_py | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh... These patterns are wrong...
Could you use
^XXX/.*CMakeLists\.txt$|
for all existing patterns too?

There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.

Opened #41689 specifically for that