Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ #41494

Merged
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ repos:
?^go/.*/CMakeLists\.txt$|
?^java/.*/CMakeLists\.txt$|
?^matlab/.*/CMakeLists\.txt$|
?^python/CMakeLists\.txt$|
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh... These patterns are wrong...

Could you use ^XXX/.*CMakeLists\.txt$| for all existing patterns too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #41689 specifically for that

?^python/.*/CMakeLists\.txt$|
)
exclude: >-
Expand Down
1 change: 0 additions & 1 deletion ci/appveyor-cpp-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC%
set PYARROW_WITH_PARQUET=ON
set PYARROW_WITH_PARQUET_ENCRYPTION=ON
set PYARROW_WITH_S3=%ARROW_S3%
set PYARROW_WITH_STATIC_BOOST=ON
set PYARROW_WITH_SUBSTRAIT=ON

set ARROW_HOME=%CONDA_PREFIX%\Library
Expand Down
122 changes: 93 additions & 29 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,25 +108,6 @@ if(UNIX)
endif()
endif()

# Top level cmake dir
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF)
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF)
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF)
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF)
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
option(PYARROW_BUILD_PARQUET_ENCRYPTION
"Build the PyArrow Parquet encryption integration" OFF)
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling Arrow")
endif()

find_program(CCACHE_FOUND ccache)
if(CCACHE_FOUND
AND NOT CMAKE_C_COMPILER_LAUNCHER
Expand Down Expand Up @@ -265,11 +246,77 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}")

include(UseCython)

# PyArrow C++
# Arrow C++ and set default PyArrow build options
include(GNUInstallDirs)

find_package(Arrow REQUIRED)

macro(define_option name description arrow_option)
set("PYARROW_${name}"
"AUTO"
CACHE STRING ${description})

if("${PYARROW_${name}}" STREQUAL "AUTO")
# by default, first check if env variable exists, otherwise use Arrow C++ config
set(env_variable "PYARROW_WITH_${name}")
if(DEFINED ENV{${env_variable}})
message(STATUS "Env variable is defined: ${env_variable}=$ENV{${env_variable}}")
if($ENV{${env_variable}})
set("PYARROW_BUILD_${name}" ON)
message(STATUS "Setting ${name} to ON through env variable")
else()
set("PYARROW_BUILD_${name}" OFF)
message(STATUS "Setting ${name} to OFF through env variable")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made this quite verbose, mostly for testing / debugging purposes. But can trim most of the messages after everything is working if that is preferred.

endif()
else()
if(${arrow_option})
set("PYARROW_BUILD_${name}" ON)
message(STATUS "Setting ${name} to ON through Arrow C++ config")
else()
set("PYARROW_BUILD_${name}" OFF)
message(STATUS "Setting ${name} to OFF through Arrow C++ config")
endif()
endif()
else()
if("${PYARROW_${name}}")
set("PYARROW_BUILD_${name}" ON)
message(STATUS "Setting ${name} to ON through CMake option")
else()
set("PYARROW_BUILD_${name}" OFF)
message(STATUS "Setting ${name} to OFF through CMake option")
endif()
endif()
endmacro()

define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO)
define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA)
define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET)
define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT)
define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA)
define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC)
define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET)
define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration"
PARQUET_REQUIRE_ENCRYPTION)
define_option(SUBSTRAIT "Build the PyArrow Substrait integration" ARROW_SUBSTRAIT)
define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE)
define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS)
define_option(S3 "Build the PyArrow S3 integration" ARROW_S3)
define_option(HDFS "Build the PyArrow HDFS integration" ARROW_HDFS)
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling PyArrow C++")

# enforce module dependencies
if(PYARROW_BUILD_SUBSTRAIT)
set(PYARROW_BUILD_DATASET ON)
endif()
if(PYARROW_BUILD_DATASET)
set(PYARROW_BUILD_ACERO ON)
endif()
Comment on lines +304 to +310
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to ensure we keep the same logic as we have now in setup.py, but we could in theory also leave this out (and ensure the scripts don't explicitly disable Acero when enabling Dataset)


# PyArrow C++
set(PYARROW_CPP_ROOT_DIR pyarrow/src)
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python)
set(PYARROW_CPP_SRCS
Expand Down Expand Up @@ -305,6 +352,7 @@ set(PYARROW_CPP_LINK_LIBS "")

# Check all the options from Arrow and PyArrow C++ to be in line
if(PYARROW_BUILD_DATASET)
message(STATUS "Building PyArrow with Dataset")
if(NOT ARROW_DATASET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON")
endif()
Expand All @@ -317,6 +365,7 @@ if(PYARROW_BUILD_DATASET)
endif()

if(PYARROW_BUILD_ACERO)
message(STATUS "Building PyArrow with Acero")
if(NOT ARROW_ACERO)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON")
endif()
Expand All @@ -329,18 +378,13 @@ if(PYARROW_BUILD_ACERO)
endif()

if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION)
message(STATUS "Building PyArrow with Parquet")
if(NOT ARROW_PARQUET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON")
endif()
find_package(Parquet REQUIRED)
endif()

if(PYARROW_BUILD_HDFS)
if(NOT ARROW_HDFS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
endif()
endif()

# Check for only Arrow C++ options
if(ARROW_CSV)
list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc)
Expand Down Expand Up @@ -400,6 +444,7 @@ endif()

set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc)
if(PYARROW_BUILD_FLIGHT)
message(STATUS "Building PyArrow with Flight")
if(NOT ARROW_FLIGHT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON")
endif()
Expand Down Expand Up @@ -555,23 +600,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE)
set(LINK_LIBS arrow_python)

if(PYARROW_BUILD_AZURE)
message(STATUS "Building PyArrow with Azure")
if(NOT ARROW_AZURE)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _azurefs)
endif()

if(PYARROW_BUILD_GCS)
message(STATUS "Building PyArrow with GCS")
if(NOT ARROW_GCS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _gcsfs)
endif()

if(PYARROW_BUILD_S3)
message(STATUS "Building PyArrow with S3")
if(NOT ARROW_S3)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _s3fs)
endif()

if(PYARROW_BUILD_HDFS)
message(STATUS "Building PyArrow with HDFS")
if(NOT ARROW_HDFS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _hdfs)
endif()

if(PYARROW_BUILD_CUDA)
# Arrow CUDA
message(STATUS "Building PyArrow with CUDA")
if(NOT ARROW_CUDA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON")
endif()
Expand Down Expand Up @@ -646,8 +707,9 @@ if(PYARROW_BUILD_PARQUET)
endif()
endif()

# ORC
if(PYARROW_BUILD_ORC)
# ORC
message(STATUS "Building PyArrow with ORC")
if(NOT ARROW_ORC)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON")
endif()
Expand Down Expand Up @@ -679,6 +741,7 @@ endif()

# Substrait
if(PYARROW_BUILD_SUBSTRAIT)
message(STATUS "Building PyArrow with Substrait")
if(NOT ARROW_SUBSTRAIT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON")
endif()
Expand All @@ -696,6 +759,7 @@ endif()

# Gandiva
if(PYARROW_BUILD_GANDIVA)
message(STATUS "Building PyArrow with Gandiva")
if(NOT ARROW_GANDIVA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON")
endif()
Expand Down
134 changes: 37 additions & 97 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,48 +152,27 @@ def initialize_options(self):
if not hasattr(sys, 'gettotalrefcount'):
self.build_type = 'release'

self.with_azure = strtobool(
os.environ.get('PYARROW_WITH_AZURE', '0'))
self.with_gcs = strtobool(
os.environ.get('PYARROW_WITH_GCS', '0'))
self.with_s3 = strtobool(
os.environ.get('PYARROW_WITH_S3', '0'))
self.with_hdfs = strtobool(
os.environ.get('PYARROW_WITH_HDFS', '0'))
self.with_cuda = strtobool(
os.environ.get('PYARROW_WITH_CUDA', '0'))
self.with_substrait = strtobool(
os.environ.get('PYARROW_WITH_SUBSTRAIT', '0'))
self.with_flight = strtobool(
os.environ.get('PYARROW_WITH_FLIGHT', '0'))
self.with_acero = strtobool(
os.environ.get('PYARROW_WITH_ACERO', '0'))
self.with_dataset = strtobool(
os.environ.get('PYARROW_WITH_DATASET', '0'))
self.with_parquet = strtobool(
os.environ.get('PYARROW_WITH_PARQUET', '0'))
self.with_parquet_encryption = strtobool(
os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0'))
self.with_orc = strtobool(
os.environ.get('PYARROW_WITH_ORC', '0'))
self.with_gandiva = strtobool(
os.environ.get('PYARROW_WITH_GANDIVA', '0'))
self.with_azure = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need self.with_XXX? It seems that nobody sets them because we remove os.environ.get(...).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they are still used in case someone is passing that to setup.py. See #41494 (comment)

I think that's something we should deprecate, though. But planning to do a separate follow-up PR to that, since it's not actually related to this PR (it's just another way that someone can right now override the default, just as setting an environment variable)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, we need them for --with-XXX options!
I see.

self.with_gcs = None
self.with_s3 = None
self.with_hdfs = None
self.with_cuda = None
self.with_substrait = None
self.with_flight = None
self.with_acero = None
self.with_dataset = None
self.with_parquet = None
self.with_parquet_encryption = None
self.with_orc = None
self.with_gandiva = None

self.generate_coverage = strtobool(
os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
self.bundle_arrow_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0'))
self.bundle_cython_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0'))

self.with_parquet_encryption = (self.with_parquet_encryption and
self.with_parquet)

# enforce module dependencies
if self.with_substrait:
self.with_dataset = True
if self.with_dataset:
self.with_acero = True

CYTHON_MODULE_NAMES = [
'lib',
'_fs',
Expand Down Expand Up @@ -270,23 +249,30 @@ def append_cmake_bool(value, varname):
cmake_options.append('-D{0}={1}'.format(
varname, 'on' if value else 'off'))

def append_cmake_component(flag, varname):
# only pass this to cmake is the user pass the --with-component
# flag to setup.py build_ext
if flag is not None:
append_cmake_bool(flag, varname)

if self.cmake_generator:
cmake_options += ['-G', self.cmake_generator]

append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA')
append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT')
append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT')
append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA')
append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO')
append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET')
append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC')
append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET')
append_cmake_bool(self.with_parquet_encryption,
'PYARROW_BUILD_PARQUET_ENCRYPTION')
append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS')
append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3')
append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS')
append_cmake_component(self.with_cuda, 'PYARROW_CUDA')
append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT')
append_cmake_component(self.with_flight, 'PYARROW_FLIGHT')
append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA')
append_cmake_component(self.with_acero, 'PYARROW_ACERO')
append_cmake_component(self.with_dataset, 'PYARROW_DATASET')
append_cmake_component(self.with_orc, 'PYARROW_ORC')
append_cmake_component(self.with_parquet, 'PYARROW_PARQUET')
append_cmake_component(self.with_parquet_encryption,
'PYARROW_PARQUET_ENCRYPTION')
append_cmake_component(self.with_azure, 'PYARROW_AZURE')
append_cmake_component(self.with_gcs, 'PYARROW_GCS')
append_cmake_component(self.with_s3, 'PYARROW_S3')
append_cmake_component(self.with_hdfs, 'PYARROW_HDFS')

append_cmake_bool(self.bundle_arrow_cpp,
'PYARROW_BUNDLE_ARROW_CPP')
append_cmake_bool(self.bundle_cython_cpp,
Expand Down Expand Up @@ -329,54 +315,8 @@ def append_cmake_bool(value, varname):
self._found_names = []
for name in self.CYTHON_MODULE_NAMES:
built_path = pjoin(install_prefix, name + ext_suffix)
if not os.path.exists(built_path):
print(f'Did not find {built_path}')
if self._failure_permitted(name):
print(f'Cython module {name} failure permitted')
continue
raise RuntimeError('PyArrow C-extension failed to build:',
os.path.abspath(built_path))

self._found_names.append(name)

def _failure_permitted(self, name):
if name == '_parquet' and not self.with_parquet:
return True
if name == '_parquet_encryption' and not self.with_parquet_encryption:
return True
if name == '_orc' and not self.with_orc:
return True
if name == '_flight' and not self.with_flight:
return True
if name == '_substrait' and not self.with_substrait:
return True
if name == '_azurefs' and not self.with_azure:
return True
if name == '_gcsfs' and not self.with_gcs:
return True
if name == '_s3fs' and not self.with_s3:
return True
if name == '_hdfs' and not self.with_hdfs:
return True
if name == '_dataset' and not self.with_dataset:
return True
if name == '_acero' and not self.with_acero:
return True
if name == '_exec_plan' and not self.with_acero:
return True
if name == '_dataset_orc' and not (
self.with_orc and self.with_dataset
):
return True
if name == '_dataset_parquet' and not (
self.with_parquet and self.with_dataset
):
return True
if name == '_cuda' and not self.with_cuda:
return True
if name == 'gandiva' and not self.with_gandiva:
return True
return False
if os.path.exists(built_path):
self._found_names.append(name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to prepare self._found_names?
It seems that nobody uses it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used in get_names -> get_outputs methods. We don't use those ourselves, but I am assuming this might be part of the setuptools build_ext class interface

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. It seems that the setuptools build_ext doesn't have get_names interface:

>>> import setuptools.command.build_ext
>>> setuptools.command.build_ext.build_ext
<class 'setuptools.command.build_ext.build_ext'>
>>> 'get_names' in dir(setuptools.command.build_ext.build_ext)
False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does have get_outputs:

In [9]: 'get_outputs' in dir(setuptools.command.build_ext.build_ext)
Out[9]: True

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, sorry. I checked wrong name...


def _get_build_dir(self):
# Get the package directory from build_py
Expand Down