Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ #41494

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ repos:
?^go/.*/CMakeLists\.txt$|
?^java/.*/CMakeLists\.txt$|
?^matlab/.*/CMakeLists\.txt$|
?^python/CMakeLists\.txt$|
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh... These patterns are wrong...

Could you use ^XXX/.*CMakeLists\.txt$| for all existing patterns too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #41689 specifically for that

?^python/.*/CMakeLists\.txt$|
)
exclude: >-
Expand Down
1 change: 0 additions & 1 deletion ci/appveyor-cpp-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC%
set PYARROW_WITH_PARQUET=ON
set PYARROW_WITH_PARQUET_ENCRYPTION=ON
set PYARROW_WITH_S3=%ARROW_S3%
set PYARROW_WITH_STATIC_BOOST=ON
set PYARROW_WITH_SUBSTRAIT=ON

set ARROW_HOME=%CONDA_PREFIX%\Library
Expand Down
2 changes: 1 addition & 1 deletion ci/scripts/python_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ fi
export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR:-Ninja}
export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE:-debug}

export PYARROW_WITH_ACERO=${ARROW_ACERO:-OFF}
export PYARROW_WITH_ACERO=${ARROW_ACERO:-ON}
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
export PYARROW_WITH_AZURE=${ARROW_AZURE:-OFF}
export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF}
export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON}
Expand Down
89 changes: 60 additions & 29 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,25 +108,6 @@ if(UNIX)
endif()
endif()

# Top level cmake dir
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF)
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF)
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF)
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF)
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
option(PYARROW_BUILD_PARQUET_ENCRYPTION
"Build the PyArrow Parquet encryption integration" OFF)
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling Arrow")
endif()

find_program(CCACHE_FOUND ccache)
if(CCACHE_FOUND
AND NOT CMAKE_C_COMPILER_LAUNCHER
Expand Down Expand Up @@ -265,11 +246,44 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}")

include(UseCython)

# PyArrow C++
# Arrow C++ and set default PyArrow build options
include(GNUInstallDirs)

find_package(Arrow REQUIRED)

# Top level cmake dir
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kou for my education, could you explain what this if check is exactly doing or why we need it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's for avoiding defining our options when this is used via add_subdirectory() (or FetchContent).

We don't need this because:

  • This is never used via add_subdirectory()
  • Our option definitions are harmless with add_subdirectory() use

option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" ${ARROW_ACERO})
option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" ${ARROW_CUDA})
option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" ${ARROW_DATASET})
option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" ${ARROW_FLIGHT})
option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" ${ARROW_GANDIVA})
option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" ${ARROW_ORC})
option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" ${ARROW_PARQUET})
option(PYARROW_BUILD_PARQUET_ENCRYPTION
"Build the PyArrow Parquet encryption integration" ${PARQUET_REQUIRE_ENCRYPTION})
option(PYARROW_BUILD_SUBSTRAIT "Build the PyArrow Substrait integration"
${ARROW_SUBSTRAIT})
option(PYARROW_BUILD_AZURE "Build the PyArrow Azure integration" ${ARROW_AZURE})
option(PYARROW_BUILD_GCS "Build the PyArrow GCS integration" ${ARROW_GCS})
option(PYARROW_BUILD_S3 "Build the PyArrow S3 integration" ${ARROW_S3})
option(PYARROW_BUILD_HDFS "Build the PyArrow HDFS integration" ${ARROW_HDFS})
option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
set(PYARROW_CXXFLAGS
""
CACHE STRING "Compiler flags to append when compiling Arrow")
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
endif()

# enforce module dependencies
if(PYARROW_BUILD_SUBSTRAIT)
set(PYARROW_BUILD_DATASET ON)
endif()
if(PYARROW_BUILD_DATASET)
set(PYARROW_BUILD_ACERO ON)
endif()
Comment on lines +304 to +310
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to ensure we keep the same logic as we have now in setup.py, but we could in theory also leave this out (and ensure the scripts don't explicitly disable Acero when enabling Dataset)


# PyArrow C++
set(PYARROW_CPP_ROOT_DIR pyarrow/src)
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python)
set(PYARROW_CPP_SRCS
Expand Down Expand Up @@ -305,6 +319,7 @@ set(PYARROW_CPP_LINK_LIBS "")

# Check all the options from Arrow and PyArrow C++ to be in line
if(PYARROW_BUILD_DATASET)
message(STATUS "Building PyArrow Dataset")
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
if(NOT ARROW_DATASET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON")
endif()
Expand All @@ -317,6 +332,7 @@ if(PYARROW_BUILD_DATASET)
endif()

if(PYARROW_BUILD_ACERO)
message(STATUS "Building PyArrow Acero")
if(NOT ARROW_ACERO)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON")
endif()
Expand All @@ -329,18 +345,13 @@ if(PYARROW_BUILD_ACERO)
endif()

if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION)
message(STATUS "Building PyArrow Parquet")
if(NOT ARROW_PARQUET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON")
endif()
find_package(Parquet REQUIRED)
endif()

if(PYARROW_BUILD_HDFS)
if(NOT ARROW_HDFS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
endif()
endif()

# Check for only Arrow C++ options
if(ARROW_CSV)
list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc)
Expand Down Expand Up @@ -400,6 +411,7 @@ endif()

set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc)
if(PYARROW_BUILD_FLIGHT)
message(STATUS "Building PyArrow Flight")
if(NOT ARROW_FLIGHT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON")
endif()
Expand Down Expand Up @@ -555,23 +567,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE)
set(LINK_LIBS arrow_python)

if(PYARROW_BUILD_AZURE)
message(STATUS "Building PyArrow Azure")
if(NOT ARROW_AZURE)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _azurefs)
endif()

if(PYARROW_BUILD_GCS)
message(STATUS "Building PyArrow GCS")
if(NOT ARROW_GCS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _gcsfs)
endif()

if(PYARROW_BUILD_S3)
message(STATUS "Building PyArrow S3")
if(NOT ARROW_S3)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _s3fs)
endif()

if(PYARROW_BUILD_HDFS)
message(STATUS "Building PyArrow HDFS")
if(NOT ARROW_HDFS)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
endif()
list(APPEND CYTHON_EXTENSIONS _hdfs)
endif()

if(PYARROW_BUILD_CUDA)
# Arrow CUDA
message(STATUS "Building PyArrow CUDA")
if(NOT ARROW_CUDA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON")
endif()
Expand Down Expand Up @@ -646,8 +674,9 @@ if(PYARROW_BUILD_PARQUET)
endif()
endif()

# ORC
if(PYARROW_BUILD_ORC)
# ORC
message(STATUS "Building PyArrow ORC")
if(NOT ARROW_ORC)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON")
endif()
Expand Down Expand Up @@ -679,6 +708,7 @@ endif()

# Substrait
if(PYARROW_BUILD_SUBSTRAIT)
message(STATUS "Building PyArrow Substrait")
if(NOT ARROW_SUBSTRAIT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON")
endif()
Expand All @@ -696,6 +726,7 @@ endif()

# Gandiva
if(PYARROW_BUILD_GANDIVA)
message(STATUS "Building PyArrow Gandiva")
if(NOT ARROW_GANDIVA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON")
endif()
Expand Down
118 changes: 21 additions & 97 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,48 +152,13 @@ def initialize_options(self):
if not hasattr(sys, 'gettotalrefcount'):
self.build_type = 'release'

self.with_azure = strtobool(
os.environ.get('PYARROW_WITH_AZURE', '0'))
self.with_gcs = strtobool(
os.environ.get('PYARROW_WITH_GCS', '0'))
self.with_s3 = strtobool(
os.environ.get('PYARROW_WITH_S3', '0'))
self.with_hdfs = strtobool(
os.environ.get('PYARROW_WITH_HDFS', '0'))
self.with_cuda = strtobool(
os.environ.get('PYARROW_WITH_CUDA', '0'))
self.with_substrait = strtobool(
os.environ.get('PYARROW_WITH_SUBSTRAIT', '0'))
self.with_flight = strtobool(
os.environ.get('PYARROW_WITH_FLIGHT', '0'))
self.with_acero = strtobool(
os.environ.get('PYARROW_WITH_ACERO', '0'))
self.with_dataset = strtobool(
os.environ.get('PYARROW_WITH_DATASET', '0'))
self.with_parquet = strtobool(
os.environ.get('PYARROW_WITH_PARQUET', '0'))
self.with_parquet_encryption = strtobool(
os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0'))
self.with_orc = strtobool(
os.environ.get('PYARROW_WITH_ORC', '0'))
self.with_gandiva = strtobool(
os.environ.get('PYARROW_WITH_GANDIVA', '0'))
self.generate_coverage = strtobool(
os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
self.bundle_arrow_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0'))
self.bundle_cython_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0'))

self.with_parquet_encryption = (self.with_parquet_encryption and
self.with_parquet)

# enforce module dependencies
if self.with_substrait:
self.with_dataset = True
if self.with_dataset:
self.with_acero = True

CYTHON_MODULE_NAMES = [
'lib',
'_fs',
Expand Down Expand Up @@ -270,23 +235,28 @@ def append_cmake_bool(value, varname):
cmake_options.append('-D{0}={1}'.format(
varname, 'on' if value else 'off'))

def maybe_append_cmake_bool(varname):
env_var = varname.replace("PYARROW_BUILD", "PYARROW_WITH")
if env_var in os.environ:
value = strtobool(os.environ.get(env_var))
append_cmake_bool(value, varname)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about doing this in CMakeLists.txt too?
We can refer an environment variable by "$ENV{PYARROW_WITH_CUDA} in CMakeLists.txt.
See also: https://cmake.org/cmake/help/latest/variable/ENV.html

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did that initially in a first version locally, but then moved it again to setup.py to avoid the same problem with it being cached. But with you suggestion above to use an AUTO default, then I can indeed move it back to the cmake.


if self.cmake_generator:
cmake_options += ['-G', self.cmake_generator]

append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA')
append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT')
append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT')
append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA')
append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO')
append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET')
append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC')
append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET')
append_cmake_bool(self.with_parquet_encryption,
'PYARROW_BUILD_PARQUET_ENCRYPTION')
append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS')
append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3')
append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS')
maybe_append_cmake_bool('PYARROW_BUILD_CUDA')
maybe_append_cmake_bool('PYARROW_BUILD_SUBSTRAIT')
maybe_append_cmake_bool('PYARROW_BUILD_FLIGHT')
maybe_append_cmake_bool('PYARROW_BUILD_GANDIVA')
maybe_append_cmake_bool('PYARROW_BUILD_ACERO')
maybe_append_cmake_bool('PYARROW_BUILD_DATASET')
maybe_append_cmake_bool('PYARROW_BUILD_ORC')
maybe_append_cmake_bool('PYARROW_BUILD_PARQUET')
maybe_append_cmake_bool('PYARROW_BUILD_PARQUET_ENCRYPTION')
maybe_append_cmake_bool('PYARROW_BUILD_AZURE')
maybe_append_cmake_bool('PYARROW_BUILD_GCS')
maybe_append_cmake_bool('PYARROW_BUILD_S3')
maybe_append_cmake_bool('PYARROW_BUILD_HDFS')
append_cmake_bool(self.bundle_arrow_cpp,
'PYARROW_BUNDLE_ARROW_CPP')
append_cmake_bool(self.bundle_cython_cpp,
Expand Down Expand Up @@ -329,54 +299,8 @@ def append_cmake_bool(value, varname):
self._found_names = []
for name in self.CYTHON_MODULE_NAMES:
built_path = pjoin(install_prefix, name + ext_suffix)
if not os.path.exists(built_path):
print(f'Did not find {built_path}')
if self._failure_permitted(name):
print(f'Cython module {name} failure permitted')
continue
raise RuntimeError('PyArrow C-extension failed to build:',
os.path.abspath(built_path))

self._found_names.append(name)

def _failure_permitted(self, name):
if name == '_parquet' and not self.with_parquet:
return True
if name == '_parquet_encryption' and not self.with_parquet_encryption:
return True
if name == '_orc' and not self.with_orc:
return True
if name == '_flight' and not self.with_flight:
return True
if name == '_substrait' and not self.with_substrait:
return True
if name == '_azurefs' and not self.with_azure:
return True
if name == '_gcsfs' and not self.with_gcs:
return True
if name == '_s3fs' and not self.with_s3:
return True
if name == '_hdfs' and not self.with_hdfs:
return True
if name == '_dataset' and not self.with_dataset:
return True
if name == '_acero' and not self.with_acero:
return True
if name == '_exec_plan' and not self.with_acero:
return True
if name == '_dataset_orc' and not (
self.with_orc and self.with_dataset
):
return True
if name == '_dataset_parquet' and not (
self.with_parquet and self.with_dataset
):
return True
if name == '_cuda' and not self.with_cuda:
return True
if name == 'gandiva' and not self.with_gandiva:
return True
return False
if os.path.exists(built_path):
self._found_names.append(name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to prepare self._found_names?
It seems that nobody uses it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used in get_names -> get_outputs methods. We don't use those ourselves, but I am assuming this might be part of the setuptools build_ext class interface

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. It seems that the setuptools build_ext doesn't have get_names interface:

>>> import setuptools.command.build_ext
>>> setuptools.command.build_ext.build_ext
<class 'setuptools.command.build_ext.build_ext'>
>>> 'get_names' in dir(setuptools.command.build_ext.build_ext)
False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does have get_outputs:

In [9]: 'get_outputs' in dir(setuptools.command.build_ext.build_ext)
Out[9]: True

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, sorry. I checked wrong name...


def _get_build_dir(self):
# Get the package directory from build_py
Expand Down