
Comparing changes

base repository: pandas-dev/pandas, base: v2.2.1
head repository: pandas-dev/pandas, compare: v2.2.2

Commits on Mar 1, 2024

  1. Backport PR #57689 on branch 2.2.x (CI: fix ci (calamine typing)) (#57692)

    Backport PR #57689: CI: fix ci (calamine typing)
    MarcoGorelli authored Mar 1, 2024
    SHA: 9a07184

Commits on Mar 3, 2024

  1. Backport PR #57668 on branch 2.2.x (CLN: More numpy 2 stuff) (#57707)

    Backport PR #57668: CLN: More numpy 2 stuff
    
    Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
    meeseeksmachine and lithomas1 authored Mar 3, 2024
    SHA: 4ac5ee2

Commits on Mar 4, 2024

  1. SHA: 6db283c
  2. Backport PR #57721 on branch 2.2.x (update from 2022 to 2024 image) (#57729)

    Backport PR #57721: update from 2022 to 2024 image

    Co-authored-by: Thomas Baumann <thbaumann90@gmail.com>
    meeseeksmachine and lopof authored Mar 4, 2024
    SHA: 3cc5afa

Commits on Mar 6, 2024

  1. Backport PR #57172: MAINT: Adjust the codebase to the new np.array's copy keyword meaning (#57740)

    Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com>
    mroeschke and mtsokol authored Mar 6, 2024
    SHA: 301f914

Commits on Mar 7, 2024

  1. Backport PR #57759 on branch 2.2.x (DOC: add whatsnew for v2.2.2) (#57763)

    * Backport PR #57759: DOC: add whatsnew for v2.2.2
    * [skip-ci]

    Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>
    Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
    3 people authored Mar 7, 2024
    SHA: 63b9eba

Commits on Mar 8, 2024

  1. Backport PR #57665 on branch 2.2.x (BUG: interchange protocol with nullable datatypes a non-null validity) (#57769)

    BUG: interchange protocol with nullable datatypes a non-null validity (#57665)

    * BUG: interchange protocol with nullable datatypes a non-null validity provides nonsense results
    * whatsnew
    * 🏷️ typing
    * parametrise over more types
    * move whatsnew

    (cherry picked from commit 03717bc)
    MarcoGorelli authored Mar 8, 2024
    SHA: e44f91d
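    For context, a minimal sketch of the interchange API this fix concerns. The nullable Int64 column is an illustrative assumption; its missing-value mask is the "validity" buffer named in the title.

    ```python
    import pandas as pd
    from pandas.api.interchange import from_dataframe

    # A nullable-dtype column; its NA mask is the validity buffer the fix concerns.
    df = pd.DataFrame({"a": pd.array([1, None, 3], dtype="Int64")})

    # Round-trip through the DataFrame interchange protocol.
    result = from_dataframe(df.__dataframe__())
    print(result)
    ```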
  2. Backport PR #57780 on branch 2.2.x (COMPAT: Adapt to Numpy 2.0 dtype changes) (#57784)

    Backport PR #57780: COMPAT: Adapt to Numpy 2.0 dtype changes

    Co-authored-by: Sebastian Berg <sebastianb@nvidia.com>
    meeseeksmachine and seberg authored Mar 8, 2024
    SHA: d600189

Commits on Mar 12, 2024

  1. Backport PR #57821 on branch 2.2.x (Fix doc build) (#57822)

    Backport PR #57821: Fix doc build
    
    Co-authored-by: Trinh Quoc Anh <trinhquocanh94@gmail.com>
    meeseeksmachine and tqa236 authored Mar 12, 2024
    SHA: 33006cd

Commits on Mar 13, 2024

  1. Backport PR #57830 on branch 2.2.x (DOC: Pin dask/dask-expr for scale.rst) (#57832)

    Backport PR #57830: DOC: Pin dask/dask-expr for scale.rst

    Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
    meeseeksmachine and mroeschke authored Mar 13, 2024
    SHA: 9ed5382

Commits on Mar 14, 2024

  1. Backport PR #57796 on branch 2.2.x (Fix issue with Tempita recompilation) (#57834)

    Backport PR #57796: Fix issue with Tempita recompilation

    Co-authored-by: William Ayd <will_ayd@innobi.io>
    meeseeksmachine and WillAyd authored Mar 14, 2024
    SHA: 4fdbe56

Commits on Mar 15, 2024

  1. Backport PR #57848 on branch 2.2.x (DOC: Remove duplicated Series.dt.normalize from docs) (#57854)

    Backport PR #57848: DOC: Remove duplicated Series.dt.normalize from docs

    Co-authored-by: Marc Garcia <garcia.marc@gmail.com>
    meeseeksmachine and datapythonista authored Mar 15, 2024
    SHA: b6488af
  2. Backport PR #57843: DOC: Remove Dask and Modin sections in scale.rst in favor of linking to ecosystem docs. (#57861)

    Co-authored-by: Yuki Kitayama <47092819+yukikitayama@users.noreply.github.com>
    mroeschke and yukikitayama authored Mar 15, 2024
    SHA: 962e233

Commits on Mar 18, 2024

  1. Backport PR #57883 on branch 2.2.x (Bump pypa/cibuildwheel from 2.16.5 to 2.17.0) (#57888)

    Backport PR #57883: Bump pypa/cibuildwheel from 2.16.5 to 2.17.0

    Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
    meeseeksmachine and dependabot[bot] authored Mar 18, 2024
    SHA: cd6eeae
  2. Backport PR #57892 on branch 2.2.x (CI: xfail Pyarrow slicing test) (#57898)

    Backport PR #57892: CI: xfail Pyarrow slicing test

    Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
    meeseeksmachine and mroeschke authored Mar 18, 2024
    SHA: 71a6797
  3. Backport PR #57889 on branch 2.2.x (BUG: Handle Series construction with Dask, dict-like, Series) (#57899)

    Backport PR #57889: BUG: Handle Series construction with Dask, dict-like, Series

    Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
    meeseeksmachine and mroeschke authored Mar 18, 2024
    SHA: cc56321
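    A minimal sketch of the constructor path this fix concerns; a plain mapping stands in here for the dict-like (e.g. Dask) objects named in the title.

    ```python
    import pandas as pd

    # Constructing a Series from a dict-like: keys become the index, and an
    # explicit index selects values by key.
    data = {"a": 1.0, "b": 2.0, "c": 3.0}
    s = pd.Series(data, index=["c", "a"])
    print(s)  # c -> 3.0, a -> 1.0
    ```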

Commits on Mar 19, 2024

  1. Backport PR #57905 on branch 2.2.x (Revert "Fix issue with Tempita recompilation (#57796)") (#57907)

    Backport PR #57905: Revert "Fix issue with Tempita recompilation (#57796)"

    Co-authored-by: William Ayd <will_ayd@innobi.io>
    meeseeksmachine and WillAyd authored Mar 19, 2024
    SHA: 83497f5
  2. Backport PR #57886 on branch 2.2.x (CI: Remove ASAN job) (#57910)

    Backport PR #57886: CI: Remove ASAN job
    
    Co-authored-by: William Ayd <will_ayd@innobi.io>
    meeseeksmachine and WillAyd authored Mar 19, 2024
    SHA: 2a6d800

Commits on Mar 21, 2024

  1. Backport PR #57029 on branch 2.2.x (DOC: Add DataFrame.to_numpy method) (#57940)

    Backport PR #57029: DOC: Add `DataFrame.to_numpy` method

    Co-authored-by: Zhengbo Wang <77875500+luke396@users.noreply.github.com>
    meeseeksmachine and luke396 authored Mar 21, 2024
    SHA: 78f7a02
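    The newly documented method in a small sketch; the values and dtype argument are illustrative.

    ```python
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": [1, 2], "y": [3.0, 4.0]})

    # Mixed integer/float columns upcast to a common dtype (float64 here).
    arr = df.to_numpy()
    print(arr.dtype)  # float64

    # An explicit result dtype can also be requested.
    print(df.to_numpy(dtype=np.int64))
    ```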
  2. Backport PR #57764 on branch 2.2.x (BUG: PyArrow dtypes were not supported in the interchange protocol) (#57947)

    MarcoGorelli authored Mar 21, 2024
    SHA: 7e8d492

Commits on Mar 27, 2024

  1. Backport PR #57548 on branch 2.2.x (Fix accidental loss-of-precision for to_datetime(str, unit=...)) (#58034)

    Backport PR #57548: Fix accidental loss-of-precision for to_datetime(str, unit=...)

    Co-authored-by: Elliott Sales de Andrade <quantum.analyst@gmail.com>
    meeseeksmachine and QuLogic authored Mar 27, 2024
    SHA: 40e621f
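    A sketch of the call shape the fix concerns: a string holding an integer epoch value passed with an explicit unit, where an intermediate float conversion previously could drop nanosecond precision. The epoch value is illustrative.

    ```python
    import pandas as pd

    # 2024-01-01 00:00:00 UTC plus one nanosecond, passed as a string.
    ts = pd.to_datetime("1704067200000000001", unit="ns")
    print(ts)  # expected: 2024-01-01 00:00:00.000000001
    ```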
  2. Backport PR #57758 on branch 2.2.x (BUG: DataFrame Interchange Protocol errors on Boolean columns) (#58036)

    Backport PR #57758: BUG: DataFrame Interchange Protocol errors on Boolean columns

    Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>
    meeseeksmachine and MarcoGorelli authored Mar 27, 2024
    SHA: e1a7302

Commits on Mar 28, 2024

  1. Backport PR #57974 on branch 2.2.x (BUG: Fixed ADBC to_sql creation of table when using public schema) (#58050)

    Backport PR #57974: BUG: Fixed ADBC to_sql creation of table when using public schema

    Co-authored-by: Shabab Karim <shababkarim93@gmail.com>
    meeseeksmachine and shabab477 authored Mar 28, 2024
    SHA: f455401
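    A hedged sketch of the affected call, assuming the ADBC PostgreSQL driver is installed; the connection URI and table name are placeholders.

    ```python
    import pandas as pd
    from adbc_driver_postgresql import dbapi  # assumed available

    df = pd.DataFrame({"a": [1, 2, 3]})

    # The fix concerns creating the table when an explicit schema such as
    # "public" is passed together with an ADBC connection.
    with dbapi.connect("postgresql://localhost:5432/postgres") as conn:
        df.to_sql("pandas_table", conn, schema="public", if_exists="replace", index=False)
    ```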

Commits on Apr 1, 2024

  1. Backport PR #57553 on branch 2.2.x (API: avoid passing Manager to subclass init) (#58008)

    * Backport PR #57553: API: avoid passing Manager to subclass __init__
    * whatsnew, type ignores
    * merge 2.2.2 file from main
    * rebase on 2.2.x whatsnew

    jbrockmendel authored Apr 1, 2024
    SHA: 810b2d0
  2. Backport PR #58075 on branch 2.2.x (DOC: whatsnew note for #57553) (#58080)

    Backport PR #58075: DOC: whatsnew note for #57553

    Co-authored-by: jbrockmendel <jbrockmendel@gmail.com>
    meeseeksmachine and jbrockmendel authored Apr 1, 2024
    SHA: 822d285

Commits on Apr 3, 2024

  1. SHA: e9b81ee
  2. Revert "BLD: Pin numpy on 2.2.x" (#58093)

    Revert "BLD: Pin numpy on 2.2.x (#56812)"
    
    This reverts commit 24ea67f.
    lithomas1 authored Apr 3, 2024
    SHA: 0f83d50
  3. Backport PR #58100 on branch 2.2.x (MNT: fix compatibility with beautifulsoup4 4.13.0b2) (#58137)

    Backport PR #58100: MNT: fix compatibility with beautifulsoup4 4.13.0b2

    Co-authored-by: Clément Robert <cr52@protonmail.com>
    meeseeksmachine and neutrinoceros authored Apr 3, 2024
    SHA: b56842d
  4. Backport PR #58138 on branch 2.2.x (BLD: Fix nightlies not building) (#58140)

    Backport PR #58138: BLD: Fix nightlies not building

    Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
    meeseeksmachine and lithomas1 authored Apr 3, 2024
    SHA: a947587

Commits on Apr 8, 2024

  1. Backport PR #58181 on branch 2.2.x (CI: correct error msg in test_view_index) (#58187)

    Backport PR #58181: CI: correct error msg in `test_view_index`
    natmokval authored Apr 8, 2024
    SHA: 691fc88

Commits on Apr 9, 2024

  1. Backport PR #58087 on branch 2.2.x (BLD: Build wheels using numpy 2.0rc1) (#58105)

    Backport PR #58087: BLD: Build wheels using numpy 2.0rc1

    Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
    meeseeksmachine and lithomas1 authored Apr 9, 2024
    SHA: c7ec566

Commits on Apr 10, 2024

  1. Backport PR #58203 on branch 2.2.x (DOC: Add release date/contributors for 2.2.2) (#58206)

    Backport PR #58203: DOC: Add release date/contributors for 2.2.2

    Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
    meeseeksmachine and lithomas1 authored Apr 10, 2024
    SHA: 45b0b32
  2. Backport PR #58202: DOC/TST: Document numpy 2.0 support and add tests for string array (#58208)

    Backport PR #58202: DOC/TST: Document numpy 2.0 support and add tests for string array
    lithomas1 authored Apr 10, 2024
    SHA: 5466f15
  3. SHA: 98aeac9
  4. RLS: 2.2.2

    Pandas Development Team authored and lithomas1 committed Apr 10, 2024
    SHA: d9cdd2e
Showing with 771 additions and 387 deletions.
  1. +2 −6 .circleci/config.yml
  2. +1 −8 .github/actions/run-tests/action.yml
  3. +0 −14 .github/workflows/unit-tests.yml
  4. +2 −15 .github/workflows/wheels.yml
  5. +2 −0 ci/deps/actions-310.yaml
  6. +2 −0 ci/deps/actions-311-downstream_compat.yaml
  7. +0 −32 ci/deps/actions-311-sanitizers.yaml
  8. +2 −0 ci/deps/actions-311.yaml
  9. +2 −0 ci/deps/actions-312.yaml
  10. +2 −0 ci/deps/actions-39-minimum_versions.yaml
  11. +2 −0 ci/deps/actions-39.yaml
  12. +2 −0 ci/deps/circle-310-arm64.yaml
  13. +1 −0 doc/source/reference/frame.rst
  14. +0 −1 doc/source/reference/series.rst
  15. +7 −157 doc/source/user_guide/scale.rst
  16. +1 −0 doc/source/whatsnew/index.rst
  17. +1 −1 doc/source/whatsnew/v2.2.1.rst
  18. +59 −0 doc/source/whatsnew/v2.2.2.rst
  19. +2 −0 environment.yml
  20. +4 −0 pandas/_libs/src/datetime/pd_datetime.c
  21. +8 −4 pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
  22. +1 −3 pandas/_libs/src/vendored/ujson/python/objToJSON.c
  23. +1 −1 pandas/_libs/tslib.pyx
  24. +2 −0 pandas/compat/__init__.py
  25. +2 −0 pandas/compat/pyarrow.py
  26. +3 −3 pandas/core/array_algos/quantile.py
  27. +3 −1 pandas/core/arrays/arrow/array.py
  28. +4 −1 pandas/core/arrays/base.py
  29. +3 −1 pandas/core/arrays/categorical.py
  30. +3 −1 pandas/core/arrays/datetimelike.py
  31. +3 −3 pandas/core/arrays/datetimes.py
  32. +3 −1 pandas/core/arrays/interval.py
  33. +3 −1 pandas/core/arrays/masked.py
  34. +10 −4 pandas/core/arrays/numeric.py
  35. +3 −1 pandas/core/arrays/numpy_.py
  36. +7 −2 pandas/core/arrays/period.py
  37. +3 −1 pandas/core/arrays/sparse/array.py
  38. +5 −2 pandas/core/arrays/timedeltas.py
  39. +11 −3 pandas/core/construction.py
  40. +5 −2 pandas/core/dtypes/cast.py
  41. +1 −1 pandas/core/dtypes/missing.py
  42. +30 −17 pandas/core/frame.py
  43. +4 −1 pandas/core/generic.py
  44. +1 −1 pandas/core/indexes/base.py
  45. +3 −3 pandas/core/indexes/multi.py
  46. +58 −0 pandas/core/interchange/buffer.py
  47. +72 −10 pandas/core/interchange/column.py
  48. +5 −0 pandas/core/interchange/dataframe.py
  49. +10 −7 pandas/core/interchange/from_dataframe.py
  50. +31 −0 pandas/core/interchange/utils.py
  51. +2 −0 pandas/core/internals/managers.py
  52. +2 −1 pandas/core/resample.py
  53. +27 −18 pandas/core/series.py
  54. +1 −3 pandas/io/excel/_calamine.py
  55. +1 −7 pandas/io/html.py
  56. +1 −1 pandas/io/pytables.py
  57. +3 −1 pandas/io/sql.py
  58. +1 −0 pandas/tests/arrays/integer/test_arithmetic.py
  59. +14 −9 pandas/tests/arrays/test_datetimelike.py
  60. +2 −2 pandas/tests/dtypes/test_inference.py
  61. +4 −1 pandas/tests/extension/array_with_attr/array.py
  62. +5 −3 pandas/tests/extension/json/array.py
  63. +4 −1 pandas/tests/extension/list/array.py
  64. +5 −3 pandas/tests/extension/test_common.py
  65. +1 −1 pandas/tests/frame/methods/test_select_dtypes.py
  66. +1 −1 pandas/tests/frame/test_arithmetic.py
  67. +19 −0 pandas/tests/frame/test_constructors.py
  68. +11 −0 pandas/tests/frame/test_subclass.py
  69. +11 −1 pandas/tests/indexes/object/test_indexing.py
  70. +4 −1 pandas/tests/indexes/test_base.py
  71. +1 −1 pandas/tests/indexes/test_index_new.py
  72. +189 −9 pandas/tests/interchange/test_impl.py
  73. +1 −1 pandas/tests/io/pytables/test_timezones.py
  74. +24 −0 pandas/tests/io/test_sql.py
  75. +1 −1 pandas/tests/scalar/timestamp/test_formats.py
  76. +19 −0 pandas/tests/series/test_constructors.py
  77. +8 −0 pandas/tests/tools/test_to_datetime.py
  78. +12 −10 pyproject.toml
  79. +1 −1 scripts/generate_pip_deps_from_conda.py
  80. +4 −1 scripts/validate_min_versions_in_sync.py
8 changes: 2 additions & 6 deletions .circleci/config.yml

@@ -3,7 +3,7 @@ version: 2.1
 jobs:
   test-arm:
     machine:
-      image: ubuntu-2004:2022.04.1
+      image: default
     resource_class: arm.large
     environment:
       ENV_FILE: ci/deps/circle-310-arm64.yaml
@@ -46,7 +46,7 @@ jobs:
       cibw-build:
         type: string
     machine:
-      image: ubuntu-2004:2022.04.1
+      image: default
     resource_class: arm.large
     environment:
       TRIGGER_SOURCE: << pipeline.trigger_source >>
@@ -72,10 +72,6 @@ jobs:
           no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that
           command: |
             pip3 install cibuildwheel==2.15.0
-            # When this is a nightly wheel build, allow picking up NumPy 2.0 dev wheels:
-            if [[ "$IS_SCHEDULE_DISPATCH" == "true" || "$IS_PUSH" != 'true' ]]; then
-              export CIBW_ENVIRONMENT="PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
-            fi
             cibuildwheel --prerelease-pythons --output-dir wheelhouse
           environment:
9 changes: 1 addition & 8 deletions .github/actions/run-tests/action.yml

@@ -1,16 +1,9 @@
 name: Run tests and report results
-inputs:
-  preload:
-    description: Preload arguments for sanitizer
-    required: false
-  asan_options:
-    description: Arguments for Address Sanitizer (ASAN)
-    required: false
 runs:
   using: composite
   steps:
     - name: Test
-      run: ${{ inputs.asan_options }} ${{ inputs.preload }} ci/run_tests.sh
+      run: ci/run_tests.sh
       shell: bash -el {0}

     - name: Publish test results
14 changes: 0 additions & 14 deletions .github/workflows/unit-tests.yml

@@ -96,14 +96,6 @@ jobs:
         - name: "Pyarrow Nightly"
           env_file: actions-311-pyarrownightly.yaml
           pattern: "not slow and not network and not single_cpu"
-        - name: "ASAN / UBSAN"
-          env_file: actions-311-sanitizers.yaml
-          pattern: "not slow and not network and not single_cpu and not skip_ubsan"
-          asan_options: "ASAN_OPTIONS=detect_leaks=0"
-          preload: LD_PRELOAD=$(gcc -print-file-name=libasan.so)
-          meson_args: --config-settings=setup-args="-Db_sanitize=address,undefined"
-          cflags_adds: -fno-sanitize-recover=all
-          pytest_workers: -1 # disable pytest-xdist as it swallows stderr from ASAN
       fail-fast: false
     name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
     env:
@@ -190,18 +182,12 @@ jobs:
     - name: Test (not single_cpu)
       uses: ./.github/actions/run-tests
       if: ${{ matrix.name != 'Pypy' }}
-      with:
-        preload: ${{ matrix.preload }}
-        asan_options: ${{ matrix.asan_options }}
       env:
         # Set pattern to not single_cpu if not already set
         PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }}

     - name: Test (single_cpu)
       uses: ./.github/actions/run-tests
-      with:
-        preload: ${{ matrix.preload }}
-        asan_options: ${{ matrix.asan_options }}
       env:
         PATTERN: 'single_cpu'
         PYTEST_WORKERS: 0
17 changes: 2 additions & 15 deletions .github/workflows/wheels.yml

@@ -139,27 +139,14 @@ jobs:
       shell: bash -el {0}
       run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

-    - name: Build normal wheels
-      if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }}
-      uses: pypa/cibuildwheel@v2.16.5
+    - name: Build wheels
+      uses: pypa/cibuildwheel@v2.17.0
       with:
         package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
       env:
         CIBW_PRERELEASE_PYTHONS: True
         CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}

-    - name: Build nightly wheels (with NumPy pre-release)
-      if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }}
-      uses: pypa/cibuildwheel@v2.16.5
-      with:
-        package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
-      env:
-        # The nightly wheels should be build witht he NumPy 2.0 pre-releases
-        # which requires the additional URL.
-        CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple
-        CIBW_PRERELEASE_PYTHONS: True
-        CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
-
     - name: Set up Python
       uses: mamba-org/setup-micromamba@v1
       with:
2 changes: 2 additions & 0 deletions ci/deps/actions-310.yaml

@@ -24,6 +24,8 @@ dependencies:

   # optional dependencies
   - beautifulsoup4>=4.11.2
+  # https://github.com/conda-forge/pytables-feedstock/issues/97
+  - c-blosc2=2.13.2
   - blosc>=1.21.3
   - bottleneck>=1.3.6
   - fastparquet>=2022.12.0
2 changes: 2 additions & 0 deletions ci/deps/actions-311-downstream_compat.yaml

@@ -26,6 +26,8 @@ dependencies:

   # optional dependencies
   - beautifulsoup4>=4.11.2
+  # https://github.com/conda-forge/pytables-feedstock/issues/97
+  - c-blosc2=2.13.2
   - blosc>=1.21.3
   - bottleneck>=1.3.6
   - fastparquet>=2022.12.0
32 changes: 0 additions & 32 deletions ci/deps/actions-311-sanitizers.yaml

This file was deleted.

2 changes: 2 additions & 0 deletions ci/deps/actions-311.yaml

@@ -24,6 +24,8 @@ dependencies:

   # optional dependencies
   - beautifulsoup4>=4.11.2
+  # https://github.com/conda-forge/pytables-feedstock/issues/97
+  - c-blosc2=2.13.2
   - blosc>=1.21.3
   - bottleneck>=1.3.6
   - fastparquet>=2022.12.0
2 changes: 2 additions & 0 deletions ci/deps/actions-312.yaml

@@ -24,6 +24,8 @@ dependencies:

   # optional dependencies
   - beautifulsoup4>=4.11.2
+  # https://github.com/conda-forge/pytables-feedstock/issues/97
+  - c-blosc2=2.13.2
   - blosc>=1.21.3
   - bottleneck>=1.3.6
   - fastparquet>=2022.12.0
2 changes: 2 additions & 0 deletions ci/deps/actions-39-minimum_versions.yaml

@@ -27,6 +27,8 @@ dependencies:

   # optional dependencies
   - beautifulsoup4=4.11.2
+  # https://github.com/conda-forge/pytables-feedstock/issues/97
+  - c-blosc2=2.13.2
   - blosc=1.21.3
   - bottleneck=1.3.6
   - fastparquet=2022.12.0
2 changes: 2 additions & 0 deletions ci/deps/actions-39.yaml

@@ -24,6 +24,8 @@ dependencies:

   # optional dependencies
   - beautifulsoup4>=4.11.2
+  # https://github.com/conda-forge/pytables-feedstock/issues/97
+  - c-blosc2=2.13.2
   - blosc>=1.21.3
   - bottleneck>=1.3.6
   - fastparquet>=2022.12.0
2 changes: 2 additions & 0 deletions ci/deps/circle-310-arm64.yaml

@@ -25,6 +25,8 @@ dependencies:

   # optional dependencies
   - beautifulsoup4>=4.11.2
+  # https://github.com/conda-forge/pytables-feedstock/issues/97
+  - c-blosc2=2.13.2
   - blosc>=1.21.3
   - bottleneck>=1.3.6
   - fastparquet>=2022.12.0
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst

@@ -49,6 +49,7 @@ Conversion
    DataFrame.infer_objects
    DataFrame.copy
    DataFrame.bool
+   DataFrame.to_numpy

Indexing, iteration
~~~~~~~~~~~~~~~~~~~
1 change: 0 additions & 1 deletion doc/source/reference/series.rst

@@ -342,7 +342,6 @@ Datetime properties
    Series.dt.tz
    Series.dt.freq
    Series.dt.unit
-   Series.dt.normalize

Datetime methods
^^^^^^^^^^^^^^^^
164 changes: 7 additions & 157 deletions doc/source/user_guide/scale.rst

@@ -156,7 +156,7 @@ fits in memory, you can work with datasets that are much larger than memory.

 Chunking works well when the operation you're performing requires zero or minimal
 coordination between chunks. For more complicated workflows, you're better off
-:ref:`using another library <scale.other_libraries>`.
+:ref:`using other libraries <scale.other_libraries>`.

 Suppose we have an even larger "logical dataset" on disk that's a directory of parquet
 files. Each file in the directory represents a different year of the entire dataset.
@@ -219,160 +219,10 @@ different library that implements these out-of-core algorithms for you.

 .. _scale.other_libraries:

-Use Dask
---------
+Use Other Libraries
+-------------------

 pandas is just one library offering a DataFrame API. Because of its popularity,
 pandas' API has become something of a standard that other libraries implement.
 The pandas documentation maintains a list of libraries implementing a DataFrame API
 in `the ecosystem page <https://pandas.pydata.org/community/ecosystem.html>`_.

-For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a
-pandas-like API for working with larger than memory datasets in parallel. Dask
-can use multiple threads or processes on a single machine, or a cluster of
-machines to process data in parallel.
-
-We'll import ``dask.dataframe`` and notice that the API feels similar to pandas.
-We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in.
-
-.. ipython:: python
-   :okwarning:
-
-   import dask.dataframe as dd
-   ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow")
-   ddf
-
-Inspecting the ``ddf`` object, we see a few things
-
-* There are familiar attributes like ``.columns`` and ``.dtypes``
-* There are familiar methods like ``.groupby``, ``.sum``, etc.
-* There are new attributes like ``.npartitions`` and ``.divisions``
-
-The partitions and divisions are how Dask parallelizes computation. A **Dask**
-DataFrame is made up of many pandas :class:`pandas.DataFrame`. A single method call on a
-Dask DataFrame ends up making many pandas method calls, and Dask knows how to
-coordinate everything to get the result.
-
-.. ipython:: python
-
-   ddf.columns
-   ddf.dtypes
-   ddf.npartitions
-
-One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the
-repr above, you'll notice that the values aren't actually printed out; just the
-column names and dtypes. That's because Dask hasn't actually read the data yet.
-Rather than executing immediately, doing operations build up a **task graph**.
-
-.. ipython:: python
-   :okwarning:
-
-   ddf
-   ddf["name"]
-   ddf["name"].value_counts()
-
-Each of these calls is instant because the result isn't being computed yet.
-We're just building up a list of computation to do when someone needs the
-result. Dask knows that the return type of a :class:`pandas.Series.value_counts`
-is a pandas :class:`pandas.Series` with a certain dtype and a certain name. So the Dask version
-returns a Dask Series with the same dtype and the same name.
-
-To get the actual result you can call ``.compute()``.
-
-.. ipython:: python
-   :okwarning:
-
-   %time ddf["name"].value_counts().compute()
-
-At that point, you get back the same thing you'd get with pandas, in this case
-a concrete pandas :class:`pandas.Series` with the count of each ``name``.
-
-Calling ``.compute`` causes the full task graph to be executed. This includes
-reading the data, selecting the columns, and doing the ``value_counts``. The
-execution is done *in parallel* where possible, and Dask tries to keep the
-overall memory footprint small. You can work with datasets that are much larger
-than memory, as long as each partition (a regular pandas :class:`pandas.DataFrame`) fits in memory.
-
-By default, ``dask.dataframe`` operations use a threadpool to do operations in
-parallel. We can also connect to a cluster to distribute the work on many
-machines. In this case we'll connect to a local "cluster" made up of several
-processes on this single machine.
-
-.. code-block:: python
-
-   >>> from dask.distributed import Client, LocalCluster
-   >>> cluster = LocalCluster()
-   >>> client = Client(cluster)
-   >>> client
-   <Client: 'tcp://127.0.0.1:53349' processes=4 threads=8, memory=17.18 GB>
-
-Once this ``client`` is created, all of Dask's computation will take place on
-the cluster (which is just processes in this case).
-
-Dask implements the most used parts of the pandas API. For example, we can do
-a familiar groupby aggregation.
-
-.. ipython:: python
-   :okwarning:
-
-   %time ddf.groupby("name")[["x", "y"]].mean().compute().head()
-
-The grouping and aggregation is done out-of-core and in parallel.
-
-When Dask knows the ``divisions`` of a dataset, certain optimizations are
-possible. When reading parquet datasets written by dask, the divisions will be
-known automatically. In this case, since we created the parquet files manually,
-we need to supply the divisions manually.
-
-.. ipython:: python
-   :okwarning:
-
-   N = 12
-   starts = [f"20{i:>02d}-01-01" for i in range(N)]
-   ends = [f"20{i:>02d}-12-13" for i in range(N)]
-   divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),)
-   ddf.divisions = divisions
-   ddf
-
-Now we can do things like fast random access with ``.loc``.
-
-.. ipython:: python
-   :okwarning:
-
-   ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute()
-
-Dask knows to just look in the 3rd partition for selecting values in 2002. It
-doesn't need to look at any other data.
-
-Many workflows involve a large amount of data and processing it in a way that
-reduces the size to something that fits in memory. In this case, we'll resample
-to daily frequency and take the mean. Once we've taken the mean, we know the
-results will fit in memory, so we can safely call ``compute`` without running
-out of memory. At that point it's just a regular pandas object.
-
-.. ipython:: python
-   :okwarning:
-
-   @savefig dask_resample.png
-   ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot()
-
-.. ipython:: python
-   :suppress:
-
-   import shutil
-   shutil.rmtree("data/timeseries")
-
-These Dask examples have all be done using multiple processes on a single
-machine. Dask can be `deployed on a cluster
-<https://docs.dask.org/en/latest/setup.html>`_ to scale up to even larger
-datasets.
-
-You see more dask examples at https://examples.dask.org.
-
-.. _Dask: https://dask.org
-.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html
+There are other libraries which provide similar APIs to pandas and work nicely with pandas DataFrame,
+and can give you the ability to scale your large dataset processing and analytics
+by parallel runtime, distributed memory, clustering, etc. You can find more information
+in `the ecosystem page <https://pandas.pydata.org/community/ecosystem.html#out-of-core>`_.
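
As an aside on the chunking guidance that scale.rst keeps, here is a minimal sketch of chunk-at-a-time processing with pandas alone; the file name, column, chunk size, and aggregation are illustrative assumptions.

```python
import pandas as pd

# Process a large CSV piece by piece: each chunk is an ordinary DataFrame,
# and per-chunk results are combined at the end.
counts = None
for chunk in pd.read_csv("data/timeseries.csv", chunksize=100_000):
    part = chunk["name"].value_counts()
    counts = part if counts is None else counts.add(part, fill_value=0)

if counts is not None:
    print(counts.sort_values(ascending=False).head())
```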