Skip to content

Commit

Permalink
Implement DataArray.to_dask_dataframe() (#7635)
Browse files Browse the repository at this point in the history
* Add feature  to convert dataarray to dask dataframe. This is for the issue #7409

* Add test for new method  dataarray.to_dask_dataframe()

* Changes after review

* Corrections in docstring and import

* docstring correction

* Remove name parameter

* Add feature  to convert dataarray to dask dataframe. This is for the issue #7409

* Add test for new method  dataarray.to_dask_dataframe()

* Changes after review

* Corrections in docstring and import

* docstring correction

* Remove name parameter

* Corrected doc/whats-new.rst

* Update whats-new.rst

* Space corrections in docstring

* Whitespace correction in docstring

* Add white space in docstring line

* Whitespace correction

* Update line npartitions=1

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Revert "Update line npartitions=1"

This reverts commit 4bae82c.

Reverting commit .

* Add whitespace in npartitions=1

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Change example in docstring

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Change example in docstring

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update xarray/core/dataarray.py

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* Update doc/whats-new.rst

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* Add name check

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add test for unnamed dataarray.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Remove scalar array test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Change error message

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update xarray/tests/test_dataarray.py

* Update whats-new.rst

* Update doc/whats-new.rst

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
  • Loading branch information
4 people committed Apr 28, 2023
1 parent 087ebbb commit 25d9a28
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ DataArray methods
DataArray.from_iris
DataArray.from_series
DataArray.to_cdms2
DataArray.to_dask_dataframe
DataArray.to_dataframe
DataArray.to_dataset
DataArray.to_dict
Expand Down
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ v2023.05.0 (unreleased)

New Features
~~~~~~~~~~~~
- Added new method :py:meth:`DataArray.to_dask_dataframe` to convert a DataArray into a dask dataframe (:issue:`7409`, :pull:`7635`).
By `Deeksha <https://github.com/dsgreen2>`_.
- Add support for lshift and rshift binary operators (`<<`, `>>`) on
:py:class:`xr.DataArray` of type :py:class:`int` (:issue:`7727` , :pull:`7741`).
By `Alan Brammer <https://github.com/abrammer>`_.
Expand Down
69 changes: 69 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@

from numpy.typing import ArrayLike

try:
from dask.dataframe import DataFrame as DaskDataFrame
except ImportError:
DaskDataFrame = None # type: ignore
try:
from dask.delayed import Delayed
except ImportError:
Expand Down Expand Up @@ -6888,6 +6892,71 @@ def resample(
**indexer_kwargs,
)

def to_dask_dataframe(
    self,
    dim_order: Sequence[Hashable] | None = None,
    set_index: bool = False,
) -> DaskDataFrame:
    """Convert this array into a dask.dataframe.DataFrame.

    The array is first wrapped in a single-variable Dataset (keyed by the
    array's name) and the conversion is delegated to
    ``Dataset.to_dask_dataframe``.

    Parameters
    ----------
    dim_order : Sequence of Hashable or None, optional
        Hierarchical dimension order for the resulting dataframe.
        Array content is transposed to this order and then written out as flat
        vectors in contiguous order, so the last dimension in this list
        will be contiguous in the resulting DataFrame. This has a major influence
        on which operations are efficient on the resulting dask dataframe.
    set_index : bool, default: False
        If set_index=True, the dask DataFrame is indexed by this array's
        coordinate. Since dask DataFrames do not support multi-indexes,
        set_index only works if the array only contains one dimension.

    Returns
    -------
    dask.dataframe.DataFrame

    Raises
    ------
    ValueError
        If the array has no name: the name is needed as the label of the
        data column. Errors raised by the underlying Dataset conversion
        (for example a ``dim_order`` that does not match the array's
        dimensions) propagate unchanged.

    Examples
    --------
    >>> da = xr.DataArray(
    ...     np.arange(4 * 2 * 2).reshape(4, 2, 2),
    ...     dims=("time", "lat", "lon"),
    ...     coords={
    ...         "time": np.arange(4),
    ...         "lat": [-30, -20],
    ...         "lon": [120, 130],
    ...     },
    ...     name="eg_dataarray",
    ...     attrs={"units": "Celsius", "description": "Random temperature data"},
    ... )
    >>> da.to_dask_dataframe(["lat", "lon", "time"]).compute()
        lat  lon  time  eg_dataarray
    0   -30  120     0             0
    1   -30  120     1             4
    2   -30  120     2             8
    3   -30  120     3            12
    4   -30  130     0             1
    5   -30  130     1             5
    6   -30  130     2             9
    7   -30  130     3            13
    8   -20  120     0             2
    9   -20  120     1             6
    10  -20  120     2            10
    11  -20  120     3            14
    12  -20  130     0             3
    13  -20  130     1             7
    14  -20  130     2            11
    15  -20  130     3            15
    """
    # Bind the name once so the ``None`` check and the later use refer to the
    # same value (and so type checkers can narrow it to a non-None Hashable).
    name = self.name
    if name is None:
        raise ValueError(
            "Cannot convert an unnamed DataArray to a "
            "dask dataframe : use the ``.rename`` method to assign a name."
        )
    # Wrap the array in a one-variable Dataset and reuse its dask conversion.
    ds = self._to_dataset_whole(name, shallow_copy=False)
    return ds.to_dask_dataframe(dim_order, set_index)

# This assignment rebinds the name `str`, so it needs to stay at the end;
# otherwise mypy will confuse later uses of `str` with this accessor.
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = utils.UncachedAccessor(StringAccessor["DataArray"])
33 changes: 33 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3245,6 +3245,39 @@ def test_to_dataframe_0length(self) -> None:
assert len(actual) == 0
assert_array_equal(actual.index.names, list("ABC"))

@requires_dask
def test_to_dask_dataframe(self) -> None:
    # Fixture: a named 3x4 integer array with dimensions "B" (outer) and "A" (inner).
    values_2d = np.arange(3 * 4).reshape(3, 4)
    coords = [("B", [1, 2, 3]), ("A", list("cdef"))]
    da = DataArray(values_2d, coords, name="foo")

    # With the default dimension order the data column must agree with the
    # pandas Series conversion.
    series = da.to_series()
    dask_column = da.to_dask_dataframe()["foo"]
    assert_array_equal(dask_column.values, series.values)

    # With an explicit dimension order the data column follows the
    # transposed array.
    reordered = da.to_dask_dataframe(dim_order=["A", "B"])["foo"]
    assert_array_equal(values_2d.transpose().reshape(-1), reordered.values)

    # regression test for coords with different dimensions
    da.coords["C"] = ("B", [-1, -2, -3])
    wanted_columns = ["C", "foo"]
    expected_frame = da.to_series().to_frame()
    expected_frame["C"] = [-1] * 4 + [-2] * 4 + [-3] * 4
    expected_frame = expected_frame[wanted_columns]
    actual_frame = da.to_dask_dataframe()[wanted_columns]

    assert_array_equal(expected_frame.values, actual_frame.values)
    assert_array_equal(expected_frame.columns.values, actual_frame.columns.values)

    # "C" is a coordinate, not a dimension, so it is rejected in dim_order.
    with pytest.raises(ValueError, match="does not match the set of dimensions"):
        da.to_dask_dataframe(dim_order=["B", "A", "C"])

    # An unnamed array cannot be converted.
    da.name = None
    with pytest.raises(ValueError, match="Cannot convert an unnamed DataArray"):
        da.to_dask_dataframe()

def test_to_pandas_name_matches_coordinate(self) -> None:
# coordinate with same name as array
arr = DataArray([1, 2, 3], dims="x", name="x")
Expand Down

0 comments on commit 25d9a28

Please sign in to comment.