Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pd.eval: Series names are now preserved even for "numexpr" engine. #58437

Merged
merged 4 commits into from Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Expand Up @@ -465,6 +465,7 @@ Styler
Other
^^^^^
- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the use of the ``tan`` function. (:issue:`55091`)
Expand Down
19 changes: 13 additions & 6 deletions pandas/core/computation/align.py
Expand Up @@ -160,19 +160,24 @@ def align_terms(terms):
# can't iterate so it must just be a constant or single variable
if isinstance(terms.value, (ABCSeries, ABCDataFrame)):
typ = type(terms.value)
return typ, _zip_axes_from_type(typ, terms.value.axes)
return np.result_type(terms.type), None
name = terms.value.name if isinstance(terms.value, ABCSeries) else None
return typ, _zip_axes_from_type(typ, terms.value.axes), name
return np.result_type(terms.type), None, None

# if all resolved variables are numeric scalars
if all(term.is_scalar for term in terms):
return result_type_many(*(term.value for term in terms)).type, None
return result_type_many(*(term.value for term in terms)).type, None, None

# if all input series have a common name, propagate it to the returned series
names = {term.value.name for term in terms if isinstance(term.value, ABCSeries)}
name = names.pop() if len(names) == 1 else None

# perform the main alignment
typ, axes = _align_core(terms)
return typ, axes
return typ, axes, name


def reconstruct_object(typ, obj, axes, dtype):
def reconstruct_object(typ, obj, axes, dtype, name):
"""
Reconstruct an object given its type, raw value, and possibly empty
(None) axes.
Expand Down Expand Up @@ -200,7 +205,9 @@ def reconstruct_object(typ, obj, axes, dtype):
res_t = np.result_type(obj.dtype, dtype)

if not isinstance(typ, partial) and issubclass(typ, PandasObject):
return typ(obj, dtype=res_t, **axes)
if name is None:
return typ(obj, dtype=res_t, **axes)
return typ(obj, dtype=res_t, name=name, **axes)
Comment on lines +209 to +210
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't `name=None` also be OK? I.e., can't this always be `return typ(obj, dtype=res_t, name=name, **axes)`?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, no. In some cases `typ` is `DataFrame` (in particular, cases like `pd.eval("df + df")`), in which case passing anything as `name=` (including `None`) would raise a `TypeError`.


# special case for pathological things like ~True/~False
if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
Expand Down
11 changes: 9 additions & 2 deletions pandas/core/computation/engines.py
Expand Up @@ -54,6 +54,7 @@ def __init__(self, expr) -> None:
self.expr = expr
self.aligned_axes = None
self.result_type = None
self.result_name = None

def convert(self) -> str:
"""
Expand All @@ -76,12 +77,18 @@ def evaluate(self) -> object:
The result of the passed expression.
"""
if not self._is_aligned:
self.result_type, self.aligned_axes = align_terms(self.expr.terms)
self.result_type, self.aligned_axes, self.result_name = align_terms(
self.expr.terms
)

# make sure no names in resolvers and locals/globals clash
res = self._evaluate()
return reconstruct_object(
self.result_type, res, self.aligned_axes, self.expr.terms.return_type
self.result_type,
res,
self.aligned_axes,
self.expr.terms.return_type,
self.result_name,
)

@property
Expand Down
43 changes: 24 additions & 19 deletions pandas/tests/computation/test_eval.py
Expand Up @@ -737,6 +737,17 @@ def test_and_logic_string_match(self):
assert pd.eval(f"{event.str.match('hello').a}")
assert pd.eval(f"{event.str.match('hello').a and event.str.match('hello').a}")

def test_eval_keep_name(self, engine, parser):
# Adding column "a" to itself must yield a Series that is still named "a",
# for every engine/parser combination supplied by the fixtures.
df = Series([2, 15, 28], name="a").to_frame()
res = df.eval("a + a", engine=engine, parser=parser)
expected = Series([4, 30, 56], name="a")
tm.assert_series_equal(expected, res)

def test_eval_unmatching_names(self, engine, parser):
# The Python variable is called ``variable_name`` while the Series itself is
# named "series_name"; the result is expected to carry the Series' own name.
variable_name = Series([42], name="series_name")
res = pd.eval("variable_name + 0", engine=engine, parser=parser)
# Adding 0 leaves the values unchanged, so the input doubles as the expected result.
tm.assert_series_equal(variable_name, res)


# -------------------------------------
# gh-12388: Typecasting rules consistency with python
Expand Down Expand Up @@ -1269,14 +1280,12 @@ def test_assignment_explicit(self):
expected["c"] = expected["a"] + expected["b"]
tm.assert_frame_equal(df, expected)

def test_column_in(self):
def test_column_in(self, engine):
# GH 11235
df = DataFrame({"a": [11], "b": [-32]})
result = df.eval("a in [11, -32]")
expected = Series([True])
# TODO: 2022-01-29: Name check failed with numexpr 2.7.3 in CI
# but cannot reproduce locally
tm.assert_series_equal(result, expected, check_names=False)
result = df.eval("a in [11, -32]", engine=engine)
expected = Series([True], name="a")
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.")
def test_assignment_not_inplace(self):
Expand Down Expand Up @@ -1505,7 +1514,7 @@ def test_date_boolean(self, engine, parser):
parser=parser,
)
expec = df.dates1 < "20130101"
tm.assert_series_equal(res, expec, check_names=False)
tm.assert_series_equal(res, expec)

def test_simple_in_ops(self, engine, parser):
if parser != "python":
Expand Down Expand Up @@ -1620,7 +1629,7 @@ def test_unary_functions(self, fn, engine, parser):
got = self.eval(expr, engine=engine, parser=parser)
with np.errstate(all="ignore"):
expect = getattr(np, fn)(a)
tm.assert_series_equal(got, expect, check_names=False)
tm.assert_series_equal(got, expect)

@pytest.mark.parametrize("fn", _binary_math_ops)
def test_binary_functions(self, fn, engine, parser):
Expand All @@ -1637,7 +1646,7 @@ def test_binary_functions(self, fn, engine, parser):
got = self.eval(expr, engine=engine, parser=parser)
with np.errstate(all="ignore"):
expect = getattr(np, fn)(a, b)
tm.assert_almost_equal(got, expect, check_names=False)
tm.assert_almost_equal(got, expect)

def test_df_use_case(self, engine, parser):
df = DataFrame(
Expand All @@ -1653,8 +1662,8 @@ def test_df_use_case(self, engine, parser):
inplace=True,
)
got = df.e
expect = np.arctan2(np.sin(df.a), df.b)
tm.assert_series_equal(got, expect, check_names=False)
expect = np.arctan2(np.sin(df.a), df.b).rename("e")
tm.assert_series_equal(got, expect)

def test_df_arithmetic_subexpression(self, engine, parser):
df = DataFrame(
Expand All @@ -1665,8 +1674,8 @@ def test_df_arithmetic_subexpression(self, engine, parser):
)
df.eval("e = sin(a + b)", engine=engine, parser=parser, inplace=True)
got = df.e
expect = np.sin(df.a + df.b)
tm.assert_series_equal(got, expect, check_names=False)
expect = np.sin(df.a + df.b).rename("e")
tm.assert_series_equal(got, expect)

@pytest.mark.parametrize(
"dtype, expect_dtype",
Expand All @@ -1690,10 +1699,10 @@ def test_result_types(self, dtype, expect_dtype, engine, parser):
assert df.a.dtype == dtype
df.eval("b = sin(a)", engine=engine, parser=parser, inplace=True)
got = df.b
expect = np.sin(df.a)
expect = np.sin(df.a).rename("b")
assert expect.dtype == got.dtype
assert expect_dtype == got.dtype
tm.assert_series_equal(got, expect, check_names=False)
tm.assert_series_equal(got, expect)

def test_undefined_func(self, engine, parser):
df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)})
Expand Down Expand Up @@ -1898,10 +1907,6 @@ def test_equals_various(other):
df = DataFrame({"A": ["a", "b", "c"]}, dtype=object)
result = df.eval(f"A == {other}")
expected = Series([False, False, False], name="A")
if USE_NUMEXPR:
# https://github.com/pandas-dev/pandas/issues/10239
# lose name with numexpr engine. Remove when that's fixed.
expected.name = None
tm.assert_series_equal(result, expected)


Expand Down
16 changes: 10 additions & 6 deletions pandas/tests/frame/test_query_eval.py
Expand Up @@ -58,26 +58,26 @@ def test_query_default(self, df, expected1, expected2):
result = df.query("A>0")
tm.assert_frame_equal(result, expected1)
result = df.eval("A+1")
tm.assert_series_equal(result, expected2, check_names=False)
tm.assert_series_equal(result, expected2)

def test_query_None(self, df, expected1, expected2):
result = df.query("A>0", engine=None)
tm.assert_frame_equal(result, expected1)
result = df.eval("A+1", engine=None)
tm.assert_series_equal(result, expected2, check_names=False)
tm.assert_series_equal(result, expected2)

def test_query_python(self, df, expected1, expected2):
result = df.query("A>0", engine="python")
tm.assert_frame_equal(result, expected1)
result = df.eval("A+1", engine="python")
tm.assert_series_equal(result, expected2, check_names=False)
tm.assert_series_equal(result, expected2)

def test_query_numexpr(self, df, expected1, expected2):
if NUMEXPR_INSTALLED:
result = df.query("A>0", engine="numexpr")
tm.assert_frame_equal(result, expected1)
result = df.eval("A+1", engine="numexpr")
tm.assert_series_equal(result, expected2, check_names=False)
tm.assert_series_equal(result, expected2)
else:
msg = (
r"'numexpr' is not installed or an unsupported version. "
Expand Down Expand Up @@ -194,8 +194,12 @@ def test_using_numpy(self, engine, parser):
df = Series([0.2, 1.5, 2.8], name="a").to_frame()
res = df.eval("@np.floor(a)", engine=engine, parser=parser)
expected = np.floor(df["a"])
if engine == "numexpr":
expected.name = None # See GH 58069
tm.assert_series_equal(expected, res)

def test_eval_simple(self, engine, parser):
# Evaluating a bare column reference should give back that column unchanged,
# including its name ("a").
df = Series([0.2, 1.5, 2.8], name="a").to_frame()
res = df.eval("a", engine=engine, parser=parser)
expected = df["a"]
tm.assert_series_equal(expected, res)


Expand Down