BUG/PERF: groupby.transform with unobserved categories (#58084)

pandas-dev · May 8, 2024 · 8d543ba · 8d543ba
1 parent ca55d77
commit 8d543ba
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 19 deletions.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -457,6 +457,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
+- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
 
 
 Reshaping

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1875,24 +1875,40 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
 
         else:
             # i.e. func in base.reduction_kernels
+            if self.observed:
+                return self._reduction_kernel_transform(
+                    func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
+                )
 
-            # GH#30918 Use _transform_fast only when we know func is an aggregation
-            # If func is a reduction, we need to broadcast the
-            # result to the whole group. Compute func result
-            # and deal with possible broadcasting below.
-            with com.temp_setattr(self, "as_index", True):
-                # GH#49834 - result needs groups in the index for
-                # _wrap_transform_fast_result
-                if func in ["idxmin", "idxmax"]:
-                    func = cast(Literal["idxmin", "idxmax"], func)
-                    result = self._idxmax_idxmin(func, True, *args, **kwargs)
-                else:
-                    if engine is not None:
-                        kwargs["engine"] = engine
-                        kwargs["engine_kwargs"] = engine_kwargs
-                    result = getattr(self, func)(*args, **kwargs)
+            with (
+                com.temp_setattr(self, "observed", True),
+                com.temp_setattr(self, "_grouper", self._grouper.observed_grouper),
+            ):
+                return self._reduction_kernel_transform(
+                    func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
+                )
+
+    @final
+    def _reduction_kernel_transform(
+        self, func, *args, engine=None, engine_kwargs=None, **kwargs
+    ):
+        # GH#30918 Use _transform_fast only when we know func is an aggregation
+        # If func is a reduction, we need to broadcast the
+        # result to the whole group. Compute func result
+        # and deal with possible broadcasting below.
+        with com.temp_setattr(self, "as_index", True):
+            # GH#49834 - result needs groups in the index for
+            # _wrap_transform_fast_result
+            if func in ["idxmin", "idxmax"]:
+                func = cast(Literal["idxmin", "idxmax"], func)
+                result = self._idxmax_idxmin(func, True, *args, **kwargs)
+            else:
+                if engine is not None:
+                    kwargs["engine"] = engine
+                    kwargs["engine_kwargs"] = engine_kwargs
+                result = getattr(self, func)(*args, **kwargs)
 
-            return self._wrap_transform_fast_result(result)
+        return self._wrap_transform_fast_result(result)
 
     @final
     def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -668,6 +668,28 @@ def groups(self) -> dict[Hashable, Index]:
         cats = Categorical.from_codes(codes, uniques, validate=False)
         return self._index.groupby(cats)
 
+    @property
+    def observed_grouping(self) -> Grouping:
+        if self._observed:
+            return self
+
+        return self._observed_grouping
+
+    @cache_readonly
+    def _observed_grouping(self) -> Grouping:
+        grouping = Grouping(
+            self._index,
+            self._orig_grouper,
+            obj=self.obj,
+            level=self.level,
+            sort=self._sort,
+            observed=True,
+            in_axis=self.in_axis,
+            dropna=self._dropna,
+            uniques=self._uniques,
+        )
+        return grouping
+
 
 def get_grouper(
     obj: NDFrameT,

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -823,6 +823,19 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
 
         return result_index, ids
 
+    @property
+    def observed_grouper(self) -> BaseGrouper:
+        if all(ping._observed for ping in self.groupings):
+            return self
+
+        return self._observed_grouper
+
+    @cache_readonly
+    def _observed_grouper(self) -> BaseGrouper:
+        groupings = [ping.observed_grouping for ping in self.groupings]
+        grouper = BaseGrouper(self.axis, groupings, sort=self._sort, dropna=self.dropna)
+        return grouper
+
     def _ob_index_and_ids(
         self,
         levels: list[Index],
@@ -1154,6 +1167,10 @@ def groupings(self) -> list[grouper.Grouping]:
         )
         return [ping]
 
+    @property
+    def observed_grouper(self) -> BinGrouper:
+        return self
+
 
 def _is_indexed_like(obj, axes) -> bool:
     if isinstance(obj, Series):

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -1232,9 +1232,9 @@ def test_categorical_and_not_categorical_key(observed):
     tm.assert_frame_equal(result, expected_explicit)
 
     # Series case
-    result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform(
-        "sum"
-    )
+    gb = df_with_categorical.groupby(["A", "C"], observed=observed)
+    gbp = gb["B"]
+    result = gbp.transform("sum")
     expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum")
     tm.assert_series_equal(result, expected)
     expected_explicit = Series([4, 2, 4], name="B")
@@ -1535,3 +1535,48 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels():
     result = df.groupby(series, as_index=False).transform("sum")
     expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]})
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["int32", "float32"])
+def test_min_one_unobserved_category_no_type_coercion(dtype):
+    # GH#58084
+    df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]})
+    df["B"] = df["B"].astype(dtype)
+    gb = df.groupby("A", observed=False)
+    result = gb.transform("min")
+
+    expected = DataFrame({"B": [3, 3, 5]}, dtype=dtype)
+    tm.assert_frame_equal(expected, result)
+
+
+def test_min_all_empty_data_no_type_coercion():
+    # GH#58084
+    df = DataFrame(
+        {
+            "X": Categorical(
+                [],
+                categories=[1, "randomcat", 100],
+            ),
+            "Y": [],
+        }
+    )
+    df["Y"] = df["Y"].astype("int32")
+
+    gb = df.groupby("X", observed=False)
+    result = gb.transform("min")
+
+    expected = DataFrame({"Y": []}, dtype="int32")
+    tm.assert_frame_equal(expected, result)
+
+
+def test_min_one_dim_no_type_coercion():
+    # GH#58084
+    df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]})
+    df["Y"] = df["Y"].astype("int32")
+    categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5])
+
+    gb = df.groupby(categories, observed=False)
+    result = gb.transform("min")
+
+    expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32")
+    tm.assert_frame_equal(expected, result)