CLN: Simplify map_infer_mask (#58483)

* CLN: Simplify map_infer_mask * fix some tests * fix tests? * Fix types? * fixup annotations
pandas-dev · May 10, 2024 · 24182c2 · 24182c2
1 parent e67241b
commit 24182c2
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 74 deletions.
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -148,10 +148,14 @@ def time_searchsorted(self, dtype):
 
 
 class Map:
-    params = (["dict", "Series", "lambda"], ["object", "category", "int"])
-    param_names = "mapper"
-
-    def setup(self, mapper, dtype):
+    params = (
+        ["dict", "Series", "lambda"],
+        ["object", "category", "int"],
+        [None, "ignore"],
+    )
+    param_names = ["mapper", "dtype", "na_action"]
+
+    def setup(self, mapper, dtype, na_action):
         map_size = 1000
         map_data = Series(map_size - np.arange(map_size), dtype=dtype)
 
@@ -168,8 +172,8 @@ def setup(self, mapper, dtype):
 
         self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype)
 
-    def time_map(self, mapper, *args, **kwargs):
-        self.s.map(self.map_data)
+    def time_map(self, mapper, dtype, na_action):
+        self.s.map(self.map_data, na_action=na_action)
 
 
 class Clip:

diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd
@@ -34,8 +34,3 @@ ctypedef fused numeric_t:
 ctypedef fused numeric_object_t:
     numeric_t
     object
-
-ctypedef fused uint8_int64_object_t:
-    uint8_t
-    int64_t
-    object
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -53,6 +53,7 @@ from numpy cimport (
     PyArray_ITER_DATA,
     PyArray_ITER_NEXT,
     PyArray_IterNew,
+    PyArray_SETITEM,
     complex128_t,
     flatiter,
     float64_t,
@@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h":
 PandasParser_IMPORT
 
 from pandas._libs cimport util
-from pandas._libs.dtypes cimport uint8_int64_object_t
 from pandas._libs.util cimport (
     INT64_MAX,
     INT64_MIN,
@@ -2845,14 +2845,16 @@ no_default = _NoDefault.no_default  # Sentinel indicating the default value.
 NoDefault = Literal[_NoDefault.no_default]
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def map_infer_mask(
-        ndarray[object] arr,
-        object f,
-        const uint8_t[:] mask,
-        *,
-        bint convert=True,
-        object na_value=no_default,
-        cnp.dtype dtype=np.dtype(object)
+    ndarray arr,
+    object f,
+    const uint8_t[:] mask,
+    *,
+    bint convert=True,
+    object na_value=no_default,
+    cnp.dtype dtype=np.dtype(object)
 ) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
@@ -2875,53 +2877,39 @@ def map_infer_mask(
     -------
     np.ndarray or an ExtensionArray
     """
-    cdef Py_ssize_t n = len(arr)
-    result = np.empty(n, dtype=dtype)
-
-    _map_infer_mask(
-        result,
-        arr,
-        f,
-        mask,
-        na_value,
-    )
-    if convert:
-        return maybe_convert_objects(result)
-    else:
-        return result
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def _map_infer_mask(
-        ndarray[uint8_int64_object_t] out,
-        ndarray[object] arr,
-        object f,
-        const uint8_t[:] mask,
-        object na_value=no_default,
-) -> None:
-    """
-    Helper for map_infer_mask, split off to use fused types based on the result.
-    """
     cdef:
-        Py_ssize_t i, n
+        Py_ssize_t i
+        Py_ssize_t n = len(arr)
         object val
 
-    n = len(arr)
+        ndarray result = np.empty(n, dtype=dtype)
+
+        flatiter arr_it = PyArray_IterNew(arr)
+        flatiter result_it = PyArray_IterNew(result)
+
     for i in range(n):
         if mask[i]:
             if na_value is no_default:
-                val = arr[i]
+                val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
             else:
                 val = na_value
         else:
-            val = f(arr[i])
+            val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
+            val = f(val)
 
             if cnp.PyArray_IsZeroDim(val):
                 # unbox 0-dim arrays, GH#690
                 val = val.item()
 
-        out[i] = val
+        PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val)
+
+        PyArray_ITER_NEXT(arr_it)
+        PyArray_ITER_NEXT(result_it)
+
+    if convert:
+        return maybe_convert_objects(result)
+    else:
+        return result
 
 
 @cython.boundscheck(False)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -629,28 +629,25 @@ def _str_map(
                 na_value = np.nan
             else:
                 na_value = False
-            try:
-                result = lib.map_infer_mask(
-                    arr,
-                    f,
-                    mask.view("uint8"),
-                    convert=False,
-                    na_value=na_value,
-                    dtype=np.dtype(cast(type, dtype)),
-                )
-                return result
-
-            except ValueError:
-                result = lib.map_infer_mask(
-                    arr,
-                    f,
-                    mask.view("uint8"),
-                    convert=False,
-                    na_value=na_value,
-                )
-                if convert and result.dtype == object:
-                    result = lib.maybe_convert_objects(result)
-                return result
+
+            dtype = np.dtype(cast(type, dtype))
+            if mask.any():
+                # numpy int/bool dtypes cannot hold NaNs so we must convert to
+                # float64 for int (to match maybe_convert_objects) or
+                # object for bool (again to match maybe_convert_objects)
+                if is_integer_dtype(dtype):
+                    dtype = np.dtype("float64")
+                else:
+                    dtype = np.dtype(object)
+            result = lib.map_infer_mask(
+                arr,
+                f,
+                mask.view("uint8"),
+                convert=False,
+                na_value=na_value,
+                dtype=dtype,
+            )
+            return result
 
         elif is_string_dtype(dtype) and not is_object_dtype(dtype):
             # i.e. StringDtype