Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: move MaskedArray subclass attributes to dtypes #58423

Merged
merged 5 commits into from Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/_libs/lib.pyx
Expand Up @@ -2808,14 +2808,14 @@ def maybe_convert_objects(ndarray[object] objects,
from pandas.core.arrays import IntegerArray

# Set these values to 1 to be deterministic, match
# IntegerArray._internal_fill_value
# IntegerDtype._internal_fill_value
result[mask] = 1
result = IntegerArray(result, mask)
elif result is floats and convert_to_nullable_dtype:
from pandas.core.arrays import FloatingArray

# Set these values to 1.0 to be deterministic, match
# FloatingArray._internal_fill_value
# FloatingDtype._internal_fill_value
result[mask] = 1.0
result = FloatingArray(result, mask)

Expand Down
10 changes: 3 additions & 7 deletions pandas/core/arrays/boolean.py
Expand Up @@ -68,6 +68,9 @@ class BooleanDtype(BaseMaskedDtype):

name: ClassVar[str] = "boolean"

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False

# https://github.com/python/mypy/issues/4125
# error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
@property
Expand Down Expand Up @@ -293,13 +296,6 @@ class BooleanArray(BaseMaskedArray):
Length: 3, dtype: boolean
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False
# Fill values used for any/all
# Incompatible types in assignment (expression has type "bool", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = True # type: ignore[assignment]
_falsey_value = False # type: ignore[assignment]
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}

Expand Down
10 changes: 2 additions & 8 deletions pandas/core/arrays/floating.py
Expand Up @@ -23,6 +23,8 @@ class FloatingDtype(NumericDtype):
The attributes name & type are set when these subclasses are created.
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = np.nan
_default_np_dtype = np.dtype(np.float64)
_checker = is_float_dtype

Expand Down Expand Up @@ -113,14 +115,6 @@ class FloatingArray(NumericArray):

_dtype_cls = FloatingDtype

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = np.nan
# Fill values used for any/all
# Incompatible types in assignment (expression has type "float", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1.0 # type: ignore[assignment]
_falsey_value = 0.0 # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} data.
Expand Down
10 changes: 2 additions & 8 deletions pandas/core/arrays/integer.py
Expand Up @@ -23,6 +23,8 @@ class IntegerDtype(NumericDtype):
The attributes name & type are set when these subclasses are created.
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1
_default_np_dtype = np.dtype(np.int64)
_checker = is_integer_dtype

Expand Down Expand Up @@ -128,14 +130,6 @@ class IntegerArray(NumericArray):

_dtype_cls = IntegerDtype

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1
# Fill values used for any/all
# Incompatible types in assignment (expression has type "int", base class
# "BaseMaskedArray" defined the type as "<typing special form>")
_truthy_value = 1 # type: ignore[assignment]
_falsey_value = 0 # type: ignore[assignment]


_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.
Expand Down
60 changes: 23 additions & 37 deletions pandas/core/arrays/masked.py
Expand Up @@ -5,6 +5,7 @@
Any,
Callable,
Literal,
cast,
overload,
)
import warnings
Expand All @@ -16,22 +17,6 @@
missing as libmissing,
)
from pandas._libs.tslibs import is_supported_dtype
from pandas._typing import (
ArrayLike,
AstypeArg,
AxisInt,
DtypeObj,
FillnaOptions,
InterpolateOptions,
NpDtype,
PositionalIndexer,
Scalar,
ScalarIndexer,
Self,
SequenceIndexer,
Shape,
npt,
)
from pandas.compat import (
IS64,
is_platform_windows,
Expand Down Expand Up @@ -97,6 +82,20 @@
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
ArrayLike,
AstypeArg,
AxisInt,
DtypeObj,
FillnaOptions,
InterpolateOptions,
NpDtype,
PositionalIndexer,
Scalar,
ScalarIndexer,
Self,
SequenceIndexer,
Shape,
npt,
)
from pandas._libs.missing import NAType
from pandas.core.arrays import FloatingArray
Expand All @@ -111,16 +110,10 @@ class BaseMaskedArray(OpsMixin, ExtensionArray):
numpy based
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value: Scalar
# our underlying data and mask are each ndarrays
_data: np.ndarray
_mask: npt.NDArray[np.bool_]

# Fill values used for any/all
_truthy_value = Scalar # bool(_truthy_value) = True
_falsey_value = Scalar # bool(_falsey_value) = False

@classmethod
def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
result = BaseMaskedArray.__new__(cls)
Expand Down Expand Up @@ -155,8 +148,9 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self:
@classmethod
@doc(ExtensionArray._empty)
def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
values = np.empty(shape, dtype=dtype.type)
values.fill(cls._internal_fill_value)
dtype = cast(BaseMaskedDtype, dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cast suspicious - why wouldn't we just change the type of dtype to BaseMaskedType if the _internal_fill_value attribute was required?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC mypy complains if we do that, but in practice that is correct that we should only ever get BaseMaskedDtype here

values: np.ndarray = np.empty(shape, dtype=dtype.type)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the type declaration required here? I assume mypy would be able to infer this from the right hand side

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mypy complained without this

values.fill(dtype._internal_fill_value)
mask = np.ones(shape, dtype=bool)
result = cls(values, mask)
if not isinstance(result, cls) or dtype != result.dtype:
Expand Down Expand Up @@ -917,7 +911,9 @@ def take(
) -> Self:
# we always fill with 1 internally
# to avoid upcasting
data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
data_fill_value = (
self.dtype._internal_fill_value if isna(fill_value) else fill_value
)
result = take(
self._data,
indexer,
Expand Down Expand Up @@ -1397,12 +1393,7 @@ def any(
nv.validate_any((), kwargs)

values = self._data.copy()
# error: Argument 3 to "putmask" has incompatible type "object";
# expected "Union[_SupportsArray[dtype[Any]],
# _NestedSequence[_SupportsArray[dtype[Any]]],
# bool, int, float, complex, str, bytes,
# _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type]
np.putmask(values, self._mask, self.dtype.falsey_value)
result = values.any()
if skipna:
return result
Expand Down Expand Up @@ -1490,12 +1481,7 @@ def all(
nv.validate_all((), kwargs)

values = self._data.copy()
# error: Argument 3 to "putmask" has incompatible type "object";
# expected "Union[_SupportsArray[dtype[Any]],
# _NestedSequence[_SupportsArray[dtype[Any]]],
# bool, int, float, complex, str, bytes,
# _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type]
np.putmask(values, self._mask, self.dtype.truthy_value)
result = values.all(axis=axis)

if skipna:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/numeric.py
Expand Up @@ -221,7 +221,7 @@ def _coerce_to_data_and_mask(
# we copy as need to coerce here
if mask.any():
values = values.copy()
values[mask] = cls._internal_fill_value
values[mask] = dtype_cls._internal_fill_value
if inferred_type in ("string", "unicode"):
# casts from str are always safe since they raise
# a ValueError if the str cannot be parsed into a float
Expand Down
20 changes: 20 additions & 0 deletions pandas/core/dtypes/dtypes.py
Expand Up @@ -79,6 +79,7 @@
DtypeObj,
IntervalClosedType,
Ordered,
Scalar,
Self,
npt,
type_t,
Expand Down Expand Up @@ -1542,6 +1543,25 @@ class BaseMaskedDtype(ExtensionDtype):

base = None
type: type
_internal_fill_value: Scalar

@property
def truthy_value(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these be "private" ie `_truthy_value?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

they are accessed from outside the dtype's methods

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't these raise NotImplementedError as a fallback? The idea is this class can be used for more than just float/integer/bool in the future right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i dont think there's any plans to extend like that

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought decided to privatize after all

# Fill values used for 'any'
if self.kind == "f":
return 1.0
if self.kind in "iu":
return 1
return True

@property
def falsey_value(self):
# Fill values used for 'all'
if self.kind == "f":
return 0.0
if self.kind in "iu":
return 0
return False

@property
def na_value(self) -> libmissing.NAType:
Expand Down