Skip to content

Commit

Permalink
PERF: MultiIndex._engine use smaller dtypes
Browse files Browse the repository at this point in the history
  • Loading branch information
GianlucaFicarelli committed Apr 24, 2024
1 parent 41014db commit d8fa119
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 86 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Expand Up @@ -332,6 +332,7 @@ Performance improvements
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
- Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`)
- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`)
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/index.pyi
Expand Up @@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
offsets: np.ndarray # np.ndarray[..., ndim=1]

def __init__(
self,
levels: list[Index], # all entries hashable
labels: list[np.ndarray], # all entries integer-dtyped
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
offsets: np.ndarray, # np.ndarray[..., ndim=1]
) -> None: ...
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
Expand Down
45 changes: 38 additions & 7 deletions pandas/_libs/index.pyx
Expand Up @@ -712,14 +712,16 @@ cdef class BaseMultiIndexCodesEngine:
Pre-calculated offsets, one for each level of the index.
"""
self.levels = levels
self.offsets = offsets
# Downcast the type if possible, to prevent upcasting when shifting codes:
self.offsets = offsets.astype(np.min_scalar_type(offsets[0]), copy=False)

# Transform labels in a single array, and add 2 so that we are working
# with positive integers (-1 for NaN becomes 1). This enables us to
# differentiate between values that are missing in other and matching
# NaNs. We will set values that are not found to 0 later:
labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
codes = labels_arr.astype("uint64", copy=False)
codes = np.array(labels).T
codes += multiindex_nulls_shift # inplace sum optimisation

self.level_has_nans = [-1 in lab for lab in labels]

# Map each codes combination in the index to an integer unambiguously
Expand All @@ -731,8 +733,37 @@ cdef class BaseMultiIndexCodesEngine:
# integers representing labels: we will use its get_loc and get_indexer
self._base.__init__(self, lab_ints)

def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
raise NotImplementedError("Implemented by subclass") # pragma: no cover
def _codes_to_ints(self, ndarray codes) -> np.ndarray:
"""
Transform combination(s) of uint in one uint or Python integer (each), in a
strictly monotonic way (i.e. respecting the lexicographic order of integer
combinations).

Parameters
----------
codes : 1- or 2-dimensional array of dtype uint
Combinations of integers (one per row)

Returns
-------
scalar or 1-dimensional array, of dtype _codes_dtype
Integer(s) representing one combination (each).
"""
# To avoid overflows, first make sure we are working with the right dtype:
codes = codes.astype(self._codes_dtype, copy=False)

# Shift the representation of each level by the pre-calculated number of bits:
codes <<= self.offsets # inplace shift optimisation

# Now sum and OR are in fact interchangeable. This is a simple
# composition of the (disjunct) significant bits of each level (i.e.
# each column in "codes") in a single positive integer (per row):
if codes.ndim == 1:
# Single key
return np.bitwise_or.reduce(codes)

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)

def _extract_level_codes(self, target) -> np.ndarray:
"""
Expand All @@ -757,7 +788,7 @@ cdef class BaseMultiIndexCodesEngine:
codes[codes > 0] += 1
if self.level_has_nans[i]:
codes[target.codes[i] == -1] += 1
return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)

def get_indexer(self, target: np.ndarray) -> np.ndarray:
"""
Expand Down Expand Up @@ -788,7 +819,7 @@ cdef class BaseMultiIndexCodesEngine:
raise KeyError(key)

# Transform indices into single integer:
lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))

return self._base.get_loc(self, lab_int)

Expand Down
106 changes: 44 additions & 62 deletions pandas/core/indexes/multi.py
Expand Up @@ -123,84 +123,56 @@
)


class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
"""
This class manages a MultiIndex by mapping label combinations to positive
integers.
class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.
The number of possible label combinations must not overflow the 64 bits integers.
"""

_base = libindex.UInt64Engine
_codes_dtype = "uint64"

def _codes_to_ints(self, codes):
"""
Transform combination(s) of uint64 in one uint64 (each), in a strictly
monotonic way (i.e. respecting the lexicographic order of integer
combinations): see BaseMultiIndexCodesEngine documentation.

Parameters
----------
codes : 1- or 2-dimensional array of dtype uint64
Combinations of integers (one per row)
class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.
Returns
-------
scalar or 1-dimensional array, of dtype uint64
Integer(s) representing one combination (each).
"""
# Shift the representation of each level by the pre-calculated number
# of bits:
codes <<= self.offsets
The number of possible label combinations must not overflow the 32 bits integers.
"""

# Now sum and OR are in fact interchangeable. This is a simple
# composition of the (disjunct) significant bits of each level (i.e.
# each column in "codes") in a single positive integer:
if codes.ndim == 1:
# Single key
return np.bitwise_or.reduce(codes)
_base = libindex.UInt32Engine
_codes_dtype = "uint32"

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)

class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
"""
This class manages those (extreme) cases in which the number of possible
label combinations overflows the 64 bits integers, and uses an ObjectEngine
containing Python integers.
The number of possible label combinations must not overflow the 16 bits integers.
"""

_base = libindex.ObjectEngine
_base = libindex.UInt16Engine
_codes_dtype = "uint16"

def _codes_to_ints(self, codes):
"""
Transform combination(s) of uint64 in one Python integer (each), in a
strictly monotonic way (i.e. respecting the lexicographic order of
integer combinations): see BaseMultiIndexCodesEngine documentation.

Parameters
----------
codes : 1- or 2-dimensional array of dtype uint64
Combinations of integers (one per row)
class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.
Returns
-------
int, or 1-dimensional array of dtype object
Integer(s) representing one combination (each).
"""
# Shift the representation of each level by the pre-calculated number
# of bits. Since this can overflow uint64, first make sure we are
# working with Python integers:
codes = codes.astype("object") << self.offsets
The number of possible label combinations must not overflow the 8 bits integers.
"""

# Now sum and OR are in fact interchangeable. This is a simple
# composition of the (disjunct) significant bits of each level (i.e.
# each column in "codes") in a single positive integer (per row):
if codes.ndim == 1:
# Single key
return np.bitwise_or.reduce(codes)
_base = libindex.UInt8Engine
_codes_dtype = "uint8"

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)

class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
"""Manages a MultiIndex by mapping label combinations to positive integers.
This class manages those (extreme) cases in which the number of possible
label combinations overflows the 64 bits integers, and uses an ObjectEngine
containing Python integers.
"""

_base = libindex.ObjectEngine
_codes_dtype = "object"


def names_compat(meth: F) -> F:
Expand Down Expand Up @@ -1235,7 +1207,17 @@ def _engine(self):
if lev_bits[0] > 64:
# The levels would overflow a 64 bit uint - use Python integers:
return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
return MultiIndexUIntEngine(self.levels, self.codes, offsets)
if lev_bits[0] > 32:
# The levels would overflow a 32 bit uint - use uint64
return MultiIndexUInt64Engine(self.levels, self.codes, offsets)
if lev_bits[0] > 16:
# The levels would overflow a 16 bit uint - use uint32
return MultiIndexUInt32Engine(self.levels, self.codes, offsets)
if lev_bits[0] > 8:
# The levels would overflow a 8 bit uint - use uint16
return MultiIndexUInt16Engine(self.levels, self.codes, offsets)
# The levels fit in an 8 bit uint - use uint8
return MultiIndexUInt8Engine(self.levels, self.codes, offsets)

# Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return
# type "Type[MultiIndex]" in supertype "Index"
Expand Down
41 changes: 26 additions & 15 deletions pandas/tests/indexes/multi/test_indexing.py
Expand Up @@ -919,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id
assert result == expected


def test_pyint_engine():
@pytest.mark.parametrize(
"N, expected_dtype",
[
(1, "uint8"), # 2*4*N = 8
(2, "uint16"), # 2*4*N = 16
(4, "uint32"), # 2*4*N = 32
(8, "uint64"), # 2*4*N = 64
(10, "object"), # 2*4*N = 80
]
)
def test_pyint_engine(N, expected_dtype):
# GH#18519 : when combinations of codes cannot be represented in 64
# bits, the index underlying the MultiIndex engine works with Python
# integers, rather than uint64.
N = 5
keys = [
tuple(arr)
for arr in [
[0] * 10 * N,
[1] * 10 * N,
[2] * 10 * N,
[np.nan] * N + [2] * 9 * N,
[0] * N + [2] * 9 * N,
[np.nan] * N + [2] * 8 * N + [0] * N,
[0] * 4 * N,
[1] * 4 * N,
[np.nan] * N + [0] * 3 * N,
[0] * N + [1] * 3 * N,
[np.nan] * N + [1] * 2 * N + [0] * N,
]
]
# Each level contains 4 elements (including NaN), so it is represented
# in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
# 64 bit engine and truncating the first levels, the fourth and fifth
# keys would collide; if truncating the last levels, the fifth and
# sixth; if rotating bits rather than shifting, the third and fifth.
# Each level contains 3 elements (NaN, 0, 1), and it's represented
# in 2 bits to store 4 possible values (0=notfound, 1=NaN, 2=0, 3=1), for
# a total of 2*N*4 = 80 > 64 bits where N=10 and the number of levels is N*4.
# If we were using a 64 bit engine and truncating the first levels, the
# fourth and fifth keys would collide; if truncating the last levels, the
# fifth and sixth; if rotating bits rather than shifting, the third and fifth.

index = MultiIndex.from_tuples(keys)
assert index._engine.values.dtype == expected_dtype

for idx, key_value in enumerate(keys):
index = MultiIndex.from_tuples(keys)
assert index.get_loc(key_value) == idx

expected = np.arange(idx + 1, dtype=np.intp)
Expand All @@ -952,7 +963,7 @@ def test_pyint_engine():
# With missing key:
idces = range(len(keys))
expected = np.array([-1] + list(idces), dtype=np.intp)
missing = tuple([0, 1] * 5 * N)
missing = tuple([0, 1, 0, 1] * N)
result = index.get_indexer([missing] + [keys[i] for i in idces])
tm.assert_numpy_array_equal(result, expected)

Expand Down

0 comments on commit d8fa119

Please sign in to comment.