Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: MultiIndex._engine use smaller dtypes #58411

Merged
merged 3 commits into from Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Expand Up @@ -332,6 +332,7 @@ Performance improvements
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
- Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`)
- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`)
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/index.pyi
Expand Up @@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
offsets: np.ndarray # np.ndarray[..., ndim=1]

def __init__(
self,
levels: list[Index], # all entries hashable
labels: list[np.ndarray], # all entries integer-dtyped
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
offsets: np.ndarray, # np.ndarray[..., ndim=1]
) -> None: ...
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
Expand Down
45 changes: 38 additions & 7 deletions pandas/_libs/index.pyx
Expand Up @@ -712,14 +712,16 @@ cdef class BaseMultiIndexCodesEngine:
Pre-calculated offsets, one for each level of the index.
"""
self.levels = levels
self.offsets = offsets
# Downcast the type if possible, to prevent upcasting when shifting codes:
self.offsets = offsets.astype(np.min_scalar_type(offsets[0]), copy=False)
GianlucaFicarelli marked this conversation as resolved.
Show resolved Hide resolved

# Transform labels in a single array, and add 2 so that we are working
# with positive integers (-1 for NaN becomes 1). This enables us to
# differentiate between values that are missing in other and matching
# NaNs. We will set values that are not found to 0 later:
labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
codes = labels_arr.astype("uint64", copy=False)
codes = np.array(labels).T
codes += multiindex_nulls_shift # inplace sum optimisation

self.level_has_nans = [-1 in lab for lab in labels]

# Map each codes combination in the index to an integer unambiguously
Expand All @@ -731,8 +733,37 @@ cdef class BaseMultiIndexCodesEngine:
# integers representing labels: we will use its get_loc and get_indexer
self._base.__init__(self, lab_ints)

def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
raise NotImplementedError("Implemented by subclass") # pragma: no cover
def _codes_to_ints(self, ndarray codes) -> np.ndarray:
"""
Transform each combination of uints into a single uint or Python integer, in a
strictly monotonic way (i.e. respecting the lexicographic order of integer
combinations).

Parameters
----------
codes : 1- or 2-dimensional array of dtype uint
Combinations of integers (one per row)

Returns
-------
scalar or 1-dimensional array, of dtype _codes_dtype
Integer(s) representing one combination (each).
"""
# To avoid overflows, first make sure we are working with the right dtype:
codes = codes.astype(self._codes_dtype, copy=False)

# Shift the representation of each level by the pre-calculated number of bits:
codes <<= self.offsets # inplace shift optimisation

# Now sum and OR are in fact interchangeable. This is a simple
# composition of the (disjunct) significant bits of each level (i.e.
# each column in "codes") in a single positive integer (per row):
if codes.ndim == 1:
# Single key
return np.bitwise_or.reduce(codes)

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)

def _extract_level_codes(self, target) -> np.ndarray:
"""
Expand All @@ -757,7 +788,7 @@ cdef class BaseMultiIndexCodesEngine:
codes[codes > 0] += 1
if self.level_has_nans[i]:
codes[target.codes[i] == -1] += 1
return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)

def get_indexer(self, target: np.ndarray) -> np.ndarray:
"""
Expand Down Expand Up @@ -788,7 +819,7 @@ cdef class BaseMultiIndexCodesEngine:
raise KeyError(key)

# Transform indices into single integer:
lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))

return self._base.get_loc(self, lab_int)

Expand Down
106 changes: 44 additions & 62 deletions pandas/core/indexes/multi.py
Expand Up @@ -123,84 +123,56 @@
)


class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
"""
This class manages a MultiIndex by mapping label combinations to positive
integers.
class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.

The number of possible label combinations must not overflow a 64-bit integer.
"""

_base = libindex.UInt64Engine
_codes_dtype = "uint64"

def _codes_to_ints(self, codes):
"""
Transform combination(s) of uint64 in one uint64 (each), in a strictly
monotonic way (i.e. respecting the lexicographic order of integer
combinations): see BaseMultiIndexCodesEngine documentation.

Parameters
----------
codes : 1- or 2-dimensional array of dtype uint64
Combinations of integers (one per row)
class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.

Returns
-------
scalar or 1-dimensional array, of dtype uint64
Integer(s) representing one combination (each).
"""
# Shift the representation of each level by the pre-calculated number
# of bits:
codes <<= self.offsets
The number of possible label combinations must not overflow a 32-bit integer.
"""

# Now sum and OR are in fact interchangeable. This is a simple
# composition of the (disjunct) significant bits of each level (i.e.
# each column in "codes") in a single positive integer:
if codes.ndim == 1:
# Single key
return np.bitwise_or.reduce(codes)
_base = libindex.UInt32Engine
_codes_dtype = "uint32"

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)

class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.

class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
"""
This class manages those (extreme) cases in which the number of possible
label combinations overflows the 64 bits integers, and uses an ObjectEngine
containing Python integers.
The number of possible label combinations must not overflow a 16-bit integer.
"""

_base = libindex.ObjectEngine
_base = libindex.UInt16Engine
_codes_dtype = "uint16"

def _codes_to_ints(self, codes):
"""
Transform combination(s) of uint64 in one Python integer (each), in a
strictly monotonic way (i.e. respecting the lexicographic order of
integer combinations): see BaseMultiIndexCodesEngine documentation.

Parameters
----------
codes : 1- or 2-dimensional array of dtype uint64
Combinations of integers (one per row)
class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine):
"""Manages a MultiIndex by mapping label combinations to positive integers.

Returns
-------
int, or 1-dimensional array of dtype object
Integer(s) representing one combination (each).
"""
# Shift the representation of each level by the pre-calculated number
# of bits. Since this can overflow uint64, first make sure we are
# working with Python integers:
codes = codes.astype("object") << self.offsets
The number of possible label combinations must not overflow an 8-bit integer.
"""

# Now sum and OR are in fact interchangeable. This is a simple
# composition of the (disjunct) significant bits of each level (i.e.
# each column in "codes") in a single positive integer (per row):
if codes.ndim == 1:
# Single key
return np.bitwise_or.reduce(codes)
_base = libindex.UInt8Engine
_codes_dtype = "uint8"

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)

class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
"""Manages a MultiIndex by mapping label combinations to positive integers.

This class manages those (extreme) cases in which the number of possible
label combinations overflows the 64 bits integers, and uses an ObjectEngine
containing Python integers.
"""

_base = libindex.ObjectEngine
_codes_dtype = "object"


def names_compat(meth: F) -> F:
Expand Down Expand Up @@ -1235,7 +1207,17 @@ def _engine(self):
if lev_bits[0] > 64:
# The levels would overflow a 64 bit uint - use Python integers:
return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
return MultiIndexUIntEngine(self.levels, self.codes, offsets)
if lev_bits[0] > 32:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible for the existing engine to get resized where it would exceed e.g. int32 max? Or would a new engine just be created?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you clarify in what cases the existing engine would exceed int32 max?
In cases where MultiIndex._set_levels() is called internally, this would also call _reset_cache() so the engine is recreated.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't exactly remember what path this goes through, but a snippet like

n = np.iinfo(np.uint8).max
ser = pd.Series(range(n), index=pd.MultiIndex.from_arrays([range(n)]))
ser.iloc[n+1] = n+1
ser.index.some_method_that_uses_the_engine

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried the snippet above with some slight adjustment:

In [34]: n = np.iinfo(np.uint8).max - 1

In [35]: ser = pd.Series(range(n), index=pd.MultiIndex.from_arrays([range(n)]))

In [36]: ser.index._cache
Out[36]: {}

In [37]: ser.index._engine
Out[37]: <pandas.core.indexes.multi.MultiIndexUInt8Engine at 0x118e0fed0>

In [38]: ser.index._engine.values
Out[38]: 
array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
       158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
       210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
       223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235,
       236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
       249, 250, 251, 252, 253, 254, 255], dtype=uint8)

In [39]: n
Out[39]: 254

In [40]: ser.index._cache
Out[40]: 
{'levels': FrozenList([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]]),
 '_engine': <pandas.core.indexes.multi.MultiIndexUInt8Engine at 0x118e0fed0>}

In [41]: ser.loc[n+1] = n+1

In [42]: ser.index._cache
Out[42]: {'levels': FrozenList([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]])}

In [43]: ser.index._engine
Out[43]: <pandas.core.indexes.multi.MultiIndexUInt16Engine at 0x107288270>

In [44]: ser.index._cache
Out[44]: 
{'levels': FrozenList([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]]),
 '_engine': <pandas.core.indexes.multi.MultiIndexUInt16Engine at 0x107288270>}

So it seems that after calling ser.loc, the engine is automatically deleted from the cache (or the index is a copy, since id returns a different number)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great thanks for checking

# The levels would overflow a 32 bit uint - use uint64
return MultiIndexUInt64Engine(self.levels, self.codes, offsets)
if lev_bits[0] > 16:
# The levels would overflow a 16 bit uint - use uint32
return MultiIndexUInt32Engine(self.levels, self.codes, offsets)
if lev_bits[0] > 8:
# The levels would overflow a 8 bit uint - use uint16
return MultiIndexUInt16Engine(self.levels, self.codes, offsets)
# The levels fit in an 8 bit uint - use uint8
return MultiIndexUInt8Engine(self.levels, self.codes, offsets)

# Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return
# type "Type[MultiIndex]" in supertype "Index"
Expand Down
41 changes: 26 additions & 15 deletions pandas/tests/indexes/multi/test_indexing.py
Expand Up @@ -919,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id
assert result == expected


def test_pyint_engine():
@pytest.mark.parametrize(
"N, expected_dtype",
[
(1, "uint8"), # 2*4*N = 8
(2, "uint16"), # 2*4*N = 16
(4, "uint32"), # 2*4*N = 32
(8, "uint64"), # 2*4*N = 64
(10, "object"), # 2*4*N = 80
],
)
def test_pyint_engine(N, expected_dtype):
# GH#18519 : when combinations of codes cannot be represented in 64
# bits, the index underlying the MultiIndex engine works with Python
# integers, rather than uint64.
N = 5
keys = [
tuple(arr)
for arr in [
[0] * 10 * N,
[1] * 10 * N,
[2] * 10 * N,
[np.nan] * N + [2] * 9 * N,
[0] * N + [2] * 9 * N,
[np.nan] * N + [2] * 8 * N + [0] * N,
[0] * 4 * N,
[1] * 4 * N,
[np.nan] * N + [0] * 3 * N,
[0] * N + [1] * 3 * N,
[np.nan] * N + [1] * 2 * N + [0] * N,
]
]
# Each level contains 4 elements (including NaN), so it is represented
# in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
# 64 bit engine and truncating the first levels, the fourth and fifth
# keys would collide; if truncating the last levels, the fifth and
# sixth; if rotating bits rather than shifting, the third and fifth.
# Each level contains 3 elements (NaN, 0, 1), and it's represented
# in 2 bits to store 4 possible values (0=notfound, 1=NaN, 2=0, 3=1), for
# a total of 2*N*4 = 80 > 64 bits where N=10 and the number of levels is N*4.
# If we were using a 64 bit engine and truncating the first levels, the
# fourth and fifth keys would collide; if truncating the last levels, the
# fifth and sixth; if rotating bits rather than shifting, the third and fifth.

index = MultiIndex.from_tuples(keys)
assert index._engine.values.dtype == expected_dtype

for idx, key_value in enumerate(keys):
index = MultiIndex.from_tuples(keys)
assert index.get_loc(key_value) == idx

expected = np.arange(idx + 1, dtype=np.intp)
Expand All @@ -952,7 +963,7 @@ def test_pyint_engine():
# With missing key:
idces = range(len(keys))
expected = np.array([-1] + list(idces), dtype=np.intp)
missing = tuple([0, 1] * 5 * N)
missing = tuple([0, 1, 0, 1] * N)
result = index.get_indexer([missing] + [keys[i] for i in idces])
tm.assert_numpy_array_equal(result, expected)

Expand Down