diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py index f7884cd6c309..b64f8ab173cb 100644 --- a/benchmarks/benchmarks/bench_lib.py +++ b/benchmarks/benchmarks/bench_lib.py @@ -137,3 +137,22 @@ def setup(self, array_size, percent_nans): def time_unique(self, array_size, percent_nans): np.unique(self.arr) + + +class Isin(Benchmark): + """Benchmarks for `numpy.isin`.""" + + param_names = ["size", "highest_element"] + params = [ + [10, 100000, 3000000], + [10, 10000, int(1e8)] + ] + + def setup(self, size, highest_element): + self.array = np.random.randint( + low=0, high=highest_element, size=size) + self.in_array = np.random.randint( + low=0, high=highest_element, size=size) + + def time_isin(self, size, highest_element): + np.isin(self.array, self.in_array) diff --git a/doc/release/upcoming_changes/12065.performance.rst b/doc/release/upcoming_changes/12065.performance.rst new file mode 100644 index 000000000000..08d1263b94fb --- /dev/null +++ b/doc/release/upcoming_changes/12065.performance.rst @@ -0,0 +1,6 @@ +Faster version of ``np.isin`` and ``np.in1d`` for integer arrays +---------------------------------------------------------------- +``np.in1d`` (used by ``np.isin``) can now switch to a faster algorithm +(up to >10x faster) when it is passed two integer arrays. +This is often automatically used, but you can use ``kind="sort"`` or +``kind="table"`` to force the old or new method, respectively. \ No newline at end of file diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index d42ab26758c7..9a44d7e443bb 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -516,12 +516,13 @@ def setxor1d(ar1, ar2, assume_unique=False): return aux[flag[1:] & flag[:-1]] -def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None): +def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None, *, + kind=None): return (ar1, ar2) @array_function_dispatch(_in1d_dispatcher) -def in1d(ar1, ar2, assume_unique=False, invert=False): +def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None): """ Test whether each element of a 1-D array is also present in a second array. @@ -544,6 +545,26 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): False where an element of `ar1` is in `ar2` and True otherwise). Default is False. ``np.in1d(a, b, invert=True)`` is equivalent to (but is faster than) ``np.invert(in1d(a, b))``. + kind : {None, 'sort', 'table'}, optional + The algorithm to use. This will not affect the final result, + but will affect the speed and memory use. The default, None, + will select automatically based on memory considerations. + + * If 'sort', will use a mergesort-based approach. This will have + a memory usage of roughly 6 times the sum of the sizes of + `ar1` and `ar2`, not accounting for size of dtypes. + * If 'table', will use a lookup table approach similar + to a counting sort. This is only available for boolean and + integer arrays. This will have a memory usage of the + size of `ar1` plus the max-min value of `ar2`. `assume_unique` + has no effect when the 'table' option is used. + * If None, will automatically choose 'table' if + the required memory allocation is less than or equal to + 6 times the sum of the sizes of `ar1` and `ar2`, + otherwise will use 'sort'. This is done to not use + a large amount of memory by default, even though + 'table' may be faster in most cases. If 'table' is chosen, + `assume_unique` will have no effect. .. versionadded:: 1.8.0 @@ -569,6 +590,13 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): ``asarray(ar2)`` is an object array rather than the expected array of contained values. + Using ``kind='table'`` tends to be faster than `kind='sort'` if the + following relationship is true: + ``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``, + but may use greater memory. The default value for `kind` will + be automatically selected based only on memory usage, so one may + manually set ``kind='table'`` if memory constraints can be relaxed. + .. versionadded:: 1.4.0 Examples @@ -593,6 +621,76 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): # Ensure that iteration through object arrays yields size-1 arrays if ar2.dtype == object: ar2 = ar2.reshape(-1, 1) + # Convert booleans to uint8 so we can use the fast integer algorithm + if ar1.dtype == bool: + ar1 = ar1 + np.uint8(0) + if ar2.dtype == bool: + ar2 = ar2 + np.uint8(0) + + # Check if we can use a fast integer algorithm: + integer_arrays = (np.issubdtype(ar1.dtype, np.integer) and + np.issubdtype(ar2.dtype, np.integer)) + + if kind not in {None, 'sort', 'table'}: + raise ValueError( + f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.") + + if integer_arrays and kind in {None, 'table'}: + ar2_min = np.min(ar2) + ar2_max = np.max(ar2) + + ar2_range = int(ar2_max) - int(ar2_min) + + # Constraints on whether we can actually use the table method: + range_safe_from_overflow = ar2_range < np.iinfo(ar2.dtype).max + below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size) + + # Optimal performance is for approximately + # log10(size) > (log10(range) - 2.27) / 0.927. + # However, here we set the requirement that by default + # the intermediate array can only be 6x + # the combined memory allocation of the original + # arrays. See discussion on + # https://github.com/numpy/numpy/pull/12065. + + if ( + range_safe_from_overflow and + (below_memory_constraint or kind == 'table') + ): + + if invert: + outgoing_array = np.ones_like(ar1, dtype=bool) + else: + outgoing_array = np.zeros_like(ar1, dtype=bool) + + # Make elements 1 where the integer exists in ar2 + if invert: + isin_helper_ar = np.ones(ar2_range + 1, dtype=bool) + isin_helper_ar[ar2 - ar2_min] = 0 + else: + isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool) + isin_helper_ar[ar2 - ar2_min] = 1 + + # Mask out elements we know won't work + basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min) + outgoing_array[basic_mask] = isin_helper_ar[ar1[basic_mask] - + ar2_min] + + return outgoing_array + elif kind == 'table': # not range_safe_from_overflow + raise RuntimeError( + "You have specified kind='table', " + "but the range of values in `ar2` exceeds the " + "maximum integer of the datatype. " + "Please set `kind` to None or 'sort'." + ) + elif kind == 'table': + raise ValueError( + "The 'table' method is only " + "supported for boolean or integer arrays. " + "Please select 'sort' or None for kind." + ) + # Check if one of the arrays may contain arbitrary objects contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject @@ -637,12 +735,14 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): return ret[rev_idx] -def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None): +def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None, + *, kind=None): return (element, test_elements) @array_function_dispatch(_isin_dispatcher) -def isin(element, test_elements, assume_unique=False, invert=False): +def isin(element, test_elements, assume_unique=False, invert=False, *, + kind=None): """ Calculates ``element in test_elements``, broadcasting over `element` only. Returns a boolean array of the same shape as `element` that is True @@ -664,6 +764,27 @@ def isin(element, test_elements, assume_unique=False, invert=False): calculating `element not in test_elements`. Default is False. ``np.isin(a, b, invert=True)`` is equivalent to (but faster than) ``np.invert(np.isin(a, b))``. + kind : {None, 'sort', 'table'}, optional + The algorithm to use. This will not affect the final result, + but will affect the speed and memory use. The default, None, + will select automatically based on memory considerations. + + * If 'sort', will use a mergesort-based approach. This will have + a memory usage of roughly 6 times the sum of the sizes of + `ar1` and `ar2`, not accounting for size of dtypes. + * If 'table', will use a lookup table approach similar + to a counting sort. This is only available for boolean and + integer arrays. This will have a memory usage of the + size of `ar1` plus the max-min value of `ar2`. `assume_unique` + has no effect when the 'table' option is used. + * If None, will automatically choose 'table' if + the required memory allocation is less than or equal to + 6 times the sum of the sizes of `ar1` and `ar2`, + otherwise will use 'sort'. This is done to not use + a large amount of memory by default, even though + 'table' may be faster in most cases. If 'table' is chosen, + `assume_unique` will have no effect. + Returns ------- @@ -691,6 +812,13 @@ def isin(element, test_elements, assume_unique=False, invert=False): of the `array` constructor's way of handling non-sequence collections. Converting the set to a list usually gives the desired behavior. + Using ``kind='table'`` tends to be faster than `kind='sort'` if the + following relationship is true: + ``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``, + but may use greater memory. The default value for `kind` will + be automatically selected based only on memory usage, so one may + manually set ``kind='table'`` if memory constraints can be relaxed. + .. versionadded:: 1.13.0 Examples @@ -737,7 +865,7 @@ def isin(element, test_elements, assume_unique=False, invert=False): """ element = np.asarray(element) return in1d(element, test_elements, assume_unique=assume_unique, - invert=invert).reshape(element.shape) + invert=invert, kind=kind).reshape(element.shape) def _union1d_dispatcher(ar1, ar2): diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py index e64634b6939f..d91d362825df 100644 --- a/numpy/lib/tests/test_arraysetops.py +++ b/numpy/lib/tests/test_arraysetops.py @@ -195,7 +195,8 @@ def test_ediff1d_scalar_handling(self, assert_equal(actual, expected) assert actual.dtype == expected.dtype - def test_isin(self): + @pytest.mark.parametrize("kind", [None, "sort", "table"]) + def test_isin(self, kind): # the tests for in1d cover most of isin's behavior # if in1d is removed, would need to change those tests to test # isin instead. @@ -205,7 +206,7 @@ def _isin_slow(a, b): isin_slow = np.vectorize(_isin_slow, otypes=[bool], excluded={1}) def assert_isin_equal(a, b): - x = isin(a, b) + x = isin(a, b, kind=kind) y = isin_slow(a, b) assert_array_equal(x, y) @@ -231,12 +232,14 @@ def assert_isin_equal(a, b): assert_isin_equal(5, 6) # empty array-like: - x = [] - assert_isin_equal(x, b) - assert_isin_equal(a, x) - assert_isin_equal(x, x) - - def test_in1d(self): + if kind in {None, "sort"}: + x = [] + assert_isin_equal(x, b) + assert_isin_equal(a, x) + assert_isin_equal(x, x) + + @pytest.mark.parametrize("kind", [None, "sort", "table"]) + def test_in1d(self, kind): # we use two different sizes for the b array here to test the # two different paths in in1d(). for mult in (1, 10): @@ -244,57 +247,58 @@ def test_in1d(self): a = [5, 7, 1, 2] b = [2, 4, 3, 1, 5] * mult ec = np.array([True, False, True, True]) - c = in1d(a, b, assume_unique=True) + c = in1d(a, b, assume_unique=True, kind=kind) assert_array_equal(c, ec) a[0] = 8 ec = np.array([False, False, True, True]) - c = in1d(a, b, assume_unique=True) + c = in1d(a, b, assume_unique=True, kind=kind) assert_array_equal(c, ec) a[0], a[3] = 4, 8 ec = np.array([True, False, True, False]) - c = in1d(a, b, assume_unique=True) + c = in1d(a, b, assume_unique=True, kind=kind) assert_array_equal(c, ec) a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5]) b = [2, 3, 4] * mult ec = [False, True, False, True, True, True, True, True, True, False, True, False, False, False] - c = in1d(a, b) + c = in1d(a, b, kind=kind) assert_array_equal(c, ec) b = b + [5, 5, 4] * mult ec = [True, True, True, True, True, True, True, True, True, True, True, False, True, True] - c = in1d(a, b) + c = in1d(a, b, kind=kind) assert_array_equal(c, ec) a = np.array([5, 7, 1, 2]) b = np.array([2, 4, 3, 1, 5] * mult) ec = np.array([True, False, True, True]) - c = in1d(a, b) + c = in1d(a, b, kind=kind) assert_array_equal(c, ec) a = np.array([5, 7, 1, 1, 2]) b = np.array([2, 4, 3, 3, 1, 5] * mult) ec = np.array([True, False, True, True, True]) - c = in1d(a, b) + c = in1d(a, b, kind=kind) assert_array_equal(c, ec) a = np.array([5, 5]) b = np.array([2, 2] * mult) ec = np.array([False, False]) - c = in1d(a, b) + c = in1d(a, b, kind=kind) assert_array_equal(c, ec) a = np.array([5]) b = np.array([2]) ec = np.array([False]) - c = in1d(a, b) + c = in1d(a, b, kind=kind) assert_array_equal(c, ec) - assert_array_equal(in1d([], []), []) + if kind in {None, "sort"}: + assert_array_equal(in1d([], [], kind=kind), []) def test_in1d_char_array(self): a = np.array(['a', 'b', 'c', 'd', 'e', 'c', 'e', 'b']) @@ -305,16 +309,29 @@ def test_in1d_char_array(self): assert_array_equal(c, ec) - def test_in1d_invert(self): + @pytest.mark.parametrize("kind", [None, "sort", "table"]) + def test_in1d_invert(self, kind): "Test in1d's invert parameter" # We use two different sizes for the b array here to test the # two different paths in in1d(). for mult in (1, 10): a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5]) b = [2, 3, 4] * mult - assert_array_equal(np.invert(in1d(a, b)), in1d(a, b, invert=True)) - - def test_in1d_ravel(self): + assert_array_equal(np.invert(in1d(a, b, kind=kind)), + in1d(a, b, invert=True, kind=kind)) + + # float: + if kind in {None, "sort"}: + for mult in (1, 10): + a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5], + dtype=np.float32) + b = [2, 3, 4] * mult + b = np.array(b, dtype=np.float32) + assert_array_equal(np.invert(in1d(a, b, kind=kind)), + in1d(a, b, invert=True, kind=kind)) + + @pytest.mark.parametrize("kind", [None, "sort", "table"]) + def test_in1d_ravel(self, kind): # Test that in1d ravels its input arrays. This is not documented # behavior however. The test is to ensure consistentency. a = np.arange(6).reshape(2, 3) @@ -322,10 +339,44 @@ def test_in1d_ravel(self): long_b = np.arange(3, 63).reshape(30, 2) ec = np.array([False, False, False, True, True, True]) - assert_array_equal(in1d(a, b, assume_unique=True), ec) - assert_array_equal(in1d(a, b, assume_unique=False), ec) - assert_array_equal(in1d(a, long_b, assume_unique=True), ec) - assert_array_equal(in1d(a, long_b, assume_unique=False), ec) + assert_array_equal(in1d(a, b, assume_unique=True, kind=kind), + ec) + assert_array_equal(in1d(a, b, assume_unique=False, + kind=kind), + ec) + assert_array_equal(in1d(a, long_b, assume_unique=True, + kind=kind), + ec) + assert_array_equal(in1d(a, long_b, assume_unique=False, + kind=kind), + ec) + + def test_in1d_hit_alternate_algorithm(self): + """Hit the standard isin code with integers""" + # Need extreme range to hit standard code + # This hits it without the use of kind='table' + a = np.array([5, 4, 5, 3, 4, 4, 1e9], dtype=np.int64) + b = np.array([2, 3, 4, 1e9], dtype=np.int64) + expected = np.array([0, 1, 0, 1, 1, 1, 1], dtype=bool) + assert_array_equal(expected, in1d(a, b)) + assert_array_equal(np.invert(expected), in1d(a, b, invert=True)) + + a = np.array([5, 7, 1, 2], dtype=np.int64) + b = np.array([2, 4, 3, 1, 5, 1e9], dtype=np.int64) + ec = np.array([True, False, True, True]) + c = in1d(a, b, assume_unique=True) + assert_array_equal(c, ec) + + @pytest.mark.parametrize("kind", [None, "sort", "table"]) + def test_in1d_boolean(self, kind): + """Test that in1d works for boolean input""" + a = np.array([True, False]) + b = np.array([False, False, False]) + expected = np.array([False, True]) + assert_array_equal(expected, + in1d(a, b, kind=kind)) + assert_array_equal(np.invert(expected), + in1d(a, b, invert=True, kind=kind)) def test_in1d_first_array_is_object(self): ar1 = [None] @@ -391,6 +442,40 @@ def test_in1d_with_arrays_containing_tuples(self): result = np.in1d(ar1, ar2, invert=True) assert_array_equal(result, np.invert(expected)) + def test_in1d_errors(self): + """Test that in1d raises expected errors.""" + + # Error 1: `kind` is not one of 'sort' 'table' or None. + ar1 = np.array([1, 2, 3, 4, 5]) + ar2 = np.array([2, 4, 6, 8, 10]) + assert_raises(ValueError, in1d, ar1, ar2, kind='quicksort') + + # Error 2: `kind="table"` does not work for non-integral arrays. + obj_ar1 = np.array([1, 'a', 3, 'b', 5], dtype=object) + obj_ar2 = np.array([1, 'a', 3, 'b', 5], dtype=object) + assert_raises(ValueError, in1d, obj_ar1, obj_ar2, kind='table') + + for dtype in [np.int32, np.int64]: + ar1 = np.array([-1, 2, 3, 4, 5], dtype=dtype) + # The range of this array will overflow: + overflow_ar2 = np.array([-1, np.iinfo(dtype).max], dtype=dtype) + + # Error 3: `kind="table"` will trigger a runtime error + # if there is an integer overflow expected when computing the + # range of ar2 + assert_raises( + RuntimeError, + in1d, ar1, overflow_ar2, kind='table' + ) + + # Non-error: `kind=None` will *not* trigger a runtime error + # if there is an integer overflow, it will switch to + # the `sort` algorithm. + result = np.in1d(ar1, overflow_ar2, kind=None) + assert_array_equal(result, [True] + [False] * 4) + result = np.in1d(ar1, overflow_ar2, kind='sort') + assert_array_equal(result, [True] + [False] * 4) + def test_union1d(self): a = np.array([5, 4, 7, 1, 2]) b = np.array([2, 4, 3, 3, 2, 1, 5])