numpy · seberg · Jun 23, 2022 · Oct 1, 2018 · Oct 2, 2018 · Oct 3, 2018
diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py
@@ -137,3 +137,22 @@ def setup(self, array_size, percent_nans):
 
     def time_unique(self, array_size, percent_nans):
         np.unique(self.arr)
+
+
+class Isin(Benchmark):
+    """Benchmarks for `numpy.isin`."""
+
+    param_names = ["size", "highest_element"]
+    params = [
+        [10, 100000, 3000000],
+        [10, 10000, int(1e8)]
+    ]
+
+    def setup(self, size, highest_element):
+        self.array = np.random.randint(
+                low=0, high=highest_element, size=size)
+        self.in_array = np.random.randint(
+                low=0, high=highest_element, size=size)
+
+    def time_isin(self, size, highest_element):
+        np.isin(self.array, self.in_array)
diff --git a/doc/release/upcoming_changes/12065.performance.rst b/doc/release/upcoming_changes/12065.performance.rst
@@ -0,0 +1,8 @@
+Faster version of ``np.isin`` and ``np.in1d`` for integer arrays
+----------------------------------------------------------------
+``np.in1d`` (used by ``np.isin``) can now switch to a faster algorithm
+(more than 10x faster in some cases) when it is fed two integer arrays.
+The algorithm bears similarities to a counting sort in that it
+uses the ``test_elements`` argument to index a boolean helper
+array with ``True`` values where elements exist. The ``element``
+argument simply indexes this array of booleans.
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
@@ -516,12 +516,13 @@ def setxor1d(ar1, ar2, assume_unique=False):
     return aux[flag[1:] & flag[:-1]]
 
 
-def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None):
+def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None,
+                     method='auto'):
     return (ar1, ar2)
 
 
 @array_function_dispatch(_in1d_dispatcher)
-def in1d(ar1, ar2, assume_unique=False, invert=False):
+def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'):
     """
     Test whether each element of a 1-D array is also present in a second array.
 
@@ -544,6 +545,19 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
         False where an element of `ar1` is in `ar2` and True otherwise).
         Default is False. ``np.in1d(a, b, invert=True)`` is equivalent
         to (but is faster than) ``np.invert(in1d(a, b))``.
+    method : {'auto', 'sort', 'dictionary'}, optional
+        The algorithm to use. This will not affect the final result,
+        but will affect the speed. Default is 'auto'.
+
+        - If 'sort', will use a sort-based approach.
+        - If 'dictionary', will use a key-dictionary approach similar
+          to a counting sort. This is only available for boolean and
+          integer arrays.
+        - If 'auto', will automatically choose the method which is
+          expected to perform the fastest, which depends
+          on the size and range of `ar2`. For larger sizes,
+          'dictionary' is chosen. For larger range or smaller
+          sizes, 'sort' is chosen.
 
         .. versionadded:: 1.8.0
 
@@ -593,6 +607,70 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     # Ensure that iteration through object arrays yields size-1 arrays
     if ar2.dtype == object:
         ar2 = ar2.reshape(-1, 1)
+    # Convert booleans to uint8 so we can use the fast integer algorithm
+    if ar1.dtype == np.bool_:
+        ar1 = ar1.view(np.uint8)
+    if ar2.dtype == np.bool_:
+        ar2 = ar2.view(np.uint8)
+
+    # Check if we can use a fast integer algorithm:
+    integer_arrays = (np.issubdtype(ar1.dtype, np.integer) and
+                      np.issubdtype(ar2.dtype, np.integer))
+
+    if method not in ['auto', 'sort', 'dictionary']:
+        raise ValueError(
+            "Invalid method: {0}. ".format(method)
+            + "Please use 'auto', 'sort' or 'dictionary'.")
+
+    if integer_arrays and method in ['auto', 'dictionary']:
+        ar2_min = np.min(ar2)
+        ar2_max = np.max(ar2)
+        ar2_size = ar2.size
+
+        # Check for integer overflow
+        with np.errstate(over='raise'):
+            try:
+                ar2_range = ar2_max - ar2_min
+
+                # Optimal performance is for approximately
+                # log10(size) > (log10(range) - 2.27) / 0.927.
+                # See discussion on
+                # https://github.com/numpy/numpy/pull/12065
+                optimal_parameters = (
+                        np.log10(ar2_size) >
+                        ((np.log10(ar2_range + 1.0) - 2.27) / 0.927)
+                    )
+            except FloatingPointError:
+                optimal_parameters = False
+
+        # Use the fast integer algorithm
+        if optimal_parameters or method == 'dictionary':
+
+            if invert:
+                outgoing_array = np.ones_like(ar1, dtype=np.bool_)
+            else:
+                outgoing_array = np.zeros_like(ar1, dtype=np.bool_)
+
+            # Make elements 1 where the integer exists in ar2
+            if invert:
+                isin_helper_ar = np.ones(ar2_range + 1, dtype=np.bool_)
+                isin_helper_ar[ar2 - ar2_min] = 0
+            else:
+                isin_helper_ar = np.zeros(ar2_range + 1, dtype=np.bool_)
+                isin_helper_ar[ar2 - ar2_min] = 1
+
+            # Mask out elements we know won't work
+            basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
+            outgoing_array[basic_mask] = isin_helper_ar[ar1[basic_mask] -
+                                                        ar2_min]
+
+            return outgoing_array
+    elif method == 'dictionary':
+        raise ValueError(
+            "'dictionary' method is only supported for boolean or "
+            "integer arrays. Please select 'sort' or 'auto' for the method."
+        )
+
 
     # Check if one of the arrays may contain arbitrary objects
     contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
@@ -637,12 +715,14 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
         return ret[rev_idx]
 
 
-def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None):
+def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None,
+                     method='auto'):
     return (element, test_elements)
 
 
 @array_function_dispatch(_isin_dispatcher)
-def isin(element, test_elements, assume_unique=False, invert=False):
+def isin(element, test_elements, assume_unique=False, invert=False,
+         method='auto'):
     """
     Calculates ``element in test_elements``, broadcasting over `element` only.
     Returns a boolean array of the same shape as `element` that is True
@@ -664,6 +744,19 @@ def isin(element, test_elements, assume_unique=False, invert=False):
         calculating `element not in test_elements`. Default is False.
         ``np.isin(a, b, invert=True)`` is equivalent to (but faster
         than) ``np.invert(np.isin(a, b))``.
+    method : {'auto', 'sort', 'dictionary'}, optional
+        The algorithm to use. This will not affect the final result,
+        but will affect the speed. Default is 'auto'.
+
+        - If 'sort', will use a sort-based approach.
+        - If 'dictionary', will use a key-dictionary approach similar
+          to a counting sort. This is only available for boolean and
+          integer arrays.
+        - If 'auto', will automatically choose the method which is
+          expected to perform the fastest, which depends
+          on the size and range of `test_elements`. For larger sizes,
+          'dictionary' is chosen. For larger range or smaller
+          sizes, 'sort' is chosen.
 
     Returns
     -------
@@ -737,7 +830,7 @@ def isin(element, test_elements, assume_unique=False, invert=False):
     """
     element = np.asarray(element)
     return in1d(element, test_elements, assume_unique=assume_unique,
-                invert=invert).reshape(element.shape)
+                invert=invert, method=method).reshape(element.shape)
 
 
 def _union1d_dispatcher(ar1, ar2):