From 1134be2ca463d3f5d788c4c7b1dcf3ef1f9e3d33 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Fri, 29 Apr 2022 17:58:53 -0500 Subject: [PATCH 1/6] SIMD: Use universal intrinsics to implement comparison functions --- .gitignore | 1 + benchmarks/benchmarks/bench_ufunc.py | 19 +- numpy/core/code_generators/generate_umath.py | 12 +- numpy/core/setup.py | 1 + numpy/core/src/_simd/_simd.dispatch.c.src | 9 + numpy/core/src/_simd/_simd_easyintrin.inc | 44 ++ numpy/core/src/common/simd/avx2/conversion.h | 30 ++ .../core/src/common/simd/avx512/conversion.h | 42 ++ numpy/core/src/common/simd/avx512/utils.h | 12 + numpy/core/src/common/simd/neon/conversion.h | 24 + numpy/core/src/common/simd/sse/conversion.h | 24 + numpy/core/src/common/simd/vsx/conversion.h | 23 + numpy/core/src/umath/loops.c.src | 53 +-- numpy/core/src/umath/loops.h.src | 83 +++- .../src/umath/loops_comparison.dispatch.c.src | 421 ++++++++++++++++++ numpy/core/src/umath/simd.inc.src | 144 ------ numpy/core/tests/test_simd.py | 34 ++ numpy/core/tests/test_umath.py | 76 ++++ 18 files changed, 852 insertions(+), 200 deletions(-) create mode 100644 numpy/core/src/umath/loops_comparison.dispatch.c.src diff --git a/.gitignore b/.gitignore index 632f13674da6..8dd4a5344ec5 100644 --- a/.gitignore +++ b/.gitignore @@ -225,6 +225,7 @@ numpy/core/src/umath/loops_exponent_log.dispatch.c numpy/core/src/umath/loops_umath_fp.dispatch.c numpy/core/src/umath/loops_hyperbolic.dispatch.c numpy/core/src/umath/loops_modulo.dispatch.c +numpy/core/src/umath/loops_comparison.dispatch.c # npysort module numpy/core/src/npysort/x86-qsort.dispatch.c numpy/core/src/npysort/x86-qsort.dispatch.*.cpp diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py index cfa29017d239..858dcccfcd84 100644 --- a/benchmarks/benchmarks/bench_ufunc.py +++ b/benchmarks/benchmarks/bench_ufunc.py @@ -170,8 +170,25 @@ def time_divide_scalar2(self, dtype): def time_divide_scalar2_inplace(self, dtype): np.divide(self.d, 1, out=self.d) + +class CustomComparison(Benchmark): + params = (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, + np.uint32, np.uint64, np.float32, np.float64, np.bool_) + param_names = ['dtype'] + + def setup(self, dtype): + self.x = np.ones(50000, dtype=dtype) + self.y = np.ones(50000, dtype=dtype) + self.s = np.ones(1, dtype=dtype) + + def time_less_than_binary(self, dtype): + (self.x < self.y) + + def time_less_than_scalar1(self, dtype): + (self.s < self.x) + def time_less_than_scalar2(self, dtype): - (self.d < 1) + (self.x < self.s) class CustomScalarFloorDivideInt(Benchmark): diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 266fccefb2c6..cc0e93d4350b 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -445,7 +445,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.greater'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -453,7 +453,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.greater_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -461,7 +461,7 @@ def 
english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.less'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -469,7 +469,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.less_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -477,7 +477,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -485,7 +485,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.not_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index dd60a00dbd76..8018a489f245 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1070,6 +1070,7 @@ def generate_umath_doc_header(ext, build_dir): join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'), join('src', 'umath', 'loops_modulo.dispatch.c.src'), + join('src', 'umath', 'loops_comparison.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), join('src', 'umath', 'clip.h'), diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index ab48db5b108d..ad8c538e4cfb 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -472,6 +472,10 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@) /**end repeat**/ +SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16) +SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32) +SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64, + vb64, vb64, vb64, vb64) //######################################################################### //## Attach module functions @@ -716,6 +720,11 @@ SIMD_INTRIN_DEF(not_@bsfx@) SIMD_INTRIN_DEF(tobits_@bsfx@) /**end repeat**/ +// Pack multiple vectors into one +SIMD_INTRIN_DEF(pack_b8_b16) +SIMD_INTRIN_DEF(pack_b8_b32) +SIMD_INTRIN_DEF(pack_b8_b64) + /************************************************************************/ {NULL, NULL, 0, NULL} }; // PyMethodDef diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc index 4521b2d87f07..f2e0da26ecef 100644 --- a/numpy/core/src/_simd/_simd_easyintrin.inc +++ b/numpy/core/src/_simd/_simd_easyintrin.inc @@ -153,6 +153,50 @@ return simd_arg_to_obj(&ret); \ } +#define SIMD_IMPL_INTRIN_8(NAME, RET, IN0, IN1, IN2, IN3, \ + IN4, IN5, IN6, IN7) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg arg1 = {.dtype = simd_data_##IN0}; \ + simd_arg arg2 = {.dtype = simd_data_##IN1}; \ + simd_arg arg3 = {.dtype = simd_data_##IN2}; \ + simd_arg arg4 = {.dtype = simd_data_##IN3}; \ + simd_arg arg5 = {.dtype = 
simd_data_##IN4}; \ + simd_arg arg6 = {.dtype = simd_data_##IN5}; \ + simd_arg arg7 = {.dtype = simd_data_##IN6}; \ + simd_arg arg8 = {.dtype = simd_data_##IN7}; \ + if (!PyArg_ParseTuple( \ + args, "O&O&O&O&O&O&O&O&:"NPY_TOSTRING(NAME), \ + simd_arg_converter, &arg1, \ + simd_arg_converter, &arg2, \ + simd_arg_converter, &arg3, \ + simd_arg_converter, &arg4, \ + simd_arg_converter, &arg5, \ + simd_arg_converter, &arg6, \ + simd_arg_converter, &arg7, \ + simd_arg_converter, &arg8 \ + )) return NULL; \ + simd_data data = {.RET = npyv_##NAME( \ + arg1.data.IN0, arg2.data.IN1, \ + arg3.data.IN2, arg4.data.IN3, \ + arg5.data.IN4, arg6.data.IN5, \ + arg7.data.IN6, arg8.data.IN7 \ + )}; \ + simd_arg_free(&arg1); \ + simd_arg_free(&arg2); \ + simd_arg_free(&arg3); \ + simd_arg_free(&arg4); \ + simd_arg_free(&arg5); \ + simd_arg_free(&arg6); \ + simd_arg_free(&arg7); \ + simd_arg_free(&arg8); \ + simd_arg ret = { \ + .data = data, .dtype = simd_data_##RET \ + }; \ + return simd_arg_to_obj(&ret); \ + } + /** * Helper macros for repeating and expand a certain macro. * Mainly used for converting a scalar to an immediate constant. diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h index 64e051686794..00ac0d38a31a 100644 --- a/numpy/core/src/common/simd/avx2/conversion.h +++ b/numpy/core/src/common/simd/avx2/conversion.h @@ -58,6 +58,36 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + __m256i ab = _mm256_packs_epi16(a, b); + return npyv256_shuffle_odd(ab); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + __m256i ab = _mm256_packs_epi32(a, b); + __m256i cd = _mm256_packs_epi32(c, d); + __m256i abcd = npyv_pack_b8_b16(ab, cd); + return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + __m256i ab = _mm256_packs_epi32(a, b); + __m256i cd = _mm256_packs_epi32(c, d); + __m256i ef = _mm256_packs_epi32(e, f); + __m256i gh = _mm256_packs_epi32(g, h); + __m256i abcd = _mm256_packs_epi32(ab, cd); + __m256i efgh = _mm256_packs_epi32(ef, gh); + __m256i all = npyv256_shuffle_odd(_mm256_packs_epi16(abcd, efgh)); + __m256i rev128 = _mm256_alignr_epi8(all, all, 8); + return _mm256_unpacklo_epi16(all, rev128); +} + // round to nearest integer (assuming even) #define npyv_round_s32_f32 _mm256_cvtps_epi32 NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h index 0bd44179b332..a2f56b2ae654 100644 --- a/numpy/core/src/common/simd/avx512/conversion.h +++ b/numpy/core/src/common/simd/avx512/conversion.h @@ -90,6 +90,48 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { +#ifdef NPY_HAVE_AVX512BW + return _mm512_kunpackd((__mmask64)b, (__mmask64)a); +#else + const __m512i idx = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + return _mm512_permutexvar_epi64(idx, npyv512_packs_epi16(a, b)); +#endif +} + +// pack four 32-bit boolean vectors into one 
8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { +#ifdef NPY_HAVE_AVX512BW + __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a); + __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c); + return npyv_pack_b8_b16(ab, cd); +#else + const __m512i idx = _mm512_setr_epi32( + 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15); + __m256i ta = npyv512_pack_lo_hi(npyv_cvt_u32_b32(a)); + __m256i tb = npyv512_pack_lo_hi(npyv_cvt_u32_b32(b)); + __m256i tc = npyv512_pack_lo_hi(npyv_cvt_u32_b32(c)); + __m256i td = npyv512_pack_lo_hi(npyv_cvt_u32_b32(d)); + __m256i ab = _mm256_packs_epi16(ta, tb); + __m256i cd = _mm256_packs_epi16(tc, td); + __m512i abcd = npyv512_combine_si256(ab, cd); + return _mm512_permutexvar_epi32(idx, abcd); +#endif +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + __mmask16 ab = _mm512_kunpackb((__mmask16)b, (__mmask16)a); + __mmask16 cd = _mm512_kunpackb((__mmask16)d, (__mmask16)c); + __mmask16 ef = _mm512_kunpackb((__mmask16)f, (__mmask16)e); + __mmask16 gh = _mm512_kunpackb((__mmask16)h, (__mmask16)g); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // convert boolean vectors to integer bitfield NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) { diff --git a/numpy/core/src/common/simd/avx512/utils.h b/numpy/core/src/common/simd/avx512/utils.h index c3079283f491..ced3bfef0ef9 100644 --- a/numpy/core/src/common/simd/avx512/utils.h +++ b/numpy/core/src/common/simd/avx512/utils.h @@ -87,4 +87,16 @@ )); \ } +#ifndef NPY_HAVE_AVX512BW + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv512_packs_epi16, _mm256_packs_epi16) +#else + #define npyv512_packs_epi16 _mm512_packs_epi16 +#endif + +NPY_FINLINE __m256i npyv512_pack_lo_hi(__m512i a) { + __m256i lo = npyv512_lower_si256(a); + __m256i hi = npyv512_higher_si256(a); + return _mm256_packs_epi32(lo, hi); +} + #endif // _NPY_SIMD_AVX512_UTILS_H diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h index 7487559d1c30..b6a50dc7a642 100644 --- a/numpy/core/src/common/simd/neon/conversion.h +++ b/numpy/core/src/common/simd/neon/conversion.h @@ -86,6 +86,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return vcombine_u8(vmovn_u16(a), vmovn_u16(b)); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 ab = vcombine_u16(vmovn_u32(a), vmovn_u32(b)); + npyv_b16 cd = vcombine_u16(vmovn_u32(c), vmovn_u32(d)); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 ab = vcombine_u32(vmovn_u64(a), vmovn_u64(b)); + npyv_b32 cd = vcombine_u32(vmovn_u64(c), vmovn_u64(d)); + npyv_b32 ef = vcombine_u32(vmovn_u64(e), vmovn_u64(f)); + npyv_b32 gh = vcombine_u32(vmovn_u64(g), vmovn_u64(h)); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // round to nearest integer #if NPY_SIMD_F64 #define npyv_round_s32_f32 vcvtnq_s32_f32 diff --git a/numpy/core/src/common/simd/sse/conversion.h 
b/numpy/core/src/common/simd/sse/conversion.h index ab7eb490727b..0811bf06ae4a 100644 --- a/numpy/core/src/common/simd/sse/conversion.h +++ b/numpy/core/src/common/simd/sse/conversion.h @@ -59,6 +59,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return _mm_packs_epi16(a, b); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 ab = _mm_packs_epi32(a, b); + npyv_b16 cd = _mm_packs_epi32(c, d); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 ab = _mm_packs_epi32(a, b); + npyv_b32 cd = _mm_packs_epi32(c, d); + npyv_b32 ef = _mm_packs_epi32(e, f); + npyv_b32 gh = _mm_packs_epi32(g, h); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // round to nearest integer (assuming even) #define npyv_round_s32_f32 _mm_cvtps_epi32 NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h index 36bea7bbaddf..a599f3950fe5 100644 --- a/numpy/core/src/common/simd/vsx/conversion.h +++ b/numpy/core/src/common/simd/vsx/conversion.h @@ -48,6 +48,29 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return vec_pack(a, b); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 ab = vec_pack(a, b); + npyv_b16 cd = vec_pack(c, d); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 ab = vec_pack(a, b); + npyv_b32 cd = vec_pack(c, d); + npyv_b32 ef = vec_pack(e, f); + npyv_b32 gh = vec_pack(g, h); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // convert boolean vector to integer bitfield NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 3a8a549131a2..9ae686399dbd 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -400,23 +400,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo ***************************************************************************** */ -/**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal# - * #OP = ==, !=, >, >=, <, <=# - **/ - -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - npy_bool in1 = *((npy_bool *)ip1) != 0; - npy_bool in2 = *((npy_bool *)ip2) != 0; - *((npy_bool *)op1)= in1 @OP@ in2; - } -} -/**end repeat**/ - - /**begin repeat * #kind = logical_and, logical_or# * #OP = &&, ||# @@ -688,9 +671,8 @@ void /**begin repeat2 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or# - * #OP = ==, !=, >, >=, <, <=, &&, ||# + * #kind 
= logical_and, logical_or# + * #OP = &&, ||# */ #if @CHK@ @@ -1408,19 +1390,16 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * * #C = F, , L# */ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - * #OP = ==, !=, <, <=, >, >=, &&, ||# + * #kind = logical_and, logical_or# + * #OP = &&, ||# */ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; } npy_clear_floatstatus_barrier((char*)dimensions); } @@ -1654,6 +1633,22 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps } /**end repeat**/ +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void +LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const npy_longdouble in1 = *(npy_longdouble *)ip1; + const npy_longdouble in2 = *(npy_longdouble *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + npy_clear_floatstatus_barrier((char*)dimensions); +} +/**end repeat**/ + NPY_NO_EXPORT void LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 694518ae0e20..5af9f1788758 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -28,9 +28,19 @@ ***************************************************************************** */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_comparison.dispatch.h" +#endif + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or, absolute, logical_not# + * #kind = logical_and, logical_or, absolute, logical_not# **/ NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -60,8 +70,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, BYTE, SHORT, INT, LONG, LONGLONG# */ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION @@ -72,14 +82,28 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, BYTE, SHORT, INT, LONG, LONGLONG# */ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divmod, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**begin repeat1 + * #kind = divmod, fmod, remainder# + 
*/ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_fmod, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_comparison.dispatch.h" +#endif - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_remainder, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ /**end repeat**/ /**begin repeat @@ -136,9 +160,8 @@ NPY_NO_EXPORT void /**end repeat3**/ /**begin repeat3 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or# - * #OP = ==, !=, >, >=, <, <=, &&, ||# + * #kind = logical_and, logical_or# + * #OP = &&, ||# */ NPY_NO_EXPORT void @S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -232,9 +255,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, /**end repeat1**/ /**end repeat**/ -/**end repeat1**/ -/**end repeat**/ - // SVML #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_umath_fp.dispatch.h" @@ -300,6 +320,21 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_comparison.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * Float types * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# @@ -307,7 +342,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( * #C = F, F, , L# */ - /**begin repeat1 * Arithmetic * # kind = add, subtract, multiply, divide# @@ -318,9 +352,8 @@ NPY_NO_EXPORT void /**end repeat1**/ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - * #OP = ==, !=, <, <=, >, >=, &&, ||# + * #kind = logical_and, logical_or# + * #OP = &&, ||# */ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -407,6 +440,16 @@ NPY_NO_EXPORT void @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ +/**begin repeat + * #TYPE = HALF, LONGDOUBLE# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ /* ***************************************************************************** diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src new file mode 100644 index 000000000000..e5518afb36f6 --- /dev/null +++ 
b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -0,0 +1,421 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 avx2 avx512f avx512_skx + ** vsx2 vsx3 + ** neon + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# + * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + */ + +#if @VECTOR@ +static void simd_binary_@kind@_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + // Unroll the loop to get a resultant vector with 'vsteps' elements. + for (; len >= vstep; + len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { +#if @len@ >= 8 + npyv_@sfx@ a1 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 0); + npyv_@sfx@ b1 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 0); + npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b1); +#if @len@ >= 16 + npyv_@sfx@ a2 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 1); + npyv_@sfx@ b2 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 1); + npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b2); +#if @len@ >= 32 + npyv_@sfx@ a3 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ b3 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ a4 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 3); + npyv_@sfx@ b4 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 3); + npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b3); + npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b4); +#if @len@ == 64 + npyv_@sfx@ a5 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ b5 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ a6 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ b6 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ a7 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ b7 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ a8 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 7); + npyv_@sfx@ b8 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 7); + npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b5); + npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b6); + npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b7); + npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b8); +#endif // @len@ >= 64 +#endif // @len@ >= 32 +#endif // @len@ >= 16 +#endif // @len@ >= 8 + + // Pack the 'c' vectors into a single vector 'r' +#if @len@ == 8 + npyv_u8 r = npyv_cvt_u8_b8(c1); +#elif @len@ == 16 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); +#elif @len@ == 32 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); +#elif @len@ == 64 + npyv_u8 r = + npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); +#endif + npyv_store_u8(dst, npyv_and_u8(r, truemask)); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst) { + const npyv_lanetype_@sfx@ a = *src1; + const npyv_lanetype_@sfx@ b = *src2; + *dst = a @OP@ b; + } +} + +static void simd_binary_scalar1_@kind@_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ 
scalar = *(npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_@sfx@ a = npyv_setall_@sfx@(scalar); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { +#if @len@ >= 8 + npyv_@sfx@ b1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); + npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a, b1); +#if @len@ >= 16 + npyv_@sfx@ b2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); + npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a, b2); +#if @len@ >= 32 + npyv_@sfx@ b3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ b4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); + npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a, b3); + npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a, b4); +#if @len@ == 64 + npyv_@sfx@ b5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ b6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ b7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ b8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); + npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a, b5); + npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a, b6); + npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a, b7); + npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a, b8); +#endif // @len@ >= 64 +#endif // @len@ >= 32 +#endif // @len@ >= 16 +#endif // @len@ >= 8 + +#if @len@ == 8 + npyv_u8 r = npyv_cvt_u8_b8(c1); +#elif @len@ == 16 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); +#elif @len@ == 32 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); +#elif @len@ == 64 + npyv_u8 r = + npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); +#endif + npyv_store_u8(dst, npyv_and_u8(r, truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ b = *src; + *dst = scalar @OP@ b; + } +} + +static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_@sfx@ b = npyv_setall_@sfx@(scalar); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { +#if @len@ >= 8 + npyv_@sfx@ a1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); + npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b); +#if @len@ >= 16 + npyv_@sfx@ a2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); + npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b); +#if @len@ >= 32 + npyv_@sfx@ a3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ a4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); + npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b); + npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b); +#if @len@ == 64 + npyv_@sfx@ a5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ a6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ a7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ a8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); + npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b); + npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b); + npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b); + npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b); +#endif // @len@ >= 64 +#endif // @len@ >= 32 +#endif // @len@ >= 16 +#endif // @len@ >= 8 + +#if @len@ == 8 + npyv_u8 r = npyv_cvt_u8_b8(c1); +#elif @len@ == 16 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); +#elif @len@ == 32 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, 
c3, c4)); +#elif @len@ == 64 + npyv_u8 r = + npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); +#endif + npyv_store_u8(dst, npyv_and_u8(r, truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ a = *src; + *dst = a @OP@ scalar; + } +} +#endif + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + */ + +#if NPY_SIMD +static void simd_binary_@kind@_b8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 truemask = npyv_setall_u8(0x1); + const npyv_u8 vzero = npyv_setall_u8(0x0); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; + len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { + // Whatever element in src != 0x0 is converted to 0xFF + npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero); + npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero); + npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), npyv_cvt_u8_b8(b)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst) { + const npyv_lanetype_u8 a = *src1 != 0; + const npyv_lanetype_u8 b = *src2 != 0; + *dst = a @OP@ b; + } +} + +static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) +{ + npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 vzero = npyv_setall_u8(0x0); + const npyv_u8 vscalar = npyv_setall_u8(scalar); + const npyv_u8 a = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero); + npyv_b8 c = npyv_@VOP@_u8(a, npyv_cvt_u8_b8(b)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_u8 b = *src != 0; + *dst = scalar @OP@ b; + } +} + +static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 vzero = npyv_setall_u8(0x0); + const npyv_u8 vscalar = npyv_setall_u8(scalar); + const npyv_u8 b = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero); + npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), b); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_u8 a = *src != 0; + *dst = a @OP@ scalar; + } +} +#endif +/**end repeat**/ + + +/**begin repeat + * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, + npy_ulonglong, npy_longlong, npy_float, npy_double# + * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + */ +static NPY_INLINE int 
+run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if @VECTOR@ + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); + return 1; + } + else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_@sfx@(args, dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!run_binary_simd_@kind@_b8(args, dimensions, steps)) { + BINARY_LOOP { + npy_bool in1 = *((npy_bool *)ip1) != 0; + npy_bool in2 = *((npy_bool *)ip2) != 0; + *((npy_bool *)op1)= in1 @OP@ in2; + } + } +} +/**end repeat**/ + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +/**begin repeat + * Signed and Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #if @signed@ + #define TO_SIMD_SFX(X) X##_s@len@ + #else + #define TO_SIMD_SFX(X) X##_u@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + } +} +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * Float types + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!run_binary_simd_@kind@_@sfx@(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + } + 
npy_clear_floatstatus_barrier((char*)dimensions); +} +/**end repeat1**/ +/**end repeat**/ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index b477027b3c8a..d6c9a7e65385 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -158,55 +158,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp /**end repeat1**/ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - * #simd = 1, 1, 1, 1, 1, 1, 0, 0# - */ - -#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS - -/* prototypes */ -static void -sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, - npy_intp n); - -#endif - -static NPY_INLINE int -run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - npy_bool * op = (npy_bool *)args[2]; - npy_intp n = dimensions[0]; - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - /**begin repeat1 * #kind = isnan, isfinite, isinf, signbit# */ @@ -476,101 +427,6 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) /**end repeat1**/ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# -*/ - -/* sets invalid fpu flag on QNaN for consistency with packed compare */ -NPY_FINLINE int -sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b) -{ - @vtype@ one = @vpre@_set1_@vsuf@(1); - @type@ tmp; - @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a), - @vpre@_load_@vsufs@(&b)); - v = @vpre@_and_@vsuf@(v, one); - @vpre@_store_@vsufs@(&tmp, v); - return tmp; -} - -static void -sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]); - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, 
a2); - @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2); - @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2); - @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2); - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]); - } -} - - -static void -sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]); - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a); - @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b); - @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c); - @vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d); - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]); - } -} - - -static void -sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]); - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s); - @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s); - @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s); - @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s); - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]); - } -} -/**end repeat1**/ - - static void sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) { diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index e4b5e0c8f474..b0b62b4a2e17 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -156,6 +156,40 @@ def test_tobits(self): tobits = bin(self.tobits(vdata)) assert tobits == bin(data_bits) + def test_pack(self): + """ + Pack multiple vectors into one + Test intrinsics: + npyv_pack_b8_b16 + npyv_pack_b8_b32 + npyv_pack_b8_b64 + """ + if self.sfx not in ("b16", "b32", "b64"): + return + + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + + pack_simd = getattr(self.npyv, f"pack_b8_{self.sfx}") + + # for scalar execution, concatenate the elements of the multiple lists + # into a single list (spack) and then iterate over the elements of + # the created list applying a mask to capture the first byte of them. 
+ if self.sfx == "b16": + spack = [(i & 0xFF) for i in (list(rdata) + list(data))] + vpack = pack_simd(vrdata, vdata) + elif self.sfx == "b32": + spack = [(i & 0xFF) for i in (2*list(rdata) + 2*list(data))] + vpack = pack_simd(vrdata, vrdata, vdata, vdata) + elif self.sfx == "b64": + spack = [(i & 0xFF) for i in (4*list(rdata) + 4*list(data))] + vpack = pack_simd(vrdata, vrdata, vrdata, vrdata, + vdata, vdata, vdata, vdata) + assert vpack == spack + class _SIMD_INT(_Test_Utility): """ To test all integer vector types at once diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index dd0bb88fff73..01ef9365e7de 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -185,6 +185,82 @@ def __array_wrap__(self, arr, context): class TestComparisons: + @pytest.mark.parametrize('dtype', np.sctypes['uint'] + np.sctypes['int'] + + np.sctypes['float'] + [np.bool_]) + def test_comparison_functions(self, dtype): + # Initialize input arrays + if dtype == np.bool_: + a = np.random.choice(a=[False, True], size=1000) + b = np.random.choice(a=[False, True], size=1000) + scalar = True + else: + a = np.random.randint(low=1, high=10, size=1000).astype(dtype) + b = np.random.randint(low=1, high=10, size=1000).astype(dtype) + scalar = 5 + scalar_np = np.dtype(dtype).type(scalar) + a_lst = a.tolist() + b_lst = b.tolist() + + # (Binary) Comparison (x1=array, x2=array) + lt_b = np.less(a, b) + le_b = np.less_equal(a, b) + gt_b = np.greater(a, b) + ge_b = np.greater_equal(a, b) + eq_b = np.equal(a, b) + ne_b = np.not_equal(a, b) + lt_b_lst = [x < y for x, y in zip(a_lst, b_lst)] + le_b_lst = [x <= y for x, y in zip(a_lst, b_lst)] + gt_b_lst = [x > y for x, y in zip(a_lst, b_lst)] + ge_b_lst = [x >= y for x, y in zip(a_lst, b_lst)] + eq_b_lst = [x == y for x, y in zip(a_lst, b_lst)] + ne_b_lst = [x != y for x, y in zip(a_lst, b_lst)] + + # (Scalar1) Comparison (x1=scalar, x2=array) + lt_s1 = np.less(scalar_np, b) + le_s1 = np.less_equal(scalar_np, b) + gt_s1 = np.greater(scalar_np, b) + ge_s1 = np.greater_equal(scalar_np, b) + eq_s1 = np.equal(scalar_np, b) + ne_s1 = np.not_equal(scalar_np, b) + lt_s1_lst = [scalar < x for x in b_lst] + le_s1_lst = [scalar <= x for x in b_lst] + gt_s1_lst = [scalar > x for x in b_lst] + ge_s1_lst = [scalar >= x for x in b_lst] + eq_s1_lst = [scalar == x for x in b_lst] + ne_s1_lst = [scalar != x for x in b_lst] + + # (Scalar2) Comparison (x1=array, x2=scalar) + lt_s2 = np.less(a, scalar_np) + le_s2 = np.less_equal(a, scalar_np) + gt_s2 = np.greater(a, scalar_np) + ge_s2 = np.greater_equal(a, scalar_np) + eq_s2 = np.equal(a, scalar_np) + ne_s2 = np.not_equal(a, scalar_np) + lt_s2_lst = [x < scalar for x in a_lst] + le_s2_lst = [x <= scalar for x in a_lst] + gt_s2_lst = [x > scalar for x in a_lst] + ge_s2_lst = [x >= scalar for x in a_lst] + eq_s2_lst = [x == scalar for x in a_lst] + ne_s2_lst = [x != scalar for x in a_lst] + + # Compare comparison functions (Python vs NumPy) using native Python + def compare(lt, le, gt, ge, eq, ne, lt_lst, le_lst, gt_lst, ge_lst, + eq_lst, ne_lst): + assert_(lt.tolist() == lt_lst, "Comparison function check (lt)") + assert_(le.tolist() == le_lst, "Comparison function check (le)") + assert_(gt.tolist() == gt_lst, "Comparison function check (gt)") + assert_(ge.tolist() == ge_lst, "Comparison function check (ge)") + assert_(eq.tolist() == eq_lst, "Comparison function check (eq)") + assert_(ne.tolist() == ne_lst, "Comparison function check (ne)") + + # Sequence: Binary, Scalar1 and Scalar2 + 
compare(lt_b, le_b, gt_b, ge_b, eq_b, ne_b, lt_b_lst, le_b_lst, + gt_b_lst, ge_b_lst, eq_b_lst, ne_b_lst) + compare(lt_s1, le_s1, gt_s1, ge_s1, eq_s1, ne_s1, lt_s1_lst, le_s1_lst, + gt_s1_lst, ge_s1_lst, eq_s1_lst, ne_s1_lst) + compare(lt_s2, le_s2, gt_s2, ge_s2, eq_s2, ne_s2, lt_s2_lst, le_s2_lst, + gt_s2_lst, ge_s2_lst, eq_s2_lst, ne_s2_lst) + def test_ignore_object_identity_in_equal(self): # Check comparing identical objects whose comparison # is not a simple boolean, e.g., arrays that are compared elementwise. From ff03e72646ee0e4727a1cc333d5ea4945724d44b Mon Sep 17 00:00:00 2001 From: Rafael CF Sousa Date: Mon, 23 May 2022 16:19:47 -0300 Subject: [PATCH 2/6] TST: Rewrite the test that checks the universal SIMD intrinsic (pack) --- numpy/core/tests/test_umath.py | 78 +++++++++++----------------------- 1 file changed, 24 insertions(+), 54 deletions(-) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 01ef9365e7de..7b6e2ee92276 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -185,9 +185,19 @@ def __array_wrap__(self, arr, context): class TestComparisons: + import operator + @pytest.mark.parametrize('dtype', np.sctypes['uint'] + np.sctypes['int'] + np.sctypes['float'] + [np.bool_]) - def test_comparison_functions(self, dtype): + @pytest.mark.parametrize('py_comp,np_comp', [ + (operator.lt, np.less), + (operator.le, np.less_equal), + (operator.gt, np.greater), + (operator.ge, np.greater_equal), + (operator.eq, np.equal), + (operator.ne, np.not_equal) + ]) + def test_comparison_functions(self, dtype, py_comp, np_comp): # Initialize input arrays if dtype == np.bool_: a = np.random.choice(a=[False, True], size=1000) @@ -197,69 +207,29 @@ def test_comparison_functions(self, dtype): a = np.random.randint(low=1, high=10, size=1000).astype(dtype) b = np.random.randint(low=1, high=10, size=1000).astype(dtype) scalar = 5 - scalar_np = np.dtype(dtype).type(scalar) + np_scalar = np.dtype(dtype).type(scalar) a_lst = a.tolist() b_lst = b.tolist() # (Binary) Comparison (x1=array, x2=array) - lt_b = np.less(a, b) - le_b = np.less_equal(a, b) - gt_b = np.greater(a, b) - ge_b = np.greater_equal(a, b) - eq_b = np.equal(a, b) - ne_b = np.not_equal(a, b) - lt_b_lst = [x < y for x, y in zip(a_lst, b_lst)] - le_b_lst = [x <= y for x, y in zip(a_lst, b_lst)] - gt_b_lst = [x > y for x, y in zip(a_lst, b_lst)] - ge_b_lst = [x >= y for x, y in zip(a_lst, b_lst)] - eq_b_lst = [x == y for x, y in zip(a_lst, b_lst)] - ne_b_lst = [x != y for x, y in zip(a_lst, b_lst)] + comp_b = np_comp(a, b) + comp_b_list = [py_comp(x, y) for x, y in zip(a_lst, b_lst)] # (Scalar1) Comparison (x1=scalar, x2=array) - lt_s1 = np.less(scalar_np, b) - le_s1 = np.less_equal(scalar_np, b) - gt_s1 = np.greater(scalar_np, b) - ge_s1 = np.greater_equal(scalar_np, b) - eq_s1 = np.equal(scalar_np, b) - ne_s1 = np.not_equal(scalar_np, b) - lt_s1_lst = [scalar < x for x in b_lst] - le_s1_lst = [scalar <= x for x in b_lst] - gt_s1_lst = [scalar > x for x in b_lst] - ge_s1_lst = [scalar >= x for x in b_lst] - eq_s1_lst = [scalar == x for x in b_lst] - ne_s1_lst = [scalar != x for x in b_lst] + comp_s1 = np_comp(np_scalar, b) + comp_s1_list = [py_comp(scalar, x) for x in b_lst] # (Scalar2) Comparison (x1=array, x2=scalar) - lt_s2 = np.less(a, scalar_np) - le_s2 = np.less_equal(a, scalar_np) - gt_s2 = np.greater(a, scalar_np) - ge_s2 = np.greater_equal(a, scalar_np) - eq_s2 = np.equal(a, scalar_np) - ne_s2 = np.not_equal(a, scalar_np) - lt_s2_lst = [x < scalar for x in a_lst] - 
le_s2_lst = [x <= scalar for x in a_lst] - gt_s2_lst = [x > scalar for x in a_lst] - ge_s2_lst = [x >= scalar for x in a_lst] - eq_s2_lst = [x == scalar for x in a_lst] - ne_s2_lst = [x != scalar for x in a_lst] - - # Compare comparison functions (Python vs NumPy) using native Python - def compare(lt, le, gt, ge, eq, ne, lt_lst, le_lst, gt_lst, ge_lst, - eq_lst, ne_lst): - assert_(lt.tolist() == lt_lst, "Comparison function check (lt)") - assert_(le.tolist() == le_lst, "Comparison function check (le)") - assert_(gt.tolist() == gt_lst, "Comparison function check (gt)") - assert_(ge.tolist() == ge_lst, "Comparison function check (ge)") - assert_(eq.tolist() == eq_lst, "Comparison function check (eq)") - assert_(ne.tolist() == ne_lst, "Comparison function check (ne)") + comp_s2 = np_comp(a, np_scalar) + comp_s2_list = [py_comp(x, scalar) for x in a_lst] # Sequence: Binary, Scalar1 and Scalar2 - compare(lt_b, le_b, gt_b, ge_b, eq_b, ne_b, lt_b_lst, le_b_lst, - gt_b_lst, ge_b_lst, eq_b_lst, ne_b_lst) - compare(lt_s1, le_s1, gt_s1, ge_s1, eq_s1, ne_s1, lt_s1_lst, le_s1_lst, - gt_s1_lst, ge_s1_lst, eq_s1_lst, ne_s1_lst) - compare(lt_s2, le_s2, gt_s2, ge_s2, eq_s2, ne_s2, lt_s2_lst, le_s2_lst, - gt_s2_lst, ge_s2_lst, eq_s2_lst, ne_s2_lst) + assert_(comp_b.tolist() == comp_b_list, + f"Failed comparision ({py_comp.__name__})") + assert_(comp_s1.tolist() == comp_s1_list, + f"Failed comparision ({py_comp.__name__})") + assert_(comp_s2.tolist() == comp_s2_list, + f"Failed comparision ({py_comp.__name__})") def test_ignore_object_identity_in_equal(self): # Check comparing identical objects whose comparison From 09b22a118466ff85fe365f451139cc3da2e8bc43 Mon Sep 17 00:00:00 2001 From: Rafael CF Sousa Date: Mon, 23 May 2022 16:20:47 -0300 Subject: [PATCH 3/6] SIMD, ENH: Use logical bitwise to implement comparison functions (bool_) --- numpy/core/src/_simd/_simd.dispatch.c.src | 6 +++ numpy/core/src/common/simd/avx2/operators.h | 5 +++ numpy/core/src/common/simd/avx512/operators.h | 12 +++++ numpy/core/src/common/simd/neon/operators.h | 5 +++ numpy/core/src/common/simd/sse/operators.h | 5 +++ numpy/core/src/common/simd/vsx/operators.h | 5 +++ .../src/umath/loops_comparison.dispatch.c.src | 25 ++++++++--- numpy/core/tests/test_simd.py | 45 +++++++++++++++++-- 8 files changed, 99 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index ad8c538e4cfb..f8a0a3196c9d 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -462,6 +462,9 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) /**end repeat**/ +SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8) +SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8) +SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8) /*************************** * Conversions ***************************/ @@ -710,6 +713,9 @@ SIMD_INTRIN_DEF(or_@bsfx@) SIMD_INTRIN_DEF(xor_@bsfx@) SIMD_INTRIN_DEF(not_@bsfx@) /**end repeat**/ +SIMD_INTRIN_DEF(andc_b8) +SIMD_INTRIN_DEF(orc_b8) +SIMD_INTRIN_DEF(xnor_b8) /*************************** * Conversions ***************************/ diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 5fc7719e916d..0e77fc6bef99 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -114,6 +114,11 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c) 
#define npyv_not_b32 npyv_not_u8 #define npyv_not_b64 npyv_not_u8 +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) +#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index d53932fa8726..8c98b72ddd5a 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -152,6 +152,9 @@ #define npyv_xor_b16 _kxor_mask32 #define npyv_not_b8 _knot_mask64 #define npyv_not_b16 _knot_mask32 + #define npyv_andc_b8 _kandn_mask64 + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_xnor_b8 _kxnor_mask64 #elif defined(NPY_HAVE_AVX512BW) NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b) { return a & b; } @@ -169,6 +172,12 @@ { return ~a; } NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a) { return ~a; } + NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b) + { return (~a) & b; } + NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b) + { return (~a) | b; } + NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b) + { return ~(a ^ b); } #else #define npyv_and_b8 _mm512_and_si512 #define npyv_and_b16 _mm512_and_si512 @@ -178,6 +187,9 @@ #define npyv_xor_b16 _mm512_xor_si512 #define npyv_not_b8 npyv_not_u8 #define npyv_not_b16 npyv_not_u8 + #define npyv_andc_b8 _mm512_andnot_si512 + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) #endif #define npyv_and_b32 _mm512_kand diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index b43ba36537e9..6c155fc67bc1 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -116,6 +116,11 @@ #define npyv_not_b32 vmvnq_u32 #define npyv_not_b64 npyv_not_u64 +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) vbicq_u8(B, A) +#define npyv_orc_b8(A, B) vornq_u8(B, A) +#define npyv_xnor_b8 vceqq_u8 + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 51c84fb4e9d9..51bdca356988 100644 --- a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -115,6 +115,11 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) #define npyv_not_b32 npyv_not_u8 #define npyv_not_b64 npyv_not_u8 +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) +#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h index d34057ff3f38..fc29ba920905 100644 --- a/numpy/core/src/common/simd/vsx/operators.h +++ b/numpy/core/src/common/simd/vsx/operators.h @@ -133,6 +133,11 @@ NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) { return vec_nor(a, a); } +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) vec_andc(B, A) +#define npyv_orc_b8(A, B) vec_orc(B, A) +#define npyv_xnor_b8 vec_eqv + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src 
b/numpy/core/src/umath/loops_comparison.dispatch.c.src index e5518afb36f6..07bbf035454e 100644 --- a/numpy/core/src/umath/loops_comparison.dispatch.c.src +++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -207,7 +207,8 @@ static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) /**begin repeat * #kind = equal, not_equal, less, less_equal, greater, greater_equal# * #OP = ==, !=, <, <=, >, >=# - * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + * #VOP = xnor, xor, andc, orc, andc, orc# + * #rev = 0, 0, 0, 0, 1, 1# */ #if NPY_SIMD @@ -225,7 +226,11 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len) // Whatever element in src != 0x0 is converted to 0xFF npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero); npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero); - npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), npyv_cvt_u8_b8(b)); +#if !@rev@ + npyv_b8 c = npyv_@VOP@_b8(a, b); +#else + npyv_b8 c = npyv_@VOP@_b8(b, a); +#endif npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } @@ -243,13 +248,17 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_u8 a = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_b8 a = npyv_cmpneq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_u8(a, npyv_cvt_u8_b8(b)); +#if !@rev@ + npyv_b8 c = npyv_@VOP@_b8(a, b); +#else + npyv_b8 c = npyv_@VOP@_b8(b, a); +#endif npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } @@ -266,13 +275,17 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_u8 b = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_b8 b = npyv_cmpneq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), b); +#if !@rev@ + npyv_b8 c = npyv_@VOP@_b8(a, b); +#else + npyv_b8 c = npyv_@VOP@_b8(b, a); +#endif npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index b0b62b4a2e17..81d759ad42ea 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -156,6 +156,48 @@ def test_tobits(self): tobits = bin(self.tobits(vdata)) assert tobits == bin(data_bits) + def test_andc(self): + if self.sfx not in ("b8"): + return + andc_simd = getattr(self.npyv, f"andc_b8") + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + # check andc + sandc = [(~x & y) & 0xFF for x, y in zip(rdata, data)] + vandc = andc_simd(vrdata, vdata) + assert sandc == vandc + + def test_orc(self): + if self.sfx not in ("b8"): + return + orc_simd = getattr(self.npyv, f"orc_b8") + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + # check orc + sorc = [(~x | y) & 0xFF for x, y in zip(rdata, 
data)] + vorc = orc_simd(vrdata, vdata) + assert sorc == vorc + + def test_xnor(self): + if self.sfx not in ("b8"): + return + xnor_simd = getattr(self.npyv, f"xnor_b8") + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + # check orc + sxnor = [~(x ^ y) & 0xFF for x, y in zip(rdata, data)] + vxnor = xnor_simd(vrdata, vdata) + assert sxnor == vxnor + def test_pack(self): """ Pack multiple vectors into one @@ -166,15 +208,12 @@ def test_pack(self): """ if self.sfx not in ("b16", "b32", "b64"): return - # create the vectors data = self._data() rdata = self._data(reverse=True) vdata = self._load_b(data) vrdata = self._load_b(rdata) - pack_simd = getattr(self.npyv, f"pack_b8_{self.sfx}") - # for scalar execution, concatenate the elements of the multiple lists # into a single list (spack) and then iterate over the elements of # the created list applying a mask to capture the first byte of them. From d5d6eb567ec228bcc65da184240349239db4f080 Mon Sep 17 00:00:00 2001 From: Rafael CF Sousa Date: Fri, 27 May 2022 10:17:29 -0300 Subject: [PATCH 4/6] SIMD, ENH: Add universal intrinsic andc8 and use it to remove ifneq This commit also applies some techniques to reduce the size of the binary generated from the source loops_comparison.dispatch.c.src --- numpy/core/src/_simd/_simd.dispatch.c.src | 22 +- numpy/core/src/common/simd/avx2/operators.h | 7 +- .../core/src/common/simd/avx512/conversion.h | 4 +- numpy/core/src/common/simd/avx512/operators.h | 15 +- numpy/core/src/common/simd/neon/operators.h | 5 +- numpy/core/src/common/simd/sse/operators.h | 7 +- numpy/core/src/common/simd/vsx/operators.h | 5 +- .../src/umath/loops_comparison.dispatch.c.src | 201 ++++++++++-------- 8 files changed, 150 insertions(+), 116 deletions(-) diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index f8a0a3196c9d..0f3e4fc8f8da 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -31,6 +31,7 @@ * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ #if @simd_sup@ /*************************** @@ -332,6 +333,13 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@) SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) /**end repeat1**/ +#if @bitw8b_sup@ +SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_2(andc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) +SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) +SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) +#endif + /*************************** * Conversion ***************************/ @@ -462,9 +470,6 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) /**end repeat**/ -SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8) -SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8) -SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8) /*************************** * Conversions ***************************/ @@ -503,6 +508,7 @@ static PyMethodDef simd__intrinsics_methods[] = { * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ #if @simd_sup@ @@ -584,6 +590,13 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) 
SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ +#if @bitw8b_sup@ +SIMD_INTRIN_DEF(andc_@sfx@) +SIMD_INTRIN_DEF(andc_@bsfx@) +SIMD_INTRIN_DEF(orc_@bsfx@) +SIMD_INTRIN_DEF(xnor_@bsfx@) +#endif + /*************************** * Conversion ***************************/ @@ -713,9 +726,6 @@ SIMD_INTRIN_DEF(or_@bsfx@) SIMD_INTRIN_DEF(xor_@bsfx@) SIMD_INTRIN_DEF(not_@bsfx@) /**end repeat**/ -SIMD_INTRIN_DEF(andc_b8) -SIMD_INTRIN_DEF(orc_b8) -SIMD_INTRIN_DEF(xnor_b8) /*************************** * Conversions ***************************/ diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 0e77fc6bef99..99ef76dcb1dc 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -115,9 +115,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c) #define npyv_not_b64 npyv_not_u8 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B) -#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) -#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) +#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A) +#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) +#define npyv_xnor_b8 _mm256_cmpeq_epi8 /*************************** * Comparison diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h index a2f56b2ae654..474aee446b6a 100644 --- a/numpy/core/src/common/simd/avx512/conversion.h +++ b/numpy/core/src/common/simd/avx512/conversion.h @@ -104,8 +104,8 @@ NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { #ifdef NPY_HAVE_AVX512BW - __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a); - __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c); + __mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a); + __mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c); return npyv_pack_b8_b16(ab, cd); #else const __m512i idx = _mm512_setr_epi32( diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index 8c98b72ddd5a..b856b345ae97 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -140,6 +140,9 @@ #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A))) #endif +// ANDC +#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A) + /*************************** * Logical (boolean) ***************************/ @@ -152,8 +155,8 @@ #define npyv_xor_b16 _kxor_mask32 #define npyv_not_b8 _knot_mask64 #define npyv_not_b16 _knot_mask32 - #define npyv_andc_b8 _kandn_mask64 - #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_andc_b8(A, B) _kandn_mask64(B, A) + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) #define npyv_xnor_b8 _kxnor_mask64 #elif defined(NPY_HAVE_AVX512BW) NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b) @@ -173,9 +176,9 @@ NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a) { return ~a; } NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b) - { return (~a) & b; } + { return a & (~b); } NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b) - { return (~a) | b; } + { return a | (~b); } NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b) { return ~(a ^ b); } #else @@ -187,8 +190,8 @@ #define npyv_xor_b16 _mm512_xor_si512 #define npyv_not_b8 npyv_not_u8 #define npyv_not_b16 
npyv_not_u8 - #define npyv_andc_b8 _mm512_andnot_si512 - #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A) + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) #endif diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index 6c155fc67bc1..a08fa5390d03 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -117,8 +117,9 @@ #define npyv_not_b64 npyv_not_u64 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) vbicq_u8(B, A) -#define npyv_orc_b8(A, B) vornq_u8(B, A) +#define npyv_andc_u8 vbicq_u8 +#define npyv_andc_b8 vbicq_u8 +#define npyv_orc_b8 vornq_u8 #define npyv_xnor_b8 vceqq_u8 /*************************** diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 51bdca356988..86dbcfea5eca 100644 --- a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -116,9 +116,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) #define npyv_not_b64 npyv_not_u8 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B) -#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) -#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) +#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A) +#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) +#define npyv_xnor_b8 _mm_cmpeq_epi8 /*************************** * Comparison diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h index fc29ba920905..b01d8532159b 100644 --- a/numpy/core/src/common/simd/vsx/operators.h +++ b/numpy/core/src/common/simd/vsx/operators.h @@ -134,8 +134,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) { return vec_nor(a, a); } // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) vec_andc(B, A) -#define npyv_orc_b8(A, B) vec_orc(B, A) +#define npyv_andc_u8 vec_andc +#define npyv_andc_b8 vec_andc +#define npyv_orc_b8 vec_orc #define npyv_xnor_b8 vec_eqv /*************************** diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src index 07bbf035454e..01d58fbf9c92 100644 --- a/numpy/core/src/umath/loops_comparison.dispatch.c.src +++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** sse2 sse41 avx2 avx512f avx512_skx + ** sse2 sse42 avx2 avx512f avx512_skx ** vsx2 vsx3 ** neon **/ @@ -15,18 +15,23 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# + * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + * #kind = equal, not_equal, less, less_equal# + * #eq = 1, 0, 0, 0# + * #neq = 0, 1, 0, 0# + * #OP = ==, !=, <, <=# + * #VOP = cmpeq, cmpneq, cmplt, cmple# */ - -#if @VECTOR@ +#if @VECTOR@ && !((@eq@ || @neq@) && @signed@) static void 
simd_binary_@kind@_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; @@ -205,10 +210,11 @@ static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) /**end repeat**/ /**begin repeat - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - * #VOP = xnor, xor, andc, orc, andc, orc# - * #rev = 0, 0, 0, 0, 1, 1# + * #kind = equal, not_equal, less, less_equal# + * #eq = 1, 0, 0, 0# + * #neq = 0, 1, 0, 0# + * #OP = ==, !=, <, <=# + * #VOP = xnor, xor, andc, orc# */ #if NPY_SIMD @@ -224,14 +230,10 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len) for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { // Whatever element in src != 0x0 is converted to 0xFF - npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero); - npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero); -#if !@rev@ + npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero); + npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); -#else - npyv_b8 c = npyv_@VOP@_b8(b, a); -#endif - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src1, ++src2, ++dst) { @@ -248,18 +250,14 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 a = npyv_cmpneq_u8(vscalar, vzero); + const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero); -#if !@rev@ + npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); -#else - npyv_b8 c = npyv_@VOP@_b8(b, a); -#endif - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -275,18 +273,14 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 b = npyv_cmpneq_u8(vscalar, vzero); + const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero); -#if !@rev@ + npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); -#else - npyv_b8 c = npyv_@VOP@_b8(b, a); -#endif - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -297,73 +291,73 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) #endif /**end repeat**/ - /**begin repeat * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, npy_ulonglong, npy_longlong, npy_float, npy_double# * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# + * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = 
equal, not_equal, less, less_equal, greater, greater_equal# + * #kind = equal, not_equal, less, less_equal# + * #eq = 1, 0, 0, 0# + * #neq = 0, 1, 0, 0# + * #OP = ==, !=, <, <=# */ -static NPY_INLINE int +#if !((@eq@ || @neq@) && @signed@) +static NPY_INLINE void run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps) { #if @VECTOR@ /* argument one scalar */ if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); - return 1; + return; } /* argument two scalar */ else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); - return 1; + return; } else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { simd_binary_@kind@_@sfx@(args, dimensions[0]); - return 1; + return; } #endif - return 0; -} -/**end repeat1**/ -/**end repeat**/ -/* - ***************************************************************************** - ** BOOLEAN LOOPS ** - ***************************************************************************** - */ - -/**begin repeat - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (!run_binary_simd_@kind@_b8(args, dimensions, steps)) { - BINARY_LOOP { - npy_bool in1 = *((npy_bool *)ip1) != 0; - npy_bool in2 = *((npy_bool *)ip2) != 0; - *((npy_bool *)op1)= in1 @OP@ in2; - } + BINARY_LOOP { +#if @bool@ + npy_bool in1 = *((npy_bool *)ip1) != 0; + npy_bool in2 = *((npy_bool *)ip2) != 0; +#else + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; +#endif + *((npy_bool *)op1) = in1 @OP@ in2; } } +#endif +/**end repeat1**/ /**end repeat**/ +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ + /* - ***************************************************************************** - ** INTEGER LOOPS - ***************************************************************************** + * In order to reduce the size of the binary generated from this source, the + * following rules are applied: 1) each data type implements its function + * 'greater' as a call to the function 'less' but with the arguments swapped, + * the same applies to the function 'greater_equal', which is implemented + * with a call to the function 'less_equal', and 2) for the integer datatypes + * of the same size (eg 8-bit), a single kernel of the functions 'equal' and + * 'not_equal' is used to implement both signed and unsigned types. 
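+ * For example, @TYPE@_greater(args) is evaluated by swapping args[0]/args[1]
+ * (and steps[0]/steps[1]) and then calling the 'less' kernel, as done by the
+ * dispatch functions below.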
*/ /**begin repeat * Signed and Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_byte, npy_short, npy_int, npy_long, npy_longlong# * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, * BYTE, SHORT, INT, LONG, LONGLONG# * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG, @@ -371,11 +365,13 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1# */ #undef TO_SIMD_SFX +#undef TO_SIMD_UTYPE #if 0 /**begin repeat1 * #len = 8, 16, 32, 64# */ #elif NPY_BITSOF_@STYPE@ == @len@ + #define TO_SIMD_UTYPE(X) X##_u@len@ #if @signed@ #define TO_SIMD_SFX(X) X##_s@len@ #else @@ -385,50 +381,71 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) #endif /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# + * #kind = greater, greater_equal# + * #kind_to = less, less_equal# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } + char *nargs[3] = {args[1], args[0], args[2]}; + npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; + TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps); } /**end repeat1**/ -/**end repeat**/ -/* - ***************************************************************************** - ** FLOAT LOOPS ** - ***************************************************************************** +/**begin repeat1 + * #kind = less, less_equal# */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps); +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps); +} +/**end repeat1**/ +/**end repeat**/ /**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# + * Boolean & Float types + * #TYPE = BOOL, FLOAT, DOUBLE# + * #sfx = b8, f32, f64# + * #fp = 0, 1, 1# */ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# + * #kind = greater, greater_equal# + * #kind_to = less, less_equal# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_binary_simd_@kind@_@sfx@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } + char *nargs[3] = {args[1], args[0], args[2]}; + npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; + run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps); +#if @fp@ npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + run_binary_simd_@kind@_@sfx@(args, 
dimensions, steps);
+#if @fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 /**end repeat1**/
 /**end repeat**/

From dc4a9e39dfb13aecb61d955445538838fbb2233d Mon Sep 17 00:00:00 2001
From: Rafael CF Sousa
Date: Fri, 27 May 2022 12:34:43 -0300
Subject: [PATCH 5/6] TST: Add test for andc (u8)

This commit also rewrites the tests andc, orc and xnor
---
 numpy/core/tests/test_simd.py | 67 +++++++++++++----------------------
 1 file changed, 24 insertions(+), 43 deletions(-)

diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 81d759ad42ea..f33db95fc415 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -126,7 +126,8 @@ def test_operators_logical(self):
         """
         Logical operations for boolean types.
         Test intrinsics:
-            npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX
+            npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX,
+            npyv_andc_b8, npyv_orc_b8, npyv_xnor_b8
         """
         data_a = self._data()
         data_b = self._data(reverse=True)
@@ -148,6 +149,22 @@ def test_operators_logical(self):
         vnot = getattr(self, "not")(vdata_a)
         assert vnot == data_b
 
+        # among the boolean types, andc, orc and xnor only support b8
+        if self.sfx not in ("b8"):
+            return
+
+        data_andc = [(a & ~b) & 0xFF for a, b in zip(data_a, data_b)]
+        vandc = getattr(self, "andc")(vdata_a, vdata_b)
+        assert data_andc == vandc
+
+        data_orc = [(a | ~b) & 0xFF for a, b in zip(data_a, data_b)]
+        vorc = getattr(self, "orc")(vdata_a, vdata_b)
+        assert data_orc == vorc
+
+        data_xnor = [~(a ^ b) & 0xFF for a, b in zip(data_a, data_b)]
+        vxnor = getattr(self, "xnor")(vdata_a, vdata_b)
+        assert data_xnor == vxnor
+
     def test_tobits(self):
         data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
         for data in (self._data(), self._data(reverse=True)):
@@ -156,48 +173,6 @@ def test_tobits(self):
             tobits = bin(self.tobits(vdata))
             assert tobits == bin(data_bits)
 
-    def test_andc(self):
-        if self.sfx not in ("b8"):
-            return
-        andc_simd = getattr(self.npyv, f"andc_b8")
-        # create the vectors
-        data = self._data()
-        rdata = self._data(reverse=True)
-        vdata = self._load_b(data)
-        vrdata = self._load_b(rdata)
-        # check andc
-        sandc = [(~x & y) & 0xFF for x, y in zip(rdata, data)]
-        vandc = andc_simd(vrdata, vdata)
-        assert sandc == vandc
-
-    def test_orc(self):
-        if self.sfx not in ("b8"):
-            return
-        orc_simd = getattr(self.npyv, f"orc_b8")
-        # create the vectors
-        data = self._data()
-        rdata = self._data(reverse=True)
-        vdata = self._load_b(data)
-        vrdata = self._load_b(rdata)
-        # check orc
-        sorc = [(~x | y) & 0xFF for x, y in zip(rdata, data)]
-        vorc = orc_simd(vrdata, vdata)
-        assert sorc == vorc
-
-    def test_xnor(self):
-        if self.sfx not in ("b8"):
-            return
-        xnor_simd = getattr(self.npyv, f"xnor_b8")
-        # create the vectors
-        data = self._data()
-        rdata = self._data(reverse=True)
-        vdata = self._load_b(data)
-        vrdata = self._load_b(rdata)
-        # check orc
-        sxnor = [~(x ^ y) & 0xFF for x, y in zip(rdata, data)]
-        vxnor = xnor_simd(vrdata, vdata)
-        assert sxnor == vxnor
-
     def test_pack(self):
         """
         Pack multiple vectors into one
@@ -865,6 +840,12 @@ def test_operators_logical(self):
         vnot = cast(getattr(self, "not")(vdata_a))
         assert vnot == data_not
 
+        if self.sfx not in ("u8"):
+            return
+        data_andc = [a & ~b for a, b in zip(data_cast_a, data_cast_b)]
+        vandc = cast(getattr(self, "andc")(vdata_a, vdata_b))
+        assert vandc == data_andc
+
     def test_conversion_boolean(self):
         bsfx = "b" + self.sfx[1:]
         to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))

From 2701a5a38e1521e5dd66e343c81bf075c0c30d17 Mon Sep 17 00:00:00 2001
From: Rafael CF Sousa
Date: Sun, 29 May 2022 21:29:02 -0300
Subject: [PATCH 6/6] DOC: Add a release note for the comparison functions

The PR #21483 improves the execution time of the comparison functions by using
universal intrinsics
---
 doc/release/upcoming_changes/21483.performance.rst | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 doc/release/upcoming_changes/21483.performance.rst

diff --git a/doc/release/upcoming_changes/21483.performance.rst b/doc/release/upcoming_changes/21483.performance.rst
new file mode 100644
index 000000000000..f9456d69f9ef
--- /dev/null
+++ b/doc/release/upcoming_changes/21483.performance.rst
@@ -0,0 +1,7 @@
+Faster comparison operators
+----------------------------
+The comparison functions (``numpy.equal``, ``numpy.not_equal``, ``numpy.less``,
+``numpy.less_equal``, ``numpy.greater`` and ``numpy.greater_equal``) are now
+much faster as they are vectorized with universal intrinsics. For a CPU with
+the AVX512BW SIMD extension, the performance gain is up to 2.57x, 1.65x and
+19.15x for integer, float and boolean data types, respectively (with N=50000).
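As an informal way to reproduce the measurement on a given machine, a small timeit
sketch along the following lines can be run before and after this series. It is not
part of the patch; the dtypes, the array length N and the call count below are
arbitrary illustrative choices, and absolute timings depend on the CPU and on the
SIMD features NumPy was built with:

    import timeit
    import numpy as np

    N = 50000
    for dtype in ("int64", "float64", "bool"):
        a = np.ones(N, dtype=dtype)
        b = np.zeros(N, dtype=dtype)
        # time the binary comparison ufunc that this series vectorizes
        per_call = timeit.timeit(lambda: np.less(a, b), number=10_000) / 10_000
        print(f"np.less [{dtype}]: {per_call * 1e6:.2f} us per call")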