WIP::ENH:SIMD Improve the performance of comparison operators

TODO
numpy · Aug 3, 2020 · 95c485a · 95c485a
1 parent d161288
commit 95c485a
Show file tree

Hide file tree

Showing 7 changed files with 586 additions and 519 deletions.
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -425,55 +425,55 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'greater_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'less':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'less_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'not_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.not_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', simd=[('avx2', ints)]),
+          TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           TD('O', out='?'),
           ),
 'logical_and':
     Ufunc(2, 1, True_,
           docstrings.get('numpy.core.umath.logical_and'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           TD(O, f='npy_ObjectLogicalAnd'),
           TD(O, f='npy_ObjectLogicalAnd', out='?'),
           ),
@@ -489,7 +489,7 @@ def english_upper(s):
     Ufunc(2, 1, False_,
           docstrings.get('numpy.core.umath.logical_or'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
           TD(O, f='npy_ObjectLogicalOr'),
           TD(O, f='npy_ObjectLogicalOr', out='?'),
           ),

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
@@ -902,6 +902,7 @@ def generate_umath_c(ext, build_dir):
             join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.h.src'),
             join('src', 'umath', 'loops.c.src'),
+            join('src', 'umath', 'loops_cmp.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
             join('src', 'umath', 'clip.h.src'),

diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
@@ -83,7 +83,7 @@
 #define npyv_and_s32 _mm512_and_si512
 #define npyv_and_u64 _mm512_and_si512
 #define npyv_and_s64 _mm512_and_si512
-#define npyv_and_b32 _kand_mask16
+#define npyv_and_b32 _mm512_kand
 #ifdef NPY_HAVE_AVX512BW
     #define npyv_and_b8  _kand_mask64
     #define npyv_and_b16 _kand_mask32
@@ -96,7 +96,7 @@
     #define npyv_and_f32 _mm512_and_ps
     #define npyv_and_f64 _mm512_and_pd
 #else
-    #define npyv_and_b64(A, B) ((__mmask8)_kand_mask16((__mmask16)(A), (__mmask16)(B)))
+    #define npyv_and_b64(A, B) ((__mmask8)_mm512_kand((__mmask16)(A), (__mmask16)(B)))
     NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_and_f32, _mm512_and_si512)
     NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_and_f64, _mm512_and_si512)
 #endif
@@ -109,7 +109,7 @@
 #define npyv_or_s32 _mm512_or_si512
 #define npyv_or_u64 _mm512_or_si512
 #define npyv_or_s64 _mm512_or_si512
-#define npyv_or_b32 _kor_mask16
+#define npyv_or_b32 _mm512_kor
 #ifdef NPY_HAVE_AVX512BW
     #define npyv_or_b8  _kor_mask64
     #define npyv_or_b16 _kor_mask32
@@ -122,7 +122,7 @@
     #define npyv_or_f32 _mm512_or_ps
     #define npyv_or_f64 _mm512_or_pd
 #else
-    #define npyv_or_b64(A, B) ((__mmask8)_kor_mask16((__mmask16)(A), (__mmask16)(B)))
+    #define npyv_or_b64(A, B) ((__mmask8)_mm512_kor((__mmask16)(A), (__mmask16)(B)))
     NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_or_f32, _mm512_or_si512)
     NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_or_f64, _mm512_or_si512)
 #endif
@@ -135,7 +135,7 @@
 #define npyv_xor_s32 _mm512_xor_si512
 #define npyv_xor_u64 _mm512_xor_si512
 #define npyv_xor_s64 _mm512_xor_si512
-#define npyv_xor_b32 _kxor_mask16
+#define npyv_xor_b32 _mm512_kxor
 #ifdef NPY_HAVE_AVX512BW
     #define npyv_xor_b8  _kxor_mask64
     #define npyv_xor_b16 _kxor_mask32
@@ -148,7 +148,7 @@
     #define npyv_xor_f32 _mm512_xor_ps
     #define npyv_xor_f64 _mm512_xor_pd
 #else
-    #define npyv_xor_b64(A, B) ((__mmask8)_kxor_mask16((__mmask16)(A), (__mmask16)(B)))
+    #define npyv_xor_b64(A, B) ((__mmask8)_mm512_kxor((__mmask16)(A), (__mmask16)(B)))
     NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_xor_f32, _mm512_xor_si512)
     NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_xor_f64, _mm512_xor_si512)
 #endif
@@ -161,7 +161,7 @@
 #define npyv_not_s32 npyv_not_u8
 #define npyv_not_u64 npyv_not_u8
 #define npyv_not_s64 npyv_not_u8
-#define npyv_not_b32 _knot_mask16
+#define npyv_not_b32 _mm512_knot
 #ifdef NPY_HAVE_AVX512BW
     #define npyv_not_b8  _knot_mask64
     #define npyv_not_b16 _knot_mask32
@@ -174,7 +174,7 @@
     #define npyv_not_f32(A) _mm512_xor_ps(A, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
     #define npyv_not_f64(A) _mm512_xor_pd(A, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
 #else
-    #define npyv_not_b64(A, B) ((__mmask8)_knot_mask16((__mmask16)(A), (__mmask16)(B)))
+    #define npyv_not_b64(A) ((__mmask8)_mm512_knot((__mmask16)(A)))
     #define npyv_not_f32(A) _mm512_castsi512_ps(npyv_not_u32(_mm512_castps_si512(A)))
     #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
 #endif

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -397,96 +397,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
  *****************************************************************************
  */
 
-/**begin repeat
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
- * #OP =  ==, !=, >, >=, <, <=#
- **/
-
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        npy_bool in1 = *((npy_bool *)ip1) != 0;
-        npy_bool in2 = *((npy_bool *)ip2) != 0;
-        *((npy_bool *)op1)= in1 @OP@ in2;
-    }
-}
-/**end repeat**/
-
-
-/**begin repeat
- * #kind = logical_and, logical_or#
- * #OP =  &&, ||#
- * #SC =  ==, !=#
- * #and = 1, 0#
- **/
-
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if(IS_BINARY_REDUCE) {
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-        /*
-         * stick with our variant for more reliable performance, only known
-         * platform which outperforms it by ~20% is an i7 with glibc 2.17
-         */
-        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
-            return;
-        }
-#else
-        /* for now only use libc on 32-bit/non-x86 */
-        if (steps[1] == 1) {
-            npy_bool * op = (npy_bool *)args[0];
-#if @and@
-            /* np.all(), search for a zero (false) */
-            if (*op) {
-                *op = memchr(args[1], 0, dimensions[0]) == NULL;
-            }
-#else
-            /*
-             * np.any(), search for a non-zero (true) via comparing against
-             * zero blocks, memcmp is faster than memchr on SSE4 machines
-             * with glibc >= 2.12 and memchr can only check for equal 1
-             */
-            static const npy_bool zero[4096]; /* zero by C standard */
-            npy_uintp i, n = dimensions[0];
-
-            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
-                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
-            }
-            if (!*op && n - i > 0) {
-                *op = memcmp(&args[1][i], zero, n - i) != 0;
-            }
-#endif
-            return;
-        }
-#endif
-        else {
-            BINARY_REDUCE_LOOP(npy_bool) {
-                const npy_bool in2 = *(npy_bool *)ip2;
-                io1 = io1 @OP@ in2;
-                if (io1 @SC@ 0) {
-                    break;
-                }
-            }
-            *((npy_bool *)iop1) = io1;
-        }
-    }
-    else {
-        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
-            return;
-        }
-        else {
-            BINARY_LOOP {
-                const npy_bool in1 = *(npy_bool *)ip1;
-                const npy_bool in2 = *(npy_bool *)ip2;
-                *((npy_bool *)op1) = in1 @OP@ in2;
-            }
-        }
-    }
-}
-/**end repeat**/
-
 /**begin repeat
  * #kind = absolute, logical_not#
  * #OP =  !=, ==#
@@ -686,27 +596,6 @@ void
     BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
 }
 
-
-/**begin repeat2
- * #kind = equal, not_equal, greater, greater_equal, less, less_equal,
- *         logical_and, logical_or#
- * #OP =  ==, !=, >, >=, <, <=, &&, ||#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*
-     * gcc vectorization of this is not good (PR60575) but manual integer
-     * vectorization is too tedious to be worthwhile
-     */
-    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
-}
-#endif
-
-/**end repeat2**/
-
 #if @CHK@
 NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 @TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -1839,25 +1728,6 @@ NPY_NO_EXPORT void
 }
 /**end repeat1**/
 
-/**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
- *        logical_and, logical_or#
- * #OP = ==, !=, <, <=, >, >=, &&, ||#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((npy_bool *)op1) = in1 @OP@ in2;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
-
 NPY_NO_EXPORT void
 @TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -2254,27 +2124,6 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
 }
 /**end repeat**/
 
-#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
-#define _HALF_LOGICAL_OR(a,b) (!npy_half_iszero(a) || !npy_half_iszero(b))
-/**begin repeat
- * #kind = equal, not_equal, less, less_equal, greater,
- *         greater_equal, logical_and, logical_or#
- * #OP = npy_half_eq, npy_half_ne, npy_half_lt, npy_half_le, npy_half_gt,
- *       npy_half_ge, _HALF_LOGICAL_AND, _HALF_LOGICAL_OR#
- */
-NPY_NO_EXPORT void
-HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const npy_half in1 = *(npy_half *)ip1;
-        const npy_half in2 = *(npy_half *)ip2;
-        *((npy_bool *)op1) = @OP@(in1, in2);
-    }
-}
-/**end repeat**/
-#undef _HALF_LOGICAL_AND
-#undef _HALF_LOGICAL_OR
-
 NPY_NO_EXPORT void
 HALF_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -2763,41 +2612,6 @@ NPY_NO_EXPORT void
     }
 }
 
-/**begin repeat1
- * #kind= greater, greater_equal, less, less_equal, equal, not_equal#
- * #OP = CGT, CGE, CLT, CLE, CEQ, CNE#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-        *((npy_bool *)op1) = @OP@(in1r,in1i,in2r,in2i);
-    }
-}
-/**end repeat1**/
-
-/**begin repeat1
-   #kind = logical_and, logical_or#
-   #OP1 = ||, ||#
-   #OP2 = &&, ||#
-*/
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-        const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-        const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-        const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-        *((npy_bool *)op1) = (in1r @OP1@ in1i) @OP2@ (in2r @OP1@ in2i);
-    }
-}
-/**end repeat1**/
-
 NPY_NO_EXPORT void
 @TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {