From 1134be2ca463d3f5d788c4c7b1dcf3ef1f9e3d33 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Fri, 29 Apr 2022 17:58:53 -0500 Subject: [PATCH 1/6] SIMD: Use universal intrinsics to implement comparison functions --- .gitignore | 1 + benchmarks/benchmarks/bench_ufunc.py | 19 +- numpy/core/code_generators/generate_umath.py | 12 +- numpy/core/setup.py | 1 + numpy/core/src/_simd/_simd.dispatch.c.src | 9 + numpy/core/src/_simd/_simd_easyintrin.inc | 44 ++ numpy/core/src/common/simd/avx2/conversion.h | 30 ++ .../core/src/common/simd/avx512/conversion.h | 42 ++ numpy/core/src/common/simd/avx512/utils.h | 12 + numpy/core/src/common/simd/neon/conversion.h | 24 + numpy/core/src/common/simd/sse/conversion.h | 24 + numpy/core/src/common/simd/vsx/conversion.h | 23 + numpy/core/src/umath/loops.c.src | 53 +-- numpy/core/src/umath/loops.h.src | 83 +++- .../src/umath/loops_comparison.dispatch.c.src | 421 ++++++++++++++++++ numpy/core/src/umath/simd.inc.src | 144 ------ numpy/core/tests/test_simd.py | 34 ++ numpy/core/tests/test_umath.py | 76 ++++ 18 files changed, 852 insertions(+), 200 deletions(-) create mode 100644 numpy/core/src/umath/loops_comparison.dispatch.c.src diff --git a/.gitignore b/.gitignore index 632f13674da6..8dd4a5344ec5 100644 --- a/.gitignore +++ b/.gitignore @@ -225,6 +225,7 @@ numpy/core/src/umath/loops_exponent_log.dispatch.c numpy/core/src/umath/loops_umath_fp.dispatch.c numpy/core/src/umath/loops_hyperbolic.dispatch.c numpy/core/src/umath/loops_modulo.dispatch.c +numpy/core/src/umath/loops_comparison.dispatch.c # npysort module numpy/core/src/npysort/x86-qsort.dispatch.c numpy/core/src/npysort/x86-qsort.dispatch.*.cpp diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py index cfa29017d239..858dcccfcd84 100644 --- a/benchmarks/benchmarks/bench_ufunc.py +++ b/benchmarks/benchmarks/bench_ufunc.py @@ -170,8 +170,25 @@ def time_divide_scalar2(self, dtype): def time_divide_scalar2_inplace(self, dtype): np.divide(self.d, 1, out=self.d) + +class CustomComparison(Benchmark): + params = (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, + np.uint32, np.uint64, np.float32, np.float64, np.bool_) + param_names = ['dtype'] + + def setup(self, dtype): + self.x = np.ones(50000, dtype=dtype) + self.y = np.ones(50000, dtype=dtype) + self.s = np.ones(1, dtype=dtype) + + def time_less_than_binary(self, dtype): + (self.x < self.y) + + def time_less_than_scalar1(self, dtype): + (self.s < self.x) + def time_less_than_scalar2(self, dtype): - (self.d < 1) + (self.x < self.s) class CustomScalarFloorDivideInt(Benchmark): diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 266fccefb2c6..cc0e93d4350b 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -445,7 +445,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.greater'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -453,7 +453,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.greater_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -461,7 +461,7 @@ def 
english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.less'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -469,7 +469,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.less_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -477,7 +477,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), @@ -485,7 +485,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.not_equal'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(all, out='?', simd=[('avx2', ints)]), + TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]), [TypeDescription('O', FullTypeDescr, 'OO', 'O')], TD('O', out='?'), ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index dd60a00dbd76..8018a489f245 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1070,6 +1070,7 @@ def generate_umath_doc_header(ext, build_dir): join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'), join('src', 'umath', 'loops_modulo.dispatch.c.src'), + join('src', 'umath', 'loops_comparison.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), join('src', 'umath', 'clip.h'), diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index ab48db5b108d..ad8c538e4cfb 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -472,6 +472,10 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@) /**end repeat**/ +SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16) +SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32) +SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64, + vb64, vb64, vb64, vb64) //######################################################################### //## Attach module functions @@ -716,6 +720,11 @@ SIMD_INTRIN_DEF(not_@bsfx@) SIMD_INTRIN_DEF(tobits_@bsfx@) /**end repeat**/ +// Pack multiple vectors into one +SIMD_INTRIN_DEF(pack_b8_b16) +SIMD_INTRIN_DEF(pack_b8_b32) +SIMD_INTRIN_DEF(pack_b8_b64) + /************************************************************************/ {NULL, NULL, 0, NULL} }; // PyMethodDef diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc index 4521b2d87f07..f2e0da26ecef 100644 --- a/numpy/core/src/_simd/_simd_easyintrin.inc +++ b/numpy/core/src/_simd/_simd_easyintrin.inc @@ -153,6 +153,50 @@ return simd_arg_to_obj(&ret); \ } +#define SIMD_IMPL_INTRIN_8(NAME, RET, IN0, IN1, IN2, IN3, \ + IN4, IN5, IN6, IN7) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg arg1 = {.dtype = simd_data_##IN0}; \ + simd_arg arg2 = {.dtype = simd_data_##IN1}; \ + simd_arg arg3 = {.dtype = simd_data_##IN2}; \ + simd_arg arg4 = {.dtype = simd_data_##IN3}; \ + simd_arg arg5 = {.dtype = 
simd_data_##IN4}; \ + simd_arg arg6 = {.dtype = simd_data_##IN5}; \ + simd_arg arg7 = {.dtype = simd_data_##IN6}; \ + simd_arg arg8 = {.dtype = simd_data_##IN7}; \ + if (!PyArg_ParseTuple( \ + args, "O&O&O&O&O&O&O&O&:"NPY_TOSTRING(NAME), \ + simd_arg_converter, &arg1, \ + simd_arg_converter, &arg2, \ + simd_arg_converter, &arg3, \ + simd_arg_converter, &arg4, \ + simd_arg_converter, &arg5, \ + simd_arg_converter, &arg6, \ + simd_arg_converter, &arg7, \ + simd_arg_converter, &arg8 \ + )) return NULL; \ + simd_data data = {.RET = npyv_##NAME( \ + arg1.data.IN0, arg2.data.IN1, \ + arg3.data.IN2, arg4.data.IN3, \ + arg5.data.IN4, arg6.data.IN5, \ + arg7.data.IN6, arg8.data.IN7 \ + )}; \ + simd_arg_free(&arg1); \ + simd_arg_free(&arg2); \ + simd_arg_free(&arg3); \ + simd_arg_free(&arg4); \ + simd_arg_free(&arg5); \ + simd_arg_free(&arg6); \ + simd_arg_free(&arg7); \ + simd_arg_free(&arg8); \ + simd_arg ret = { \ + .data = data, .dtype = simd_data_##RET \ + }; \ + return simd_arg_to_obj(&ret); \ + } + /** * Helper macros for repeating and expand a certain macro. * Mainly used for converting a scalar to an immediate constant. diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h index 64e051686794..00ac0d38a31a 100644 --- a/numpy/core/src/common/simd/avx2/conversion.h +++ b/numpy/core/src/common/simd/avx2/conversion.h @@ -58,6 +58,36 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + __m256i ab = _mm256_packs_epi16(a, b); + return npyv256_shuffle_odd(ab); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + __m256i ab = _mm256_packs_epi32(a, b); + __m256i cd = _mm256_packs_epi32(c, d); + __m256i abcd = npyv_pack_b8_b16(ab, cd); + return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + __m256i ab = _mm256_packs_epi32(a, b); + __m256i cd = _mm256_packs_epi32(c, d); + __m256i ef = _mm256_packs_epi32(e, f); + __m256i gh = _mm256_packs_epi32(g, h); + __m256i abcd = _mm256_packs_epi32(ab, cd); + __m256i efgh = _mm256_packs_epi32(ef, gh); + __m256i all = npyv256_shuffle_odd(_mm256_packs_epi16(abcd, efgh)); + __m256i rev128 = _mm256_alignr_epi8(all, all, 8); + return _mm256_unpacklo_epi16(all, rev128); +} + // round to nearest integer (assuming even) #define npyv_round_s32_f32 _mm256_cvtps_epi32 NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h index 0bd44179b332..a2f56b2ae654 100644 --- a/numpy/core/src/common/simd/avx512/conversion.h +++ b/numpy/core/src/common/simd/avx512/conversion.h @@ -90,6 +90,48 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { +#ifdef NPY_HAVE_AVX512BW + return _mm512_kunpackd((__mmask64)b, (__mmask64)a); +#else + const __m512i idx = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + return _mm512_permutexvar_epi64(idx, npyv512_packs_epi16(a, b)); +#endif +} + +// pack four 32-bit boolean vectors into one 
8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { +#ifdef NPY_HAVE_AVX512BW + __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a); + __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c); + return npyv_pack_b8_b16(ab, cd); +#else + const __m512i idx = _mm512_setr_epi32( + 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15); + __m256i ta = npyv512_pack_lo_hi(npyv_cvt_u32_b32(a)); + __m256i tb = npyv512_pack_lo_hi(npyv_cvt_u32_b32(b)); + __m256i tc = npyv512_pack_lo_hi(npyv_cvt_u32_b32(c)); + __m256i td = npyv512_pack_lo_hi(npyv_cvt_u32_b32(d)); + __m256i ab = _mm256_packs_epi16(ta, tb); + __m256i cd = _mm256_packs_epi16(tc, td); + __m512i abcd = npyv512_combine_si256(ab, cd); + return _mm512_permutexvar_epi32(idx, abcd); +#endif +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + __mmask16 ab = _mm512_kunpackb((__mmask16)b, (__mmask16)a); + __mmask16 cd = _mm512_kunpackb((__mmask16)d, (__mmask16)c); + __mmask16 ef = _mm512_kunpackb((__mmask16)f, (__mmask16)e); + __mmask16 gh = _mm512_kunpackb((__mmask16)h, (__mmask16)g); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // convert boolean vectors to integer bitfield NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) { diff --git a/numpy/core/src/common/simd/avx512/utils.h b/numpy/core/src/common/simd/avx512/utils.h index c3079283f491..ced3bfef0ef9 100644 --- a/numpy/core/src/common/simd/avx512/utils.h +++ b/numpy/core/src/common/simd/avx512/utils.h @@ -87,4 +87,16 @@ )); \ } +#ifndef NPY_HAVE_AVX512BW + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv512_packs_epi16, _mm256_packs_epi16) +#else + #define npyv512_packs_epi16 _mm512_packs_epi16 +#endif + +NPY_FINLINE __m256i npyv512_pack_lo_hi(__m512i a) { + __m256i lo = npyv512_lower_si256(a); + __m256i hi = npyv512_higher_si256(a); + return _mm256_packs_epi32(lo, hi); +} + #endif // _NPY_SIMD_AVX512_UTILS_H diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h index 7487559d1c30..b6a50dc7a642 100644 --- a/numpy/core/src/common/simd/neon/conversion.h +++ b/numpy/core/src/common/simd/neon/conversion.h @@ -86,6 +86,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return vcombine_u8(vmovn_u16(a), vmovn_u16(b)); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 ab = vcombine_u16(vmovn_u32(a), vmovn_u32(b)); + npyv_b16 cd = vcombine_u16(vmovn_u32(c), vmovn_u32(d)); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 ab = vcombine_u32(vmovn_u64(a), vmovn_u64(b)); + npyv_b32 cd = vcombine_u32(vmovn_u64(c), vmovn_u64(d)); + npyv_b32 ef = vcombine_u32(vmovn_u64(e), vmovn_u64(f)); + npyv_b32 gh = vcombine_u32(vmovn_u64(g), vmovn_u64(h)); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // round to nearest integer #if NPY_SIMD_F64 #define npyv_round_s32_f32 vcvtnq_s32_f32 diff --git a/numpy/core/src/common/simd/sse/conversion.h 
b/numpy/core/src/common/simd/sse/conversion.h index ab7eb490727b..0811bf06ae4a 100644 --- a/numpy/core/src/common/simd/sse/conversion.h +++ b/numpy/core/src/common/simd/sse/conversion.h @@ -59,6 +59,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return _mm_packs_epi16(a, b); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 ab = _mm_packs_epi32(a, b); + npyv_b16 cd = _mm_packs_epi32(c, d); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 ab = _mm_packs_epi32(a, b); + npyv_b32 cd = _mm_packs_epi32(c, d); + npyv_b32 ef = _mm_packs_epi32(e, f); + npyv_b32 gh = _mm_packs_epi32(g, h); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // round to nearest integer (assuming even) #define npyv_round_s32_f32 _mm_cvtps_epi32 NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h index 36bea7bbaddf..a599f3950fe5 100644 --- a/numpy/core/src/common/simd/vsx/conversion.h +++ b/numpy/core/src/common/simd/vsx/conversion.h @@ -48,6 +48,29 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) return r; } +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return vec_pack(a, b); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 ab = vec_pack(a, b); + npyv_b16 cd = vec_pack(c, d); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 ab = vec_pack(a, b); + npyv_b32 cd = vec_pack(c, d); + npyv_b32 ef = vec_pack(e, f); + npyv_b32 gh = vec_pack(g, h); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + // convert boolean vector to integer bitfield NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 3a8a549131a2..9ae686399dbd 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -400,23 +400,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo ***************************************************************************** */ -/**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal# - * #OP = ==, !=, >, >=, <, <=# - **/ - -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - npy_bool in1 = *((npy_bool *)ip1) != 0; - npy_bool in2 = *((npy_bool *)ip2) != 0; - *((npy_bool *)op1)= in1 @OP@ in2; - } -} -/**end repeat**/ - - /**begin repeat * #kind = logical_and, logical_or# * #OP = &&, ||# @@ -688,9 +671,8 @@ void /**begin repeat2 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or# - * #OP = ==, !=, >, >=, <, <=, &&, ||# + * #kind 
= logical_and, logical_or# + * #OP = &&, ||# */ #if @CHK@ @@ -1408,19 +1390,16 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * * #C = F, , L# */ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - * #OP = ==, !=, <, <=, >, >=, &&, ||# + * #kind = logical_and, logical_or# + * #OP = &&, ||# */ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; } npy_clear_floatstatus_barrier((char*)dimensions); } @@ -1654,6 +1633,22 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps } /**end repeat**/ +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void +LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const npy_longdouble in1 = *(npy_longdouble *)ip1; + const npy_longdouble in2 = *(npy_longdouble *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + npy_clear_floatstatus_barrier((char*)dimensions); +} +/**end repeat**/ + NPY_NO_EXPORT void LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 694518ae0e20..5af9f1788758 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -28,9 +28,19 @@ ***************************************************************************** */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_comparison.dispatch.h" +#endif + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /**begin repeat - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or, absolute, logical_not# + * #kind = logical_and, logical_or, absolute, logical_not# **/ NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -60,8 +70,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, BYTE, SHORT, INT, LONG, LONGLONG# */ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION @@ -72,14 +82,28 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, BYTE, SHORT, INT, LONG, LONGLONG# */ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divmod, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**begin repeat1 + * #kind = divmod, fmod, remainder# + 
*/ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_fmod, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_comparison.dispatch.h" +#endif - NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_remainder, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ /**end repeat**/ /**begin repeat @@ -136,9 +160,8 @@ NPY_NO_EXPORT void /**end repeat3**/ /**begin repeat3 - * #kind = equal, not_equal, greater, greater_equal, less, less_equal, - * logical_and, logical_or# - * #OP = ==, !=, >, >=, <, <=, &&, ||# + * #kind = logical_and, logical_or# + * #OP = &&, ||# */ NPY_NO_EXPORT void @S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -232,9 +255,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, /**end repeat1**/ /**end repeat**/ -/**end repeat1**/ -/**end repeat**/ - // SVML #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_umath_fp.dispatch.h" @@ -300,6 +320,21 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_comparison.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * Float types * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# @@ -307,7 +342,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( * #C = F, F, , L# */ - /**begin repeat1 * Arithmetic * # kind = add, subtract, multiply, divide# @@ -318,9 +352,8 @@ NPY_NO_EXPORT void /**end repeat1**/ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - * #OP = ==, !=, <, <=, >, >=, &&, ||# + * #kind = logical_and, logical_or# + * #OP = &&, ||# */ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -407,6 +440,16 @@ NPY_NO_EXPORT void @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ +/**begin repeat + * #TYPE = HALF, LONGDOUBLE# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ /* ***************************************************************************** diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src new file mode 100644 index 000000000000..e5518afb36f6 --- /dev/null +++ 
b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -0,0 +1,421 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 avx2 avx512f avx512_skx + ** vsx2 vsx3 + ** neon + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# + * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + */ + +#if @VECTOR@ +static void simd_binary_@kind@_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + // Unroll the loop to get a resultant vector with 'vsteps' elements. + for (; len >= vstep; + len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { +#if @len@ >= 8 + npyv_@sfx@ a1 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 0); + npyv_@sfx@ b1 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 0); + npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b1); +#if @len@ >= 16 + npyv_@sfx@ a2 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 1); + npyv_@sfx@ b2 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 1); + npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b2); +#if @len@ >= 32 + npyv_@sfx@ a3 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ b3 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ a4 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 3); + npyv_@sfx@ b4 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 3); + npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b3); + npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b4); +#if @len@ == 64 + npyv_@sfx@ a5 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ b5 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ a6 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ b6 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ a7 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ b7 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ a8 = npyv_load_@sfx@(src1 + npyv_nlanes_@sfx@ * 7); + npyv_@sfx@ b8 = npyv_load_@sfx@(src2 + npyv_nlanes_@sfx@ * 7); + npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b5); + npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b6); + npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b7); + npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b8); +#endif // @len@ >= 64 +#endif // @len@ >= 32 +#endif // @len@ >= 16 +#endif // @len@ >= 8 + + // Pack the 'c' vectors into a single vector 'r' +#if @len@ == 8 + npyv_u8 r = npyv_cvt_u8_b8(c1); +#elif @len@ == 16 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); +#elif @len@ == 32 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); +#elif @len@ == 64 + npyv_u8 r = + npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); +#endif + npyv_store_u8(dst, npyv_and_u8(r, truemask)); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst) { + const npyv_lanetype_@sfx@ a = *src1; + const npyv_lanetype_@sfx@ b = *src2; + *dst = a @OP@ b; + } +} + +static void simd_binary_scalar1_@kind@_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ 
scalar = *(npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_@sfx@ a = npyv_setall_@sfx@(scalar); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { +#if @len@ >= 8 + npyv_@sfx@ b1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); + npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a, b1); +#if @len@ >= 16 + npyv_@sfx@ b2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); + npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a, b2); +#if @len@ >= 32 + npyv_@sfx@ b3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ b4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); + npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a, b3); + npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a, b4); +#if @len@ == 64 + npyv_@sfx@ b5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ b6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ b7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ b8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); + npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a, b5); + npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a, b6); + npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a, b7); + npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a, b8); +#endif // @len@ >= 64 +#endif // @len@ >= 32 +#endif // @len@ >= 16 +#endif // @len@ >= 8 + +#if @len@ == 8 + npyv_u8 r = npyv_cvt_u8_b8(c1); +#elif @len@ == 16 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); +#elif @len@ == 32 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4)); +#elif @len@ == 64 + npyv_u8 r = + npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); +#endif + npyv_store_u8(dst, npyv_and_u8(r, truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ b = *src; + *dst = scalar @OP@ b; + } +} + +static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_@sfx@ b = npyv_setall_@sfx@(scalar); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { +#if @len@ >= 8 + npyv_@sfx@ a1 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 0); + npyv_b@len@ c1 = npyv_@VOP@_@sfx@(a1, b); +#if @len@ >= 16 + npyv_@sfx@ a2 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 1); + npyv_b@len@ c2 = npyv_@VOP@_@sfx@(a2, b); +#if @len@ >= 32 + npyv_@sfx@ a3 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 2); + npyv_@sfx@ a4 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 3); + npyv_b@len@ c3 = npyv_@VOP@_@sfx@(a3, b); + npyv_b@len@ c4 = npyv_@VOP@_@sfx@(a4, b); +#if @len@ == 64 + npyv_@sfx@ a5 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 4); + npyv_@sfx@ a6 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 5); + npyv_@sfx@ a7 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 6); + npyv_@sfx@ a8 = npyv_load_@sfx@(src + npyv_nlanes_@sfx@ * 7); + npyv_b@len@ c5 = npyv_@VOP@_@sfx@(a5, b); + npyv_b@len@ c6 = npyv_@VOP@_@sfx@(a6, b); + npyv_b@len@ c7 = npyv_@VOP@_@sfx@(a7, b); + npyv_b@len@ c8 = npyv_@VOP@_@sfx@(a8, b); +#endif // @len@ >= 64 +#endif // @len@ >= 32 +#endif // @len@ >= 16 +#endif // @len@ >= 8 + +#if @len@ == 8 + npyv_u8 r = npyv_cvt_u8_b8(c1); +#elif @len@ == 16 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2)); +#elif @len@ == 32 + npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, 
c3, c4)); +#elif @len@ == 64 + npyv_u8 r = + npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8)); +#endif + npyv_store_u8(dst, npyv_and_u8(r, truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ a = *src; + *dst = a @OP@ scalar; + } +} +#endif + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + */ + +#if NPY_SIMD +static void simd_binary_@kind@_b8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 truemask = npyv_setall_u8(0x1); + const npyv_u8 vzero = npyv_setall_u8(0x0); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; + len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { + // Whatever element in src != 0x0 is converted to 0xFF + npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero); + npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero); + npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), npyv_cvt_u8_b8(b)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst) { + const npyv_lanetype_u8 a = *src1 != 0; + const npyv_lanetype_u8 b = *src2 != 0; + *dst = a @OP@ b; + } +} + +static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) +{ + npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 vzero = npyv_setall_u8(0x0); + const npyv_u8 vscalar = npyv_setall_u8(scalar); + const npyv_u8 a = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero); + npyv_b8 c = npyv_@VOP@_u8(a, npyv_cvt_u8_b8(b)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_u8 b = *src != 0; + *dst = scalar @OP@ b; + } +} + +static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 vzero = npyv_setall_u8(0x0); + const npyv_u8 vscalar = npyv_setall_u8(scalar); + const npyv_u8 b = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_u8 truemask = npyv_setall_u8(0x1); + const int vstep = npyv_nlanes_u8; + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero); + npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), b); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_u8 a = *src != 0; + *dst = a @OP@ scalar; + } +} +#endif +/**end repeat**/ + + +/**begin repeat + * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, + npy_ulonglong, npy_longlong, npy_float, npy_double# + * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + */ +static NPY_INLINE int 
+run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if @VECTOR@ + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); + return 1; + } + else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_@sfx@(args, dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!run_binary_simd_@kind@_b8(args, dimensions, steps)) { + BINARY_LOOP { + npy_bool in1 = *((npy_bool *)ip1) != 0; + npy_bool in2 = *((npy_bool *)ip2) != 0; + *((npy_bool *)op1)= in1 @OP@ in2; + } + } +} +/**end repeat**/ + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +/**begin repeat + * Signed and Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #if @signed@ + #define TO_SIMD_SFX(X) X##_s@len@ + #else + #define TO_SIMD_SFX(X) X##_u@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + } +} +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * Float types + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + */ +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (!run_binary_simd_@kind@_@sfx@(args, dimensions, steps)) { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + } + 
npy_clear_floatstatus_barrier((char*)dimensions); +} +/**end repeat1**/ +/**end repeat**/ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index b477027b3c8a..d6c9a7e65385 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -158,55 +158,6 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp /**end repeat1**/ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - * #simd = 1, 1, 1, 1, 1, 1, 0, 0# - */ - -#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS - -/* prototypes */ -static void -sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, - npy_intp n); -static void -sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, - npy_intp n); - -#endif - -static NPY_INLINE int -run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - npy_bool * op = (npy_bool *)args[2]; - npy_intp n = dimensions[0]; - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - /**begin repeat1 * #kind = isnan, isfinite, isinf, signbit# */ @@ -476,101 +427,6 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) /**end repeat1**/ -/**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# -*/ - -/* sets invalid fpu flag on QNaN for consistency with packed compare */ -NPY_FINLINE int -sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b) -{ - @vtype@ one = @vpre@_set1_@vsuf@(1); - @type@ tmp; - @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a), - @vpre@_load_@vsufs@(&b)); - v = @vpre@_and_@vsuf@(v, one); - @vpre@_store_@vsufs@(&tmp, v); - return tmp; -} - -static void -sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]); - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, 
a2); - @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2); - @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2); - @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2); - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]); - } -} - - -static void -sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]); - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a); - @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b); - @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c); - @vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d); - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]); - } -} - - -static void -sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]); - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s); - @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s); - @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s); - @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s); - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]); - } -} -/**end repeat1**/ - - static void sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) { diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index e4b5e0c8f474..b0b62b4a2e17 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -156,6 +156,40 @@ def test_tobits(self): tobits = bin(self.tobits(vdata)) assert tobits == bin(data_bits) + def test_pack(self): + """ + Pack multiple vectors into one + Test intrinsics: + npyv_pack_b8_b16 + npyv_pack_b8_b32 + npyv_pack_b8_b64 + """ + if self.sfx not in ("b16", "b32", "b64"): + return + + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + + pack_simd = getattr(self.npyv, f"pack_b8_{self.sfx}") + + # for scalar execution, concatenate the elements of the multiple lists + # into a single list (spack) and then iterate over the elements of + # the created list applying a mask to capture the first byte of them. 
+ if self.sfx == "b16": + spack = [(i & 0xFF) for i in (list(rdata) + list(data))] + vpack = pack_simd(vrdata, vdata) + elif self.sfx == "b32": + spack = [(i & 0xFF) for i in (2*list(rdata) + 2*list(data))] + vpack = pack_simd(vrdata, vrdata, vdata, vdata) + elif self.sfx == "b64": + spack = [(i & 0xFF) for i in (4*list(rdata) + 4*list(data))] + vpack = pack_simd(vrdata, vrdata, vrdata, vrdata, + vdata, vdata, vdata, vdata) + assert vpack == spack + class _SIMD_INT(_Test_Utility): """ To test all integer vector types at once diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index dd0bb88fff73..01ef9365e7de 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -185,6 +185,82 @@ def __array_wrap__(self, arr, context): class TestComparisons: + @pytest.mark.parametrize('dtype', np.sctypes['uint'] + np.sctypes['int'] + + np.sctypes['float'] + [np.bool_]) + def test_comparison_functions(self, dtype): + # Initialize input arrays + if dtype == np.bool_: + a = np.random.choice(a=[False, True], size=1000) + b = np.random.choice(a=[False, True], size=1000) + scalar = True + else: + a = np.random.randint(low=1, high=10, size=1000).astype(dtype) + b = np.random.randint(low=1, high=10, size=1000).astype(dtype) + scalar = 5 + scalar_np = np.dtype(dtype).type(scalar) + a_lst = a.tolist() + b_lst = b.tolist() + + # (Binary) Comparison (x1=array, x2=array) + lt_b = np.less(a, b) + le_b = np.less_equal(a, b) + gt_b = np.greater(a, b) + ge_b = np.greater_equal(a, b) + eq_b = np.equal(a, b) + ne_b = np.not_equal(a, b) + lt_b_lst = [x < y for x, y in zip(a_lst, b_lst)] + le_b_lst = [x <= y for x, y in zip(a_lst, b_lst)] + gt_b_lst = [x > y for x, y in zip(a_lst, b_lst)] + ge_b_lst = [x >= y for x, y in zip(a_lst, b_lst)] + eq_b_lst = [x == y for x, y in zip(a_lst, b_lst)] + ne_b_lst = [x != y for x, y in zip(a_lst, b_lst)] + + # (Scalar1) Comparison (x1=scalar, x2=array) + lt_s1 = np.less(scalar_np, b) + le_s1 = np.less_equal(scalar_np, b) + gt_s1 = np.greater(scalar_np, b) + ge_s1 = np.greater_equal(scalar_np, b) + eq_s1 = np.equal(scalar_np, b) + ne_s1 = np.not_equal(scalar_np, b) + lt_s1_lst = [scalar < x for x in b_lst] + le_s1_lst = [scalar <= x for x in b_lst] + gt_s1_lst = [scalar > x for x in b_lst] + ge_s1_lst = [scalar >= x for x in b_lst] + eq_s1_lst = [scalar == x for x in b_lst] + ne_s1_lst = [scalar != x for x in b_lst] + + # (Scalar2) Comparison (x1=array, x2=scalar) + lt_s2 = np.less(a, scalar_np) + le_s2 = np.less_equal(a, scalar_np) + gt_s2 = np.greater(a, scalar_np) + ge_s2 = np.greater_equal(a, scalar_np) + eq_s2 = np.equal(a, scalar_np) + ne_s2 = np.not_equal(a, scalar_np) + lt_s2_lst = [x < scalar for x in a_lst] + le_s2_lst = [x <= scalar for x in a_lst] + gt_s2_lst = [x > scalar for x in a_lst] + ge_s2_lst = [x >= scalar for x in a_lst] + eq_s2_lst = [x == scalar for x in a_lst] + ne_s2_lst = [x != scalar for x in a_lst] + + # Compare comparison functions (Python vs NumPy) using native Python + def compare(lt, le, gt, ge, eq, ne, lt_lst, le_lst, gt_lst, ge_lst, + eq_lst, ne_lst): + assert_(lt.tolist() == lt_lst, "Comparison function check (lt)") + assert_(le.tolist() == le_lst, "Comparison function check (le)") + assert_(gt.tolist() == gt_lst, "Comparison function check (gt)") + assert_(ge.tolist() == ge_lst, "Comparison function check (ge)") + assert_(eq.tolist() == eq_lst, "Comparison function check (eq)") + assert_(ne.tolist() == ne_lst, "Comparison function check (ne)") + + # Sequence: Binary, Scalar1 and Scalar2 + 
compare(lt_b, le_b, gt_b, ge_b, eq_b, ne_b, lt_b_lst, le_b_lst, + gt_b_lst, ge_b_lst, eq_b_lst, ne_b_lst) + compare(lt_s1, le_s1, gt_s1, ge_s1, eq_s1, ne_s1, lt_s1_lst, le_s1_lst, + gt_s1_lst, ge_s1_lst, eq_s1_lst, ne_s1_lst) + compare(lt_s2, le_s2, gt_s2, ge_s2, eq_s2, ne_s2, lt_s2_lst, le_s2_lst, + gt_s2_lst, ge_s2_lst, eq_s2_lst, ne_s2_lst) + def test_ignore_object_identity_in_equal(self): # Check comparing identical objects whose comparison # is not a simple boolean, e.g., arrays that are compared elementwise. From ff03e72646ee0e4727a1cc333d5ea4945724d44b Mon Sep 17 00:00:00 2001 From: Rafael CF Sousa Date: Mon, 23 May 2022 16:19:47 -0300 Subject: [PATCH 2/6] TST: Rewrite the test that checks the universal SIMD intrinsic (pack) --- numpy/core/tests/test_umath.py | 78 +++++++++++----------------------- 1 file changed, 24 insertions(+), 54 deletions(-) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 01ef9365e7de..7b6e2ee92276 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -185,9 +185,19 @@ def __array_wrap__(self, arr, context): class TestComparisons: + import operator + @pytest.mark.parametrize('dtype', np.sctypes['uint'] + np.sctypes['int'] + np.sctypes['float'] + [np.bool_]) - def test_comparison_functions(self, dtype): + @pytest.mark.parametrize('py_comp,np_comp', [ + (operator.lt, np.less), + (operator.le, np.less_equal), + (operator.gt, np.greater), + (operator.ge, np.greater_equal), + (operator.eq, np.equal), + (operator.ne, np.not_equal) + ]) + def test_comparison_functions(self, dtype, py_comp, np_comp): # Initialize input arrays if dtype == np.bool_: a = np.random.choice(a=[False, True], size=1000) @@ -197,69 +207,29 @@ def test_comparison_functions(self, dtype): a = np.random.randint(low=1, high=10, size=1000).astype(dtype) b = np.random.randint(low=1, high=10, size=1000).astype(dtype) scalar = 5 - scalar_np = np.dtype(dtype).type(scalar) + np_scalar = np.dtype(dtype).type(scalar) a_lst = a.tolist() b_lst = b.tolist() # (Binary) Comparison (x1=array, x2=array) - lt_b = np.less(a, b) - le_b = np.less_equal(a, b) - gt_b = np.greater(a, b) - ge_b = np.greater_equal(a, b) - eq_b = np.equal(a, b) - ne_b = np.not_equal(a, b) - lt_b_lst = [x < y for x, y in zip(a_lst, b_lst)] - le_b_lst = [x <= y for x, y in zip(a_lst, b_lst)] - gt_b_lst = [x > y for x, y in zip(a_lst, b_lst)] - ge_b_lst = [x >= y for x, y in zip(a_lst, b_lst)] - eq_b_lst = [x == y for x, y in zip(a_lst, b_lst)] - ne_b_lst = [x != y for x, y in zip(a_lst, b_lst)] + comp_b = np_comp(a, b) + comp_b_list = [py_comp(x, y) for x, y in zip(a_lst, b_lst)] # (Scalar1) Comparison (x1=scalar, x2=array) - lt_s1 = np.less(scalar_np, b) - le_s1 = np.less_equal(scalar_np, b) - gt_s1 = np.greater(scalar_np, b) - ge_s1 = np.greater_equal(scalar_np, b) - eq_s1 = np.equal(scalar_np, b) - ne_s1 = np.not_equal(scalar_np, b) - lt_s1_lst = [scalar < x for x in b_lst] - le_s1_lst = [scalar <= x for x in b_lst] - gt_s1_lst = [scalar > x for x in b_lst] - ge_s1_lst = [scalar >= x for x in b_lst] - eq_s1_lst = [scalar == x for x in b_lst] - ne_s1_lst = [scalar != x for x in b_lst] + comp_s1 = np_comp(np_scalar, b) + comp_s1_list = [py_comp(scalar, x) for x in b_lst] # (Scalar2) Comparison (x1=array, x2=scalar) - lt_s2 = np.less(a, scalar_np) - le_s2 = np.less_equal(a, scalar_np) - gt_s2 = np.greater(a, scalar_np) - ge_s2 = np.greater_equal(a, scalar_np) - eq_s2 = np.equal(a, scalar_np) - ne_s2 = np.not_equal(a, scalar_np) - lt_s2_lst = [x < scalar for x in a_lst] - 
le_s2_lst = [x <= scalar for x in a_lst] - gt_s2_lst = [x > scalar for x in a_lst] - ge_s2_lst = [x >= scalar for x in a_lst] - eq_s2_lst = [x == scalar for x in a_lst] - ne_s2_lst = [x != scalar for x in a_lst] - - # Compare comparison functions (Python vs NumPy) using native Python - def compare(lt, le, gt, ge, eq, ne, lt_lst, le_lst, gt_lst, ge_lst, - eq_lst, ne_lst): - assert_(lt.tolist() == lt_lst, "Comparison function check (lt)") - assert_(le.tolist() == le_lst, "Comparison function check (le)") - assert_(gt.tolist() == gt_lst, "Comparison function check (gt)") - assert_(ge.tolist() == ge_lst, "Comparison function check (ge)") - assert_(eq.tolist() == eq_lst, "Comparison function check (eq)") - assert_(ne.tolist() == ne_lst, "Comparison function check (ne)") + comp_s2 = np_comp(a, np_scalar) + comp_s2_list = [py_comp(x, scalar) for x in a_lst] # Sequence: Binary, Scalar1 and Scalar2 - compare(lt_b, le_b, gt_b, ge_b, eq_b, ne_b, lt_b_lst, le_b_lst, - gt_b_lst, ge_b_lst, eq_b_lst, ne_b_lst) - compare(lt_s1, le_s1, gt_s1, ge_s1, eq_s1, ne_s1, lt_s1_lst, le_s1_lst, - gt_s1_lst, ge_s1_lst, eq_s1_lst, ne_s1_lst) - compare(lt_s2, le_s2, gt_s2, ge_s2, eq_s2, ne_s2, lt_s2_lst, le_s2_lst, - gt_s2_lst, ge_s2_lst, eq_s2_lst, ne_s2_lst) + assert_(comp_b.tolist() == comp_b_list, + f"Failed comparision ({py_comp.__name__})") + assert_(comp_s1.tolist() == comp_s1_list, + f"Failed comparision ({py_comp.__name__})") + assert_(comp_s2.tolist() == comp_s2_list, + f"Failed comparision ({py_comp.__name__})") def test_ignore_object_identity_in_equal(self): # Check comparing identical objects whose comparison From 09b22a118466ff85fe365f451139cc3da2e8bc43 Mon Sep 17 00:00:00 2001 From: Rafael CF Sousa Date: Mon, 23 May 2022 16:20:47 -0300 Subject: [PATCH 3/6] SIMD, ENH: Use logical bitwise to implement comparison functions (bool_) --- numpy/core/src/_simd/_simd.dispatch.c.src | 6 +++ numpy/core/src/common/simd/avx2/operators.h | 5 +++ numpy/core/src/common/simd/avx512/operators.h | 12 +++++ numpy/core/src/common/simd/neon/operators.h | 5 +++ numpy/core/src/common/simd/sse/operators.h | 5 +++ numpy/core/src/common/simd/vsx/operators.h | 5 +++ .../src/umath/loops_comparison.dispatch.c.src | 25 ++++++++--- numpy/core/tests/test_simd.py | 45 +++++++++++++++++-- 8 files changed, 99 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index ad8c538e4cfb..f8a0a3196c9d 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -462,6 +462,9 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) /**end repeat**/ +SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8) +SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8) +SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8) /*************************** * Conversions ***************************/ @@ -710,6 +713,9 @@ SIMD_INTRIN_DEF(or_@bsfx@) SIMD_INTRIN_DEF(xor_@bsfx@) SIMD_INTRIN_DEF(not_@bsfx@) /**end repeat**/ +SIMD_INTRIN_DEF(andc_b8) +SIMD_INTRIN_DEF(orc_b8) +SIMD_INTRIN_DEF(xnor_b8) /*************************** * Conversions ***************************/ diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 5fc7719e916d..0e77fc6bef99 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -114,6 +114,11 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c) 
#define npyv_not_b32 npyv_not_u8 #define npyv_not_b64 npyv_not_u8 +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) +#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index d53932fa8726..8c98b72ddd5a 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -152,6 +152,9 @@ #define npyv_xor_b16 _kxor_mask32 #define npyv_not_b8 _knot_mask64 #define npyv_not_b16 _knot_mask32 + #define npyv_andc_b8 _kandn_mask64 + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_xnor_b8 _kxnor_mask64 #elif defined(NPY_HAVE_AVX512BW) NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b) { return a & b; } @@ -169,6 +172,12 @@ { return ~a; } NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a) { return ~a; } + NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b) + { return (~a) & b; } + NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b) + { return (~a) | b; } + NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b) + { return ~(a ^ b); } #else #define npyv_and_b8 _mm512_and_si512 #define npyv_and_b16 _mm512_and_si512 @@ -178,6 +187,9 @@ #define npyv_xor_b16 _mm512_xor_si512 #define npyv_not_b8 npyv_not_u8 #define npyv_not_b16 npyv_not_u8 + #define npyv_andc_b8 _mm512_andnot_si512 + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) #endif #define npyv_and_b32 _mm512_kand diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index b43ba36537e9..6c155fc67bc1 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -116,6 +116,11 @@ #define npyv_not_b32 vmvnq_u32 #define npyv_not_b64 npyv_not_u64 +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) vbicq_u8(B, A) +#define npyv_orc_b8(A, B) vornq_u8(B, A) +#define npyv_xnor_b8 vceqq_u8 + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 51c84fb4e9d9..51bdca356988 100644 --- a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -115,6 +115,11 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) #define npyv_not_b32 npyv_not_u8 #define npyv_not_b64 npyv_not_u8 +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) +#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h index d34057ff3f38..fc29ba920905 100644 --- a/numpy/core/src/common/simd/vsx/operators.h +++ b/numpy/core/src/common/simd/vsx/operators.h @@ -133,6 +133,11 @@ NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) { return vec_nor(a, a); } +// ANDC, ORC and XNOR +#define npyv_andc_b8(A, B) vec_andc(B, A) +#define npyv_orc_b8(A, B) vec_orc(B, A) +#define npyv_xnor_b8 vec_eqv + /*************************** * Comparison ***************************/ diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src 
b/numpy/core/src/umath/loops_comparison.dispatch.c.src index e5518afb36f6..07bbf035454e 100644 --- a/numpy/core/src/umath/loops_comparison.dispatch.c.src +++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -207,7 +207,8 @@ static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) /**begin repeat * #kind = equal, not_equal, less, less_equal, greater, greater_equal# * #OP = ==, !=, <, <=, >, >=# - * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + * #VOP = xnor, xor, andc, orc, andc, orc# + * #rev = 0, 0, 0, 0, 1, 1# */ #if NPY_SIMD @@ -225,7 +226,11 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len) // Whatever element in src != 0x0 is converted to 0xFF npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero); npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero); - npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), npyv_cvt_u8_b8(b)); +#if !@rev@ + npyv_b8 c = npyv_@VOP@_b8(a, b); +#else + npyv_b8 c = npyv_@VOP@_b8(b, a); +#endif npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } @@ -243,13 +248,17 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_u8 a = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_b8 a = npyv_cmpneq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_u8(a, npyv_cvt_u8_b8(b)); +#if !@rev@ + npyv_b8 c = npyv_@VOP@_b8(a, b); +#else + npyv_b8 c = npyv_@VOP@_b8(b, a); +#endif npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } @@ -266,13 +275,17 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_u8 b = npyv_cvt_u8_b8(npyv_cmpneq_u8(vscalar, vzero)); + const npyv_b8 b = npyv_cmpneq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero); - npyv_b8 c = npyv_@VOP@_u8(npyv_cvt_u8_b8(a), b); +#if !@rev@ + npyv_b8 c = npyv_@VOP@_b8(a, b); +#else + npyv_b8 c = npyv_@VOP@_b8(b, a); +#endif npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index b0b62b4a2e17..81d759ad42ea 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -156,6 +156,48 @@ def test_tobits(self): tobits = bin(self.tobits(vdata)) assert tobits == bin(data_bits) + def test_andc(self): + if self.sfx not in ("b8"): + return + andc_simd = getattr(self.npyv, f"andc_b8") + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + # check andc + sandc = [(~x & y) & 0xFF for x, y in zip(rdata, data)] + vandc = andc_simd(vrdata, vdata) + assert sandc == vandc + + def test_orc(self): + if self.sfx not in ("b8"): + return + orc_simd = getattr(self.npyv, f"orc_b8") + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + # check orc + sorc = [(~x | y) & 0xFF for x, y in zip(rdata, 
data)] + vorc = orc_simd(vrdata, vdata) + assert sorc == vorc + + def test_xnor(self): + if self.sfx not in ("b8"): + return + xnor_simd = getattr(self.npyv, f"xnor_b8") + # create the vectors + data = self._data() + rdata = self._data(reverse=True) + vdata = self._load_b(data) + vrdata = self._load_b(rdata) + # check orc + sxnor = [~(x ^ y) & 0xFF for x, y in zip(rdata, data)] + vxnor = xnor_simd(vrdata, vdata) + assert sxnor == vxnor + def test_pack(self): """ Pack multiple vectors into one @@ -166,15 +208,12 @@ def test_pack(self): """ if self.sfx not in ("b16", "b32", "b64"): return - # create the vectors data = self._data() rdata = self._data(reverse=True) vdata = self._load_b(data) vrdata = self._load_b(rdata) - pack_simd = getattr(self.npyv, f"pack_b8_{self.sfx}") - # for scalar execution, concatenate the elements of the multiple lists # into a single list (spack) and then iterate over the elements of # the created list applying a mask to capture the first byte of them. From d5d6eb567ec228bcc65da184240349239db4f080 Mon Sep 17 00:00:00 2001 From: Rafael CF Sousa Date: Fri, 27 May 2022 10:17:29 -0300 Subject: [PATCH 4/6] SIMD, ENH: Add universal intrinsic andc8 and use it to remove ifneq This commit also applies some techniques to reduce the size of the binary generated from the source loops_comparison.dispatch.c.src --- numpy/core/src/_simd/_simd.dispatch.c.src | 22 +- numpy/core/src/common/simd/avx2/operators.h | 7 +- .../core/src/common/simd/avx512/conversion.h | 4 +- numpy/core/src/common/simd/avx512/operators.h | 15 +- numpy/core/src/common/simd/neon/operators.h | 5 +- numpy/core/src/common/simd/sse/operators.h | 7 +- numpy/core/src/common/simd/vsx/operators.h | 5 +- .../src/umath/loops_comparison.dispatch.c.src | 201 ++++++++++-------- 8 files changed, 150 insertions(+), 116 deletions(-) diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index f8a0a3196c9d..0f3e4fc8f8da 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -31,6 +31,7 @@ * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ #if @simd_sup@ /*************************** @@ -332,6 +333,13 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@) SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) /**end repeat1**/ +#if @bitw8b_sup@ +SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_2(andc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) +SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) +SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) +#endif + /*************************** * Conversion ***************************/ @@ -462,9 +470,6 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) /**end repeat**/ -SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8) -SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8) -SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8) /*************************** * Conversions ***************************/ @@ -503,6 +508,7 @@ static PyMethodDef simd__intrinsics_methods[] = { * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ #if @simd_sup@ @@ -584,6 +590,13 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) 
SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ +#if @bitw8b_sup@ +SIMD_INTRIN_DEF(andc_@sfx@) +SIMD_INTRIN_DEF(andc_@bsfx@) +SIMD_INTRIN_DEF(orc_@bsfx@) +SIMD_INTRIN_DEF(xnor_@bsfx@) +#endif + /*************************** * Conversion ***************************/ @@ -713,9 +726,6 @@ SIMD_INTRIN_DEF(or_@bsfx@) SIMD_INTRIN_DEF(xor_@bsfx@) SIMD_INTRIN_DEF(not_@bsfx@) /**end repeat**/ -SIMD_INTRIN_DEF(andc_b8) -SIMD_INTRIN_DEF(orc_b8) -SIMD_INTRIN_DEF(xnor_b8) /*************************** * Conversions ***************************/ diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 0e77fc6bef99..99ef76dcb1dc 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -115,9 +115,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c) #define npyv_not_b64 npyv_not_u8 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B) -#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) -#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) +#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A) +#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) +#define npyv_xnor_b8 _mm256_cmpeq_epi8 /*************************** * Comparison diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h index a2f56b2ae654..474aee446b6a 100644 --- a/numpy/core/src/common/simd/avx512/conversion.h +++ b/numpy/core/src/common/simd/avx512/conversion.h @@ -104,8 +104,8 @@ NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { #ifdef NPY_HAVE_AVX512BW - __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a); - __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c); + __mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a); + __mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c); return npyv_pack_b8_b16(ab, cd); #else const __m512i idx = _mm512_setr_epi32( diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index 8c98b72ddd5a..b856b345ae97 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -140,6 +140,9 @@ #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A))) #endif +// ANDC +#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A) + /*************************** * Logical (boolean) ***************************/ @@ -152,8 +155,8 @@ #define npyv_xor_b16 _kxor_mask32 #define npyv_not_b8 _knot_mask64 #define npyv_not_b16 _knot_mask32 - #define npyv_andc_b8 _kandn_mask64 - #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_andc_b8(A, B) _kandn_mask64(B, A) + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) #define npyv_xnor_b8 _kxnor_mask64 #elif defined(NPY_HAVE_AVX512BW) NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b) @@ -173,9 +176,9 @@ NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a) { return ~a; } NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b) - { return (~a) & b; } + { return a & (~b); } NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b) - { return (~a) | b; } + { return a | (~b); } NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b) { return ~(a ^ b); } #else @@ -187,8 +190,8 @@ #define npyv_xor_b16 _mm512_xor_si512 #define npyv_not_b8 npyv_not_u8 #define npyv_not_b16 
npyv_not_u8 - #define npyv_andc_b8 _mm512_andnot_si512 - #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) + #define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A) + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) #endif diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index 6c155fc67bc1..a08fa5390d03 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -117,8 +117,9 @@ #define npyv_not_b64 npyv_not_u64 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) vbicq_u8(B, A) -#define npyv_orc_b8(A, B) vornq_u8(B, A) +#define npyv_andc_u8 vbicq_u8 +#define npyv_andc_b8 vbicq_u8 +#define npyv_orc_b8 vornq_u8 #define npyv_xnor_b8 vceqq_u8 /*************************** diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 51bdca356988..86dbcfea5eca 100644 --- a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -116,9 +116,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) #define npyv_not_b64 npyv_not_u8 // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B) -#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B) -#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B)) +#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A) +#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A) +#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) +#define npyv_xnor_b8 _mm_cmpeq_epi8 /*************************** * Comparison diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h index fc29ba920905..b01d8532159b 100644 --- a/numpy/core/src/common/simd/vsx/operators.h +++ b/numpy/core/src/common/simd/vsx/operators.h @@ -134,8 +134,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) { return vec_nor(a, a); } // ANDC, ORC and XNOR -#define npyv_andc_b8(A, B) vec_andc(B, A) -#define npyv_orc_b8(A, B) vec_orc(B, A) +#define npyv_andc_u8 vec_andc +#define npyv_andc_b8 vec_andc +#define npyv_orc_b8 vec_orc #define npyv_xnor_b8 vec_eqv /*************************** diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src index 07bbf035454e..01d58fbf9c92 100644 --- a/numpy/core/src/umath/loops_comparison.dispatch.c.src +++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** sse2 sse41 avx2 avx512f avx512_skx + ** sse2 sse42 avx2 avx512f avx512_skx ** vsx2 vsx3 ** neon **/ @@ -15,18 +15,23 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# + * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge# + * #kind = equal, not_equal, less, less_equal# + * #eq = 1, 0, 0, 0# + * #neq = 0, 1, 0, 0# + * #OP = ==, !=, <, <=# + * #VOP = cmpeq, cmpneq, cmplt, cmple# */ - -#if @VECTOR@ +#if @VECTOR@ && !((@eq@ || @neq@) && @signed@) static void 
simd_binary_@kind@_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; @@ -205,10 +210,11 @@ static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len) /**end repeat**/ /**begin repeat - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - * #VOP = xnor, xor, andc, orc, andc, orc# - * #rev = 0, 0, 0, 0, 1, 1# + * #kind = equal, not_equal, less, less_equal# + * #eq = 1, 0, 0, 0# + * #neq = 0, 1, 0, 0# + * #OP = ==, !=, <, <=# + * #VOP = xnor, xor, andc, orc# */ #if NPY_SIMD @@ -224,14 +230,10 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len) for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) { // Whatever element in src != 0x0 is converted to 0xFF - npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero); - npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero); -#if !@rev@ + npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero); + npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); -#else - npyv_b8 c = npyv_@VOP@_b8(b, a); -#endif - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src1, ++src2, ++dst) { @@ -248,18 +250,14 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 a = npyv_cmpneq_u8(vscalar, vzero); + const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero); -#if !@rev@ + npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); -#else - npyv_b8 c = npyv_@VOP@_b8(b, a); -#endif - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -275,18 +273,14 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2]; const npyv_u8 vzero = npyv_setall_u8(0x0); const npyv_u8 vscalar = npyv_setall_u8(scalar); - const npyv_b8 b = npyv_cmpneq_u8(vscalar, vzero); + const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero); const npyv_u8 truemask = npyv_setall_u8(0x1); const int vstep = npyv_nlanes_u8; for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero); -#if !@rev@ + npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); -#else - npyv_b8 c = npyv_@VOP@_b8(b, a); -#endif - npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -297,73 +291,73 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) #endif /**end repeat**/ - /**begin repeat * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, npy_ulonglong, npy_longlong, npy_float, npy_double# * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# + * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = 
equal, not_equal, less, less_equal, greater, greater_equal# + * #kind = equal, not_equal, less, less_equal# + * #eq = 1, 0, 0, 0# + * #neq = 0, 1, 0, 0# + * #OP = ==, !=, <, <=# */ -static NPY_INLINE int +#if !((@eq@ || @neq@) && @signed@) +static NPY_INLINE void run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps) { #if @VECTOR@ /* argument one scalar */ if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); - return 1; + return; } /* argument two scalar */ else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); - return 1; + return; } else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { simd_binary_@kind@_@sfx@(args, dimensions[0]); - return 1; + return; } #endif - return 0; -} -/**end repeat1**/ -/**end repeat**/ -/* - ***************************************************************************** - ** BOOLEAN LOOPS ** - ***************************************************************************** - */ - -/**begin repeat - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (!run_binary_simd_@kind@_b8(args, dimensions, steps)) { - BINARY_LOOP { - npy_bool in1 = *((npy_bool *)ip1) != 0; - npy_bool in2 = *((npy_bool *)ip2) != 0; - *((npy_bool *)op1)= in1 @OP@ in2; - } + BINARY_LOOP { +#if @bool@ + npy_bool in1 = *((npy_bool *)ip1) != 0; + npy_bool in2 = *((npy_bool *)ip2) != 0; +#else + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; +#endif + *((npy_bool *)op1) = in1 @OP@ in2; } } +#endif +/**end repeat1**/ /**end repeat**/ +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ + /* - ***************************************************************************** - ** INTEGER LOOPS - ***************************************************************************** + * In order to reduce the size of the binary generated from this source, the + * following rules are applied: 1) each data type implements its function + * 'greater' as a call to the function 'less' but with the arguments swapped, + * the same applies to the function 'greater_equal', which is implemented + * with a call to the function 'less_equal', and 2) for the integer datatypes + * of the same size (eg 8-bit), a single kernel of the functions 'equal' and + * 'not_equal' is used to implement both signed and unsigned types. 
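+ * For example, @TYPE@_greater(args) is evaluated by swapping args[0]/args[1]
+ * (and steps[0]/steps[1]) and then calling the 'less' kernel, as done by the
+ * dispatch functions below.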
*/ /**begin repeat * Signed and Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_byte, npy_short, npy_int, npy_long, npy_longlong# * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, * BYTE, SHORT, INT, LONG, LONGLONG# * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG, @@ -371,11 +365,13 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1# */ #undef TO_SIMD_SFX +#undef TO_SIMD_UTYPE #if 0 /**begin repeat1 * #len = 8, 16, 32, 64# */ #elif NPY_BITSOF_@STYPE@ == @len@ + #define TO_SIMD_UTYPE(X) X##_u@len@ #if @signed@ #define TO_SIMD_SFX(X) X##_s@len@ #else @@ -385,50 +381,71 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) #endif /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# + * #kind = greater, greater_equal# + * #kind_to = less, less_equal# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } + char *nargs[3] = {args[1], args[0], args[2]}; + npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; + TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps); } /**end repeat1**/ -/**end repeat**/ -/* - ***************************************************************************** - ** FLOAT LOOPS ** - ***************************************************************************** +/**begin repeat1 + * #kind = less, less_equal# */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps); +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps); +} +/**end repeat1**/ +/**end repeat**/ /**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# + * Boolean & Float types + * #TYPE = BOOL, FLOAT, DOUBLE# + * #sfx = b8, f32, f64# + * #fp = 0, 1, 1# */ /**begin repeat1 - * #kind = equal, not_equal, less, less_equal, greater, greater_equal# - * #OP = ==, !=, <, <=, >, >=# + * #kind = greater, greater_equal# + * #kind_to = less, less_equal# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_binary_simd_@kind@_@sfx@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } + char *nargs[3] = {args[1], args[0], args[2]}; + npy_intp nsteps[3] = {steps[1], steps[0], steps[2]}; + run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps); +#if @fp@ npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + run_binary_simd_@kind@_@sfx@(args, 
dimensions, steps);
+#if @fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 /**end repeat1**/
 /**end repeat**/

From dc4a9e39dfb13aecb61d955445538838fbb2233d Mon Sep 17 00:00:00 2001
From: Rafael CF Sousa
Date: Fri, 27 May 2022 12:34:43 -0300
Subject: [PATCH 5/6] TST: Add test for andc (u8)

This commit also rewrites the tests andc, orc and xnor
---
 numpy/core/tests/test_simd.py | 67 +++++++++++++----------------------
 1 file changed, 24 insertions(+), 43 deletions(-)

diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 81d759ad42ea..f33db95fc415 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -126,7 +126,8 @@ def test_operators_logical(self):
         """
         Logical operations for boolean types.
         Test intrinsics:
-            npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX
+            npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX,
+            npyv_andc_b8, npyv_orc_b8, npyv_xnor_b8
         """
         data_a = self._data()
         data_b = self._data(reverse=True)
@@ -148,6 +149,22 @@ def test_operators_logical(self):
         vnot = getattr(self, "not")(vdata_a)
         assert vnot == data_b
 
+        # among the boolean types, andc, orc and xnor only support b8
+        if self.sfx not in ("b8"):
+            return
+
+        data_andc = [(a & ~b) & 0xFF for a, b in zip(data_a, data_b)]
+        vandc = getattr(self, "andc")(vdata_a, vdata_b)
+        assert data_andc == vandc
+
+        data_orc = [(a | ~b) & 0xFF for a, b in zip(data_a, data_b)]
+        vorc = getattr(self, "orc")(vdata_a, vdata_b)
+        assert data_orc == vorc
+
+        data_xnor = [~(a ^ b) & 0xFF for a, b in zip(data_a, data_b)]
+        vxnor = getattr(self, "xnor")(vdata_a, vdata_b)
+        assert data_xnor == vxnor
+
     def test_tobits(self):
         data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
         for data in (self._data(), self._data(reverse=True)):
@@ -156,48 +173,6 @@ def test_tobits(self):
             tobits = bin(self.tobits(vdata))
             assert tobits == bin(data_bits)
 
-    def test_andc(self):
-        if self.sfx not in ("b8"):
-            return
-        andc_simd = getattr(self.npyv, f"andc_b8")
-        # create the vectors
-        data = self._data()
-        rdata = self._data(reverse=True)
-        vdata = self._load_b(data)
-        vrdata = self._load_b(rdata)
-        # check andc
-        sandc = [(~x & y) & 0xFF for x, y in zip(rdata, data)]
-        vandc = andc_simd(vrdata, vdata)
-        assert sandc == vandc
-
-    def test_orc(self):
-        if self.sfx not in ("b8"):
-            return
-        orc_simd = getattr(self.npyv, f"orc_b8")
-        # create the vectors
-        data = self._data()
-        rdata = self._data(reverse=True)
-        vdata = self._load_b(data)
-        vrdata = self._load_b(rdata)
-        # check orc
-        sorc = [(~x | y) & 0xFF for x, y in zip(rdata, data)]
-        vorc = orc_simd(vrdata, vdata)
-        assert sorc == vorc
-
-    def test_xnor(self):
-        if self.sfx not in ("b8"):
-            return
-        xnor_simd = getattr(self.npyv, f"xnor_b8")
-        # create the vectors
-        data = self._data()
-        rdata = self._data(reverse=True)
-        vdata = self._load_b(data)
-        vrdata = self._load_b(rdata)
-        # check orc
-        sxnor = [~(x ^ y) & 0xFF for x, y in zip(rdata, data)]
-        vxnor = xnor_simd(vrdata, vdata)
-        assert sxnor == vxnor
-
     def test_pack(self):
         """
         Pack multiple vectors into one
@@ -865,6 +840,12 @@ def test_operators_logical(self):
         vnot = cast(getattr(self, "not")(vdata_a))
         assert vnot == data_not
 
+        if self.sfx not in ("u8"):
+            return
+        data_andc = [a & ~b for a, b in zip(data_cast_a, data_cast_b)]
+        vandc = cast(getattr(self, "andc")(vdata_a, vdata_b))
+        assert vandc == data_andc
+
     def test_conversion_boolean(self):
         bsfx = "b" + self.sfx[1:]
         to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))

From 2701a5a38e1521e5dd66e343c81bf075c0c30d17 Mon Sep 17 00:00:00 2001
From: Rafael CF Sousa
Date: Sun, 29 May 2022 21:29:02 -0300
Subject: [PATCH 6/6] DOC: Add a release note for the comparison functions

The PR #21483 improves the execution time of the comparison functions by using
universal intrinsics
---
 doc/release/upcoming_changes/21483.performance.rst | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 doc/release/upcoming_changes/21483.performance.rst

diff --git a/doc/release/upcoming_changes/21483.performance.rst b/doc/release/upcoming_changes/21483.performance.rst
new file mode 100644
index 000000000000..f9456d69f9ef
--- /dev/null
+++ b/doc/release/upcoming_changes/21483.performance.rst
@@ -0,0 +1,7 @@
+Faster comparison operators
+----------------------------
+The comparison functions (``numpy.equal``, ``numpy.not_equal``, ``numpy.less``,
+``numpy.less_equal``, ``numpy.greater`` and ``numpy.greater_equal``) are now
+much faster as they are vectorized with universal intrinsics. For a CPU with
+the AVX512BW SIMD extension, the performance gain is up to 2.57x, 1.65x and
+19.15x for integer, float and boolean data types, respectively (with N=50000).
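As an informal way to reproduce the measurement on a given machine, a small timeit
sketch along the following lines can be run before and after this series. It is not
part of the patch; the dtypes, the array length N and the call count below are
arbitrary illustrative choices, and absolute timings depend on the CPU and on the
SIMD features NumPy was built with:

    import timeit
    import numpy as np

    N = 50000
    for dtype in ("int64", "float64", "bool"):
        a = np.ones(N, dtype=dtype)
        b = np.zeros(N, dtype=dtype)
        # time the binary comparison ufunc that this series vectorizes
        per_call = timeit.timeit(lambda: np.less(a, b), number=10_000) / 10_000
        print(f"np.less [{dtype}]: {per_call * 1e6:.2f} us per call")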