Skip to content

Commit

Permalink
WIP::ENH:SIMD Improve the performance of comparison operators
Browse files Browse the repository at this point in the history
  TODO
  • Loading branch information
seiko2plus committed Aug 3, 2020
1 parent d161288 commit 95c485a
Show file tree
Hide file tree
Showing 7 changed files with 586 additions and 519 deletions.
16 changes: 8 additions & 8 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,55 +425,55 @@ def english_upper(s):
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'greater_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'less':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'less_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'not_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.not_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'logical_and':
Ufunc(2, 1, True_,
docstrings.get('numpy.core.umath.logical_and'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
TD(nodatetime_or_obj, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
TD(O, f='npy_ObjectLogicalAnd'),
TD(O, f='npy_ObjectLogicalAnd', out='?'),
),
Expand All @@ -489,7 +489,7 @@ def english_upper(s):
Ufunc(2, 1, False_,
docstrings.get('numpy.core.umath.logical_or'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
TD(nodatetime_or_obj, out='?', dispatch=[('loops_cmp', nodatetime_or_obj)]),
TD(O, f='npy_ObjectLogicalOr'),
TD(O, f='npy_ObjectLogicalOr', out='?'),
),
Expand Down
1 change: 1 addition & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,7 @@ def generate_umath_c(ext, build_dir):
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_cmp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
Expand Down
16 changes: 8 additions & 8 deletions numpy/core/src/common/simd/avx512/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
#define npyv_and_s32 _mm512_and_si512
#define npyv_and_u64 _mm512_and_si512
#define npyv_and_s64 _mm512_and_si512
#define npyv_and_b32 _kand_mask16
#define npyv_and_b32 _mm512_kand
#ifdef NPY_HAVE_AVX512BW
#define npyv_and_b8 _kand_mask64
#define npyv_and_b16 _kand_mask32
Expand All @@ -96,7 +96,7 @@
#define npyv_and_f32 _mm512_and_ps
#define npyv_and_f64 _mm512_and_pd
#else
#define npyv_and_b64(A, B) ((__mmask8)_kand_mask16((__mmask16)(A), (__mmask16)(B)))
#define npyv_and_b64(A, B) ((__mmask8)_mm512_kand((__mmask16)(A), (__mmask16)(B)))
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_and_f32, _mm512_and_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_and_f64, _mm512_and_si512)
#endif
Expand All @@ -109,7 +109,7 @@
#define npyv_or_s32 _mm512_or_si512
#define npyv_or_u64 _mm512_or_si512
#define npyv_or_s64 _mm512_or_si512
#define npyv_or_b32 _kor_mask16
#define npyv_or_b32 _mm512_kor
#ifdef NPY_HAVE_AVX512BW
#define npyv_or_b8 _kor_mask64
#define npyv_or_b16 _kor_mask32
Expand All @@ -122,7 +122,7 @@
#define npyv_or_f32 _mm512_or_ps
#define npyv_or_f64 _mm512_or_pd
#else
#define npyv_or_b64(A, B) ((__mmask8)_kor_mask16((__mmask16)(A), (__mmask16)(B)))
#define npyv_or_b64(A, B) ((__mmask8)_mm512_kor((__mmask16)(A), (__mmask16)(B)))
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_or_f32, _mm512_or_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_or_f64, _mm512_or_si512)
#endif
Expand All @@ -135,7 +135,7 @@
#define npyv_xor_s32 _mm512_xor_si512
#define npyv_xor_u64 _mm512_xor_si512
#define npyv_xor_s64 _mm512_xor_si512
#define npyv_xor_b32 _kxor_mask16
#define npyv_xor_b32 _mm512_kxor
#ifdef NPY_HAVE_AVX512BW
#define npyv_xor_b8 _kxor_mask64
#define npyv_xor_b16 _kxor_mask32
Expand All @@ -148,7 +148,7 @@
#define npyv_xor_f32 _mm512_xor_ps
#define npyv_xor_f64 _mm512_xor_pd
#else
#define npyv_xor_b64(A, B) ((__mmask8)_kxor_mask16((__mmask16)(A), (__mmask16)(B)))
#define npyv_xor_b64(A, B) ((__mmask8)_mm512_kxor((__mmask16)(A), (__mmask16)(B)))
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_xor_f32, _mm512_xor_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_xor_f64, _mm512_xor_si512)
#endif
Expand All @@ -161,7 +161,7 @@
#define npyv_not_s32 npyv_not_u8
#define npyv_not_u64 npyv_not_u8
#define npyv_not_s64 npyv_not_u8
#define npyv_not_b32 _knot_mask16
#define npyv_not_b32 _mm512_knot
#ifdef NPY_HAVE_AVX512BW
#define npyv_not_b8 _knot_mask64
#define npyv_not_b16 _knot_mask32
Expand All @@ -174,7 +174,7 @@
#define npyv_not_f32(A) _mm512_xor_ps(A, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
#define npyv_not_f64(A) _mm512_xor_pd(A, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
#else
#define npyv_not_b64(A, B) ((__mmask8)_knot_mask16((__mmask16)(A), (__mmask16)(B)))
/*
 * Invert a b64 mask: widen it to __mmask16 so _mm512_knot can be used, then
 * narrow the result back to __mmask8 (the cast discards the upper bits that
 * the NOT sets in the widened value).
 */
#define npyv_not_b64(A) ((__mmask8)_mm512_knot((__mmask16)(A)))
#define npyv_not_f32(A) _mm512_castsi512_ps(npyv_not_u32(_mm512_castps_si512(A)))
#define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
#endif
Expand Down
186 changes: 0 additions & 186 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -397,96 +397,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
*****************************************************************************
*/

/**begin repeat
* #kind = equal, not_equal, greater, greater_equal, less, less_equal#
* #OP = ==, !=, >, >=, <, <=#
**/

/*
 * Element-wise boolean comparison loop. One copy is generated for each
 * comparison kind/operator pair listed in the surrounding repeat block.
 * The result of the comparison is stored as an npy_bool.
 */
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
/* `!= 0` maps any non-zero input byte to 1 so both operands are 0/1 */
npy_bool in1 = *((npy_bool *)ip1) != 0;
npy_bool in2 = *((npy_bool *)ip2) != 0;
*((npy_bool *)op1)= in1 @OP@ in2;
}
}
/**end repeat**/


/**begin repeat
* #kind = logical_and, logical_or#
* #OP = &&, ||#
* #SC = ==, !=#
* #and = 1, 0#
**/

/*
 * Boolean logical_and / logical_or loop (one copy per kind in the
 * surrounding repeat block).
 *
 * Two cases are handled:
 *  - reduction (IS_BINARY_REDUCE): the np.all() / np.any() style search
 *    described in the comments below, with a SIMD helper when
 *    NPY_HAVE_SSE2_INTRINSICS is defined and a libc memchr/memcmp path
 *    otherwise; if neither applies, a scalar reduce loop is used.
 *  - element-wise: a SIMD helper is tried first, then a scalar loop.
 */
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if(IS_BINARY_REDUCE) {
#ifdef NPY_HAVE_SSE2_INTRINSICS
/*
* stick with our variant for more reliable performance, only known
* platform which outperforms it by ~20% is an i7 with glibc 2.17
*/
if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
return;
}
#else
/* for now only use libc on 32-bit/non-x86 */
if (steps[1] == 1) {
npy_bool * op = (npy_bool *)args[0];
#if @and@
/* np.all(), search for a zero (false) */
if (*op) {
*op = memchr(args[1], 0, dimensions[0]) == NULL;
}
#else
/*
* np.any(), search for a non-zero (true) via comparing against
* zero blocks, memcmp is faster than memchr on SSE4 machines
* with glibc >= 2.12 and memchr can only check for equal 1
*/
static const npy_bool zero[4096]; /* zero by C standard */
npy_uintp i, n = dimensions[0];

/* compare whole sizeof(zero)-byte blocks; stop once a non-zero is found */
for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
*op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
}
/* compare the remaining tail that is shorter than one block */
if (!*op && n - i > 0) {
*op = memcmp(&args[1][i], zero, n - i) != 0;
}
#endif
return;
}
#endif
/* this `else` pairs with whichever `if` the preprocessor kept above */
else {
BINARY_REDUCE_LOOP(npy_bool) {
const npy_bool in2 = *(npy_bool *)ip2;
io1 = io1 @OP@ in2;
/* short-circuit: leave the loop once the accumulator hits the tested value */
if (io1 @SC@ 0) {
break;
}
}
*((npy_bool *)iop1) = io1;
}
}
else {
if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
return;
}
else {
/* scalar fallback when the SIMD helper reports it did not run */
BINARY_LOOP {
const npy_bool in1 = *(npy_bool *)ip1;
const npy_bool in2 = *(npy_bool *)ip2;
*((npy_bool *)op1) = in1 @OP@ in2;
}
}
}
}
/**end repeat**/

/**begin repeat
* #kind = absolute, logical_not#
* #OP = !=, ==#
Expand Down Expand Up @@ -686,27 +596,6 @@ void
BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
}


/**begin repeat2
* #kind = equal, not_equal, greater, greater_equal, less, less_equal,
* logical_and, logical_or#
* #OP = ==, !=, >, >=, <, <=, &&, ||#
*/

/*
 * Comparison / logical loop for the type supplied by the enclosing repeat
 * block; only compiled when the CHK template value is non-zero.
 * NOTE(review): BINARY_LOOP_FAST is defined elsewhere; it presumably takes
 * the input type, the output type (npy_bool here) and the per-element
 * statement -- confirm against its definition.
 */
#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
/*
* gcc vectorization of this is not good (PR60575) but manual integer
* vectorization is too tedious to be worthwhile
*/
BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
}
#endif

/**end repeat2**/

#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
Expand Down Expand Up @@ -1839,25 +1728,6 @@ NPY_NO_EXPORT void
}
/**end repeat1**/

/**begin repeat1
* #kind = equal, not_equal, less, less_equal, greater, greater_equal,
* logical_and, logical_or#
* #OP = ==, !=, <, <=, >, >=, &&, ||#
*/
/*
 * Comparison / logical loop for the type supplied by the enclosing repeat
 * block. A SIMD helper is tried first; when it returns zero the scalar
 * loop below computes each npy_bool result instead.
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
/* scalar fallback */
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
*((npy_bool *)op1) = in1 @OP@ in2;
}
}
/*
 * NOTE(review): presumably discards floating-point status flags raised by
 * the comparisons above -- confirm against npy_clear_floatstatus_barrier.
 */
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
Expand Down Expand Up @@ -2254,27 +2124,6 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
}
/**end repeat**/

#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
#define _HALF_LOGICAL_OR(a,b) (!npy_half_iszero(a) || !npy_half_iszero(b))
/**begin repeat
* #kind = equal, not_equal, less, less_equal, greater,
* greater_equal, logical_and, logical_or#
* #OP = npy_half_eq, npy_half_ne, npy_half_lt, npy_half_le, npy_half_gt,
* npy_half_ge, _HALF_LOGICAL_AND, _HALF_LOGICAL_OR#
*/
/*
 * Half-precision comparison / logical loop. The substituted OP is a
 * two-argument predicate (an npy_half_* comparison or one of the
 * _HALF_LOGICAL_* macros from the surrounding repeat block); its result is
 * stored as an npy_bool.
 */
NPY_NO_EXPORT void
HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
const npy_half in1 = *(npy_half *)ip1;
const npy_half in2 = *(npy_half *)ip2;
*((npy_bool *)op1) = @OP@(in1, in2);
}
}
/**end repeat**/
#undef _HALF_LOGICAL_AND
#undef _HALF_LOGICAL_OR

NPY_NO_EXPORT void
HALF_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
Expand Down Expand Up @@ -2763,41 +2612,6 @@ NPY_NO_EXPORT void
}
}

/**begin repeat1
* #kind= greater, greater_equal, less, less_equal, equal, not_equal#
* #OP = CGT, CGE, CLT, CLE, CEQ, CNE#
*/
/*
 * Complex comparison loop. Each operand is read as two consecutive
 * floating-point values (index 0 and index 1, named real and imaginary
 * here) and passed to the substituted four-argument comparison macro,
 * which is defined elsewhere. The result is stored as an npy_bool.
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
*((npy_bool *)op1) = @OP@(in1r,in1i,in2r,in2i);
}
}
/**end repeat1**/

/**begin repeat1
#kind = logical_and, logical_or#
#OP1 = ||, ||#
#OP2 = &&, ||#
*/
/*
 * Complex logical_and / logical_or loop. Each operand is first reduced to
 * a truth value by combining its two components with the first substituted
 * operator; the two truth values are then combined with the second
 * substituted operator and stored as an npy_bool.
 */
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
const @ftype@ in1r = ((@ftype@ *)ip1)[0];
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
const @ftype@ in2r = ((@ftype@ *)ip2)[0];
const @ftype@ in2i = ((@ftype@ *)ip2)[1];
*((npy_bool *)op1) = (in1r @OP1@ in1i) @OP2@ (in2r @OP1@ in2i);
}
}
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
Expand Down

0 comments on commit 95c485a

Please sign in to comment.