SIMD: Use universal intrinsics to implement comparison functions #21483

Merged: 6 commits, merged on May 30, 2022
Changes from 3 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -225,6 +225,7 @@ numpy/core/src/umath/loops_exponent_log.dispatch.c
numpy/core/src/umath/loops_umath_fp.dispatch.c
numpy/core/src/umath/loops_hyperbolic.dispatch.c
numpy/core/src/umath/loops_modulo.dispatch.c
numpy/core/src/umath/loops_comparison.dispatch.c
# npysort module
numpy/core/src/npysort/x86-qsort.dispatch.c
numpy/core/src/npysort/x86-qsort.dispatch.*.cpp
19 changes: 18 additions & 1 deletion benchmarks/benchmarks/bench_ufunc.py
@@ -170,8 +170,25 @@ def time_divide_scalar2(self, dtype):
def time_divide_scalar2_inplace(self, dtype):
np.divide(self.d, 1, out=self.d)


class CustomComparison(Benchmark):
params = (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16,
np.uint32, np.uint64, np.float32, np.float64, np.bool_)
param_names = ['dtype']

def setup(self, dtype):
self.x = np.ones(50000, dtype=dtype)
self.y = np.ones(50000, dtype=dtype)
self.s = np.ones(1, dtype=dtype)

def time_less_than_binary(self, dtype):
(self.x < self.y)

def time_less_than_scalar1(self, dtype):
(self.s < self.x)

def time_less_than_scalar2(self, dtype):
(self.x < self.s)
Comment on lines +184 to +191

Member:
@mattip, the benchmarks here don't cover the rest of the operations. I'm not sure whether it's necessary to cover them or better to leave room for upcoming benchmarks, since the benchmark process is starting to become pretty slow.
Member:
+1 for minimal effective benchmarks. I don't think there is a need to repeatedly hit the same code multiple times.



class CustomScalarFloorDivideInt(Benchmark):
12 changes: 6 additions & 6 deletions numpy/core/code_generators/generate_umath.py
@@ -445,47 +445,47 @@ def english_upper(s):
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'greater_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'less':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'less_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'not_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.not_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
1 change: 1 addition & 0 deletions numpy/core/setup.py
@@ -1070,6 +1070,7 @@ def generate_umath_doc_header(ext, build_dir):
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
join('src', 'umath', 'loops_modulo.dispatch.c.src'),
join('src', 'umath', 'loops_comparison.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h'),
15 changes: 15 additions & 0 deletions numpy/core/src/_simd/_simd.dispatch.c.src
@@ -462,6 +462,9 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
/**end repeat**/
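// logical operations exclusive to boolean vectors: complement-and, complement-or, xnor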
SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8)
SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8)
SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8)
/***************************
* Conversions
***************************/
@@ -472,6 +475,10 @@ SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/

SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16)
SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32)
SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64,
vb64, vb64, vb64, vb64)

//#########################################################################
//## Attach module functions
@@ -706,6 +713,9 @@ SIMD_INTRIN_DEF(or_@bsfx@)
SIMD_INTRIN_DEF(xor_@bsfx@)
SIMD_INTRIN_DEF(not_@bsfx@)
/**end repeat**/
SIMD_INTRIN_DEF(andc_b8)
SIMD_INTRIN_DEF(orc_b8)
SIMD_INTRIN_DEF(xnor_b8)
/***************************
* Conversions
***************************/
@@ -716,6 +726,11 @@ SIMD_INTRIN_DEF(tobits_@bsfx@)
SIMD_INTRIN_DEF(tobits_@bsfx@)
/**end repeat**/

// Pack multiple vectors into one
SIMD_INTRIN_DEF(pack_b8_b16)
SIMD_INTRIN_DEF(pack_b8_b32)
SIMD_INTRIN_DEF(pack_b8_b64)

/************************************************************************/
{NULL, NULL, 0, NULL}
}; // PyMethodDef
44 changes: 44 additions & 0 deletions numpy/core/src/_simd/_simd_easyintrin.inc
@@ -153,6 +153,50 @@
return simd_arg_to_obj(&ret); \
}

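// an eight-argument variant of the wrappers above; needed to expose npyv_pack_b8_b64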
#define SIMD_IMPL_INTRIN_8(NAME, RET, IN0, IN1, IN2, IN3, \
IN4, IN5, IN6, IN7) \
static PyObject *simd__intrin_##NAME \
(PyObject* NPY_UNUSED(self), PyObject *args) \
{ \
simd_arg arg1 = {.dtype = simd_data_##IN0}; \
simd_arg arg2 = {.dtype = simd_data_##IN1}; \
simd_arg arg3 = {.dtype = simd_data_##IN2}; \
simd_arg arg4 = {.dtype = simd_data_##IN3}; \
simd_arg arg5 = {.dtype = simd_data_##IN4}; \
simd_arg arg6 = {.dtype = simd_data_##IN5}; \
simd_arg arg7 = {.dtype = simd_data_##IN6}; \
simd_arg arg8 = {.dtype = simd_data_##IN7}; \
if (!PyArg_ParseTuple( \
args, "O&O&O&O&O&O&O&O&:"NPY_TOSTRING(NAME), \
simd_arg_converter, &arg1, \
simd_arg_converter, &arg2, \
simd_arg_converter, &arg3, \
simd_arg_converter, &arg4, \
simd_arg_converter, &arg5, \
simd_arg_converter, &arg6, \
simd_arg_converter, &arg7, \
simd_arg_converter, &arg8 \
)) return NULL; \
simd_data data = {.RET = npyv_##NAME( \
arg1.data.IN0, arg2.data.IN1, \
arg3.data.IN2, arg4.data.IN3, \
arg5.data.IN4, arg6.data.IN5, \
arg7.data.IN6, arg8.data.IN7 \
)}; \
simd_arg_free(&arg1); \
simd_arg_free(&arg2); \
simd_arg_free(&arg3); \
simd_arg_free(&arg4); \
simd_arg_free(&arg5); \
simd_arg_free(&arg6); \
simd_arg_free(&arg7); \
simd_arg_free(&arg8); \
simd_arg ret = { \
.data = data, .dtype = simd_data_##RET \
}; \
return simd_arg_to_obj(&ret); \
}

/**
* Helper macros for repeating and expand a certain macro.
* Mainly used for converting a scalar to an immediate constant.
30 changes: 30 additions & 0 deletions numpy/core/src/common/simd/avx2/conversion.h
@@ -58,6 +58,36 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}

// pack two 16-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
__m256i ab = _mm256_packs_epi16(a, b);
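// packs works per 128-bit lane, so the 64-bit quarters land as a0,b0,a1,b1;
// the odd shuffle below restores the input order a0,a1,b0,b1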
return npyv256_shuffle_odd(ab);
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
__m256i ab = _mm256_packs_epi32(a, b);
__m256i cd = _mm256_packs_epi32(c, d);
__m256i abcd = npyv_pack_b8_b16(ab, cd);
return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0));
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
__m256i ab = _mm256_packs_epi32(a, b);
__m256i cd = _mm256_packs_epi32(c, d);
__m256i ef = _mm256_packs_epi32(e, f);
__m256i gh = _mm256_packs_epi32(g, h);
__m256i abcd = _mm256_packs_epi32(ab, cd);
__m256i efgh = _mm256_packs_epi32(ef, gh);
__m256i all = npyv256_shuffle_odd(_mm256_packs_epi16(abcd, efgh));
__m256i rev128 = _mm256_alignr_epi8(all, all, 8);
return _mm256_unpacklo_epi16(all, rev128);
}
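
/*
 * A condensed sketch (not taken from the sources) of how these pack helpers
 * combine with the comparison intrinsics in the new kernels: four 32-bit
 * comparison masks collapse into one byte-sized boolean vector, which is then
 * masked down to the canonical 0/1 npy_bool values. The real loop in
 * loops_comparison.dispatch.c.src also handles strides, scalar tails and the
 * remaining operators; the function name below is illustrative only.
 */
NPY_FINLINE void
sketch_less_f32(const npy_float *src1, const npy_float *src2,
                npy_bool *dst, npy_intp len)
{
    const int vstep = npyv_nlanes_f32;           // lanes per f32 vector
    const npyv_u8 truemask = npyv_setall_u8(1);  // npy_bool is stored as 0/1
    for (; len >= 4*vstep; len -= 4*vstep,
           src1 += 4*vstep, src2 += 4*vstep, dst += 4*vstep) {
        npyv_b32 m0 = npyv_cmplt_f32(npyv_load_f32(src1 + 0*vstep),
                                     npyv_load_f32(src2 + 0*vstep));
        npyv_b32 m1 = npyv_cmplt_f32(npyv_load_f32(src1 + 1*vstep),
                                     npyv_load_f32(src2 + 1*vstep));
        npyv_b32 m2 = npyv_cmplt_f32(npyv_load_f32(src1 + 2*vstep),
                                     npyv_load_f32(src2 + 2*vstep));
        npyv_b32 m3 = npyv_cmplt_f32(npyv_load_f32(src1 + 3*vstep),
                                     npyv_load_f32(src2 + 3*vstep));
        // four b32 masks -> one b8 mask, then 0xFF/0x00 lanes -> 1/0
        npyv_b8 r = npyv_pack_b8_b32(m0, m1, m2, m3);
        npyv_store_u8(dst, npyv_and_u8(truemask, npyv_cvt_u8_b8(r)));
    }
    for (; len > 0; --len, ++src1, ++src2, ++dst) {
        *dst = *src1 < *src2;
    }
}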

// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm256_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
5 changes: 5 additions & 0 deletions numpy/core/src/common/simd/avx2/operators.h
@@ -114,6 +114,11 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_not_b32 npyv_not_u8
#define npyv_not_b64 npyv_not_u8

// ANDC, ORC and XNOR
#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B)
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))

/***************************
* Comparison
***************************/
42 changes: 42 additions & 0 deletions numpy/core/src/common/simd/avx512/conversion.h
@@ -90,6 +90,48 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
return r;
}

// pack two 16-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
#ifdef NPY_HAVE_AVX512BW
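// kunpackd takes its low 32 bits from the second operand, so passing (b, a)
// keeps a's lanes in the low half of the resulting mask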
return _mm512_kunpackd((__mmask64)b, (__mmask64)a);
#else
const __m512i idx = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
return _mm512_permutexvar_epi64(idx, npyv512_packs_epi16(a, b));
#endif
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
#ifdef NPY_HAVE_AVX512BW
__mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a);
__mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c);
return npyv_pack_b8_b16(ab, cd);
#else
const __m512i idx = _mm512_setr_epi32(
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
__m256i ta = npyv512_pack_lo_hi(npyv_cvt_u32_b32(a));
__m256i tb = npyv512_pack_lo_hi(npyv_cvt_u32_b32(b));
__m256i tc = npyv512_pack_lo_hi(npyv_cvt_u32_b32(c));
__m256i td = npyv512_pack_lo_hi(npyv_cvt_u32_b32(d));
__m256i ab = _mm256_packs_epi16(ta, tb);
__m256i cd = _mm256_packs_epi16(tc, td);
__m512i abcd = npyv512_combine_si256(ab, cd);
return _mm512_permutexvar_epi32(idx, abcd);
#endif
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
__mmask16 ab = _mm512_kunpackb((__mmask16)b, (__mmask16)a);
__mmask16 cd = _mm512_kunpackb((__mmask16)d, (__mmask16)c);
__mmask16 ef = _mm512_kunpackb((__mmask16)f, (__mmask16)e);
__mmask16 gh = _mm512_kunpackb((__mmask16)h, (__mmask16)g);
return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// convert boolean vectors to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
12 changes: 12 additions & 0 deletions numpy/core/src/common/simd/avx512/operators.h
@@ -152,6 +152,9 @@
#define npyv_xor_b16 _kxor_mask32
#define npyv_not_b8 _knot_mask64
#define npyv_not_b16 _knot_mask32
#define npyv_andc_b8 _kandn_mask64
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
#define npyv_xnor_b8 _kxnor_mask64
#elif defined(NPY_HAVE_AVX512BW)
NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
{ return a & b; }
@@ -169,6 +172,12 @@
{ return ~a; }
NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
{ return ~a; }
NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b)
{ return (~a) & b; }
NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b)
{ return (~a) | b; }
NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b)
{ return ~(a ^ b); }
#else
#define npyv_and_b8 _mm512_and_si512
#define npyv_and_b16 _mm512_and_si512
@@ -178,6 +187,9 @@
#define npyv_xor_b16 _mm512_xor_si512
#define npyv_not_b8 npyv_not_u8
#define npyv_not_b16 npyv_not_u8
#define npyv_andc_b8 _mm512_andnot_si512
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
#endif

#define npyv_and_b32 _mm512_kand
12 changes: 12 additions & 0 deletions numpy/core/src/common/simd/avx512/utils.h
@@ -87,4 +87,16 @@
)); \
}

#ifndef NPY_HAVE_AVX512BW
NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv512_packs_epi16, _mm256_packs_epi16)
#else
#define npyv512_packs_epi16 _mm512_packs_epi16
#endif

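// saturate the 32-bit elements of both 256-bit halves down to 16 bits
// (per-lane element order is fixed up by the callers in conversion.h)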
NPY_FINLINE __m256i npyv512_pack_lo_hi(__m512i a) {
__m256i lo = npyv512_lower_si256(a);
__m256i hi = npyv512_higher_si256(a);
return _mm256_packs_epi32(lo, hi);
}

#endif // _NPY_SIMD_AVX512_UTILS_H
24 changes: 24 additions & 0 deletions numpy/core/src/common/simd/neon/conversion.h
@@ -86,6 +86,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}

// pack two 16-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
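// vmovn_u16 keeps the low byte of each 16-bit lane, so an all-ones mask
// narrows to 0xFF and zero stays zero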
return vcombine_u8(vmovn_u16(a), vmovn_u16(b));
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
npyv_b16 ab = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
npyv_b16 cd = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
return npyv_pack_b8_b16(ab, cd);
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
npyv_b32 ab = vcombine_u32(vmovn_u64(a), vmovn_u64(b));
npyv_b32 cd = vcombine_u32(vmovn_u64(c), vmovn_u64(d));
npyv_b32 ef = vcombine_u32(vmovn_u64(e), vmovn_u64(f));
npyv_b32 gh = vcombine_u32(vmovn_u64(g), vmovn_u64(h));
return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// round to nearest integer
#if NPY_SIMD_F64
#define npyv_round_s32_f32 vcvtnq_s32_f32
5 changes: 5 additions & 0 deletions numpy/core/src/common/simd/neon/operators.h
@@ -116,6 +116,11 @@
#define npyv_not_b32 vmvnq_u32
#define npyv_not_b64 npyv_not_u64

// ANDC, ORC and XNOR
#define npyv_andc_b8(A, B) vbicq_u8(B, A)
#define npyv_orc_b8(A, B) vornq_u8(B, A)
#define npyv_xnor_b8 vceqq_u8
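// vceqq_u8 acts as XNOR here only because boolean vector lanes are
// restricted to 0x00/0xFF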

/***************************
* Comparison
***************************/
Expand Down