SIMD: Use universal intrinsics to implement comparison functions
rafaelcfsousa committed May 23, 2022
1 parent ae8b9ce commit 1134be2
Showing 18 changed files with 852 additions and 200 deletions.
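
The changes below add npyv_pack_b8_b16/_b32/_b64 helpers to every SIMD backend and route the six comparison ufuncs through a new loops_comparison dispatch source, replacing the previous AVX2-only simd=[('avx2', ints)] loops. For orientation, here is a hedged sketch (not the committed kernel, which lives in loops_comparison.dispatch.c.src and is not part of this excerpt) of how a contiguous less-than loop over npy_int32 can be written with these universal intrinsics; every helper except npyv_pack_b8_b32 is assumed from the pre-existing npyv API.

// Hedged sketch, not the committed loop: a contiguous npy_int32 less-than
// kernel built on the universal intrinsics touched by this commit.
#include "simd/simd.h" // NumPy-internal universal-intrinsics header

#if NPY_SIMD
static void
less_s32_contig(const npy_int32 *src1, const npy_int32 *src2,
                npy_bool *dst, npy_intp len)
{
    const int vstep = npyv_nlanes_s32;           // lanes per vector
    const npyv_u8 truemask = npyv_setall_u8(1);  // NumPy bools are 0/1
    npy_intp i = 0;
    // consume four input vectors per iteration so that the four b32
    // masks collapse into exactly one b8 vector of byte-sized booleans
    for (; i + 4*vstep <= len; i += 4*vstep) {
        npyv_b32 m0 = npyv_cmplt_s32(npyv_load_s32(src1 + i + 0*vstep),
                                     npyv_load_s32(src2 + i + 0*vstep));
        npyv_b32 m1 = npyv_cmplt_s32(npyv_load_s32(src1 + i + 1*vstep),
                                     npyv_load_s32(src2 + i + 1*vstep));
        npyv_b32 m2 = npyv_cmplt_s32(npyv_load_s32(src1 + i + 2*vstep),
                                     npyv_load_s32(src2 + i + 2*vstep));
        npyv_b32 m3 = npyv_cmplt_s32(npyv_load_s32(src1 + i + 3*vstep),
                                     npyv_load_s32(src2 + i + 3*vstep));
        npyv_b8 packed = npyv_pack_b8_b32(m0, m1, m2, m3); // new helper
        npyv_store_u8((npy_uint8*)dst + i,
                      npyv_and_u8(npyv_cvt_u8_b8(packed), truemask));
    }
    for (; i < len; i++) {  // scalar tail
        dst[i] = src1[i] < src2[i];
    }
}
#endif // NPY_SIMD
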
1 change: 1 addition & 0 deletions .gitignore
@@ -225,6 +225,7 @@ numpy/core/src/umath/loops_exponent_log.dispatch.c
numpy/core/src/umath/loops_umath_fp.dispatch.c
numpy/core/src/umath/loops_hyperbolic.dispatch.c
numpy/core/src/umath/loops_modulo.dispatch.c
numpy/core/src/umath/loops_comparison.dispatch.c
# npysort module
numpy/core/src/npysort/x86-qsort.dispatch.c
numpy/core/src/npysort/x86-qsort.dispatch.*.cpp
19 changes: 18 additions & 1 deletion benchmarks/benchmarks/bench_ufunc.py
@@ -170,8 +170,25 @@ def time_divide_scalar2(self, dtype):
    def time_divide_scalar2_inplace(self, dtype):
        np.divide(self.d, 1, out=self.d)


class CustomComparison(Benchmark):
    params = (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16,
              np.uint32, np.uint64, np.float32, np.float64, np.bool_)
    param_names = ['dtype']

    def setup(self, dtype):
        self.x = np.ones(50000, dtype=dtype)
        self.y = np.ones(50000, dtype=dtype)
        self.s = np.ones(1, dtype=dtype)

    def time_less_than_binary(self, dtype):
        (self.x < self.y)

    def time_less_than_scalar1(self, dtype):
        (self.s < self.x)

    def time_less_than_scalar2(self, dtype):
-       (self.d < 1)
+       (self.x < self.s)


class CustomScalarFloorDivideInt(Benchmark):
12 changes: 6 additions & 6 deletions numpy/core/code_generators/generate_umath.py
@@ -445,47 +445,47 @@ def english_upper(s):
    Ufunc(2, 1, None,
          docstrings.get('numpy.core.umath.greater'),
          'PyUFunc_SimpleBinaryComparisonTypeResolver',
-         TD(all, out='?', simd=[('avx2', ints)]),
+         TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
          TD('O', out='?'),
          ),
'greater_equal':
    Ufunc(2, 1, None,
          docstrings.get('numpy.core.umath.greater_equal'),
          'PyUFunc_SimpleBinaryComparisonTypeResolver',
-         TD(all, out='?', simd=[('avx2', ints)]),
+         TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
          TD('O', out='?'),
          ),
'less':
    Ufunc(2, 1, None,
          docstrings.get('numpy.core.umath.less'),
          'PyUFunc_SimpleBinaryComparisonTypeResolver',
-         TD(all, out='?', simd=[('avx2', ints)]),
+         TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
          TD('O', out='?'),
          ),
'less_equal':
    Ufunc(2, 1, None,
          docstrings.get('numpy.core.umath.less_equal'),
          'PyUFunc_SimpleBinaryComparisonTypeResolver',
-         TD(all, out='?', simd=[('avx2', ints)]),
+         TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
          TD('O', out='?'),
          ),
'equal':
    Ufunc(2, 1, None,
          docstrings.get('numpy.core.umath.equal'),
          'PyUFunc_SimpleBinaryComparisonTypeResolver',
-         TD(all, out='?', simd=[('avx2', ints)]),
+         TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
          TD('O', out='?'),
          ),
'not_equal':
    Ufunc(2, 1, None,
          docstrings.get('numpy.core.umath.not_equal'),
          'PyUFunc_SimpleBinaryComparisonTypeResolver',
-         TD(all, out='?', simd=[('avx2', ints)]),
+         TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
          TD('O', out='?'),
          ),
1 change: 1 addition & 0 deletions numpy/core/setup.py
@@ -1070,6 +1070,7 @@ def generate_umath_doc_header(ext, build_dir):
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
join('src', 'umath', 'loops_modulo.dispatch.c.src'),
join('src', 'umath', 'loops_comparison.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h'),
9 changes: 9 additions & 0 deletions numpy/core/src/_simd/_simd.dispatch.c.src
@@ -472,6 +472,10 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/

SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16)
SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32)
SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64,
                   vb64, vb64, vb64, vb64)

//#########################################################################
//## Attach module functions
@@ -716,6 +720,11 @@ SIMD_INTRIN_DEF(not_@bsfx@)
SIMD_INTRIN_DEF(tobits_@bsfx@)
/**end repeat**/

// Pack multiple vectors into one
SIMD_INTRIN_DEF(pack_b8_b16)
SIMD_INTRIN_DEF(pack_b8_b32)
SIMD_INTRIN_DEF(pack_b8_b64)

/************************************************************************/
{NULL, NULL, 0, NULL}
}; // PyMethodDef
44 changes: 44 additions & 0 deletions numpy/core/src/_simd/_simd_easyintrin.inc
@@ -153,6 +153,50 @@
        return simd_arg_to_obj(&ret); \
    }

#define SIMD_IMPL_INTRIN_8(NAME, RET, IN0, IN1, IN2, IN3, \
                           IN4, IN5, IN6, IN7) \
    static PyObject *simd__intrin_##NAME \
    (PyObject* NPY_UNUSED(self), PyObject *args) \
    { \
        simd_arg arg1 = {.dtype = simd_data_##IN0}; \
        simd_arg arg2 = {.dtype = simd_data_##IN1}; \
        simd_arg arg3 = {.dtype = simd_data_##IN2}; \
        simd_arg arg4 = {.dtype = simd_data_##IN3}; \
        simd_arg arg5 = {.dtype = simd_data_##IN4}; \
        simd_arg arg6 = {.dtype = simd_data_##IN5}; \
        simd_arg arg7 = {.dtype = simd_data_##IN6}; \
        simd_arg arg8 = {.dtype = simd_data_##IN7}; \
        if (!PyArg_ParseTuple( \
            args, "O&O&O&O&O&O&O&O&:"NPY_TOSTRING(NAME), \
            simd_arg_converter, &arg1, \
            simd_arg_converter, &arg2, \
            simd_arg_converter, &arg3, \
            simd_arg_converter, &arg4, \
            simd_arg_converter, &arg5, \
            simd_arg_converter, &arg6, \
            simd_arg_converter, &arg7, \
            simd_arg_converter, &arg8 \
        )) return NULL; \
        simd_data data = {.RET = npyv_##NAME( \
            arg1.data.IN0, arg2.data.IN1, \
            arg3.data.IN2, arg4.data.IN3, \
            arg5.data.IN4, arg6.data.IN5, \
            arg7.data.IN6, arg8.data.IN7 \
        )}; \
        simd_arg_free(&arg1); \
        simd_arg_free(&arg2); \
        simd_arg_free(&arg3); \
        simd_arg_free(&arg4); \
        simd_arg_free(&arg5); \
        simd_arg_free(&arg6); \
        simd_arg_free(&arg7); \
        simd_arg_free(&arg8); \
        simd_arg ret = { \
            .data = data, .dtype = simd_data_##RET \
        }; \
        return simd_arg_to_obj(&ret); \
    }

/**
* Helper macros for repeating and expand a certain macro.
* Mainly used for converting a scalar to an immediate constant.
30 changes: 30 additions & 0 deletions numpy/core/src/common/simd/avx2/conversion.h
@@ -58,6 +58,36 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
    return r;
}

// pack two 16-bit boolean into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    __m256i ab = _mm256_packs_epi16(a, b);
    return npyv256_shuffle_odd(ab);
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    __m256i ab = _mm256_packs_epi32(a, b);
    __m256i cd = _mm256_packs_epi32(c, d);
    __m256i abcd = npyv_pack_b8_b16(ab, cd);
    return _mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0));
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    __m256i ab = _mm256_packs_epi32(a, b);
    __m256i cd = _mm256_packs_epi32(c, d);
    __m256i ef = _mm256_packs_epi32(e, f);
    __m256i gh = _mm256_packs_epi32(g, h);
    __m256i abcd = _mm256_packs_epi32(ab, cd);
    __m256i efgh = _mm256_packs_epi32(ef, gh);
    __m256i all = npyv256_shuffle_odd(_mm256_packs_epi16(abcd, efgh));
    __m256i rev128 = _mm256_alignr_epi8(all, all, 8);
    return _mm256_unpacklo_epi16(all, rev128);
}

// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm256_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
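
A note on the AVX2 implementation above: _mm256_packs_epi16 and _mm256_packs_epi32 pack within each 128-bit lane rather than across the whole register, so the packed bytes come out lane-interleaved; that is why npyv_pack_b8_b16/_b32/_b64 finish with npyv256_shuffle_odd, _mm256_shuffle_epi32 and _mm256_alignr_epi8 fix-ups. The standalone snippet below (an illustration, not part of the commit; build with -mavx2 and run on an AVX2 machine) makes the interleaving visible:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    // 16-bit lanes 0..15 in a and 16..31 in b
    short av[16], bv[16];
    for (int i = 0; i < 16; i++) { av[i] = (short)i; bv[i] = (short)(16 + i); }
    __m256i a = _mm256_loadu_si256((const __m256i*)av);
    __m256i b = _mm256_loadu_si256((const __m256i*)bv);
    __m256i packed = _mm256_packs_epi16(a, b);
    signed char out[32];
    _mm256_storeu_si256((__m256i*)out, packed);
    // prints 0..7 16..23 8..15 24..31: the two 128-bit lanes are
    // interleaved, which is why npyv_pack_b8_b16 re-orders the result
    // with npyv256_shuffle_odd
    for (int i = 0; i < 32; i++) {
        printf("%d ", out[i]);
    }
    printf("\n");
    return 0;
}
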
42 changes: 42 additions & 0 deletions numpy/core/src/common/simd/avx512/conversion.h
@@ -90,6 +90,48 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
    return r;
}

// pack two 16-bit boolean into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
#ifdef NPY_HAVE_AVX512BW
    return _mm512_kunpackd((__mmask64)b, (__mmask64)a);
#else
    const __m512i idx = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    return _mm512_permutexvar_epi64(idx, npyv512_packs_epi16(a, b));
#endif
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
#ifdef NPY_HAVE_AVX512BW
    __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a);
    __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c);
    return npyv_pack_b8_b16(ab, cd);
#else
    const __m512i idx = _mm512_setr_epi32(
        0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
    __m256i ta = npyv512_pack_lo_hi(npyv_cvt_u32_b32(a));
    __m256i tb = npyv512_pack_lo_hi(npyv_cvt_u32_b32(b));
    __m256i tc = npyv512_pack_lo_hi(npyv_cvt_u32_b32(c));
    __m256i td = npyv512_pack_lo_hi(npyv_cvt_u32_b32(d));
    __m256i ab = _mm256_packs_epi16(ta, tb);
    __m256i cd = _mm256_packs_epi16(tc, td);
    __m512i abcd = npyv512_combine_si256(ab, cd);
    return _mm512_permutexvar_epi32(idx, abcd);
#endif
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    __mmask16 ab = _mm512_kunpackb((__mmask16)b, (__mmask16)a);
    __mmask16 cd = _mm512_kunpackb((__mmask16)d, (__mmask16)c);
    __mmask16 ef = _mm512_kunpackb((__mmask16)f, (__mmask16)e);
    __mmask16 gh = _mm512_kunpackb((__mmask16)h, (__mmask16)g);
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// convert boolean vectors to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
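
In the AVX512BW branch above, npyv boolean types are opmask registers rather than full vectors, so packing reduces to concatenating masks with the kunpack family. The argument order is easy to misread: the intrinsic places the low bits of its second operand in the low half of the result, which is why the helpers call _mm512_kunpackd(b, a) and so on. A tiny standalone illustration (not part of the commit; it requires an AVX512BW machine, e.g. built with -mavx512bw):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __mmask64 lo = 0x00000000AAAAAAAAull; // e.g. mask from the first b16 compare
    __mmask64 hi = 0x0000000055555555ull; // mask from the second b16 compare
    // same call pattern as npyv_pack_b8_b16: the second argument fills the
    // low 32 bits of the result, the first argument the high 32 bits
    __mmask64 packed = _mm512_kunpackd(hi, lo);
    printf("%016llx\n", (unsigned long long)_cvtmask64_u64(packed)); // 55555555aaaaaaaa
    return 0;
}
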
12 changes: 12 additions & 0 deletions numpy/core/src/common/simd/avx512/utils.h
@@ -87,4 +87,16 @@
)); \
}

#ifndef NPY_HAVE_AVX512BW
    NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv512_packs_epi16, _mm256_packs_epi16)
#else
    #define npyv512_packs_epi16 _mm512_packs_epi16
#endif

NPY_FINLINE __m256i npyv512_pack_lo_hi(__m512i a) {
    __m256i lo = npyv512_lower_si256(a);
    __m256i hi = npyv512_higher_si256(a);
    return _mm256_packs_epi32(lo, hi);
}

#endif // _NPY_SIMD_AVX512_UTILS_H
24 changes: 24 additions & 0 deletions numpy/core/src/common/simd/neon/conversion.h
@@ -86,6 +86,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
    return r;
}

// pack two 16-bit boolean into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    return vcombine_u8(vmovn_u16(a), vmovn_u16(b));
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    npyv_b16 ab = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
    npyv_b16 cd = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
    return npyv_pack_b8_b16(ab, cd);
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    npyv_b32 ab = vcombine_u32(vmovn_u64(a), vmovn_u64(b));
    npyv_b32 cd = vcombine_u32(vmovn_u64(c), vmovn_u64(d));
    npyv_b32 ef = vcombine_u32(vmovn_u64(e), vmovn_u64(f));
    npyv_b32 gh = vcombine_u32(vmovn_u64(g), vmovn_u64(h));
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// round to nearest integer
#if NPY_SIMD_F64
#define npyv_round_s32_f32 vcvtnq_s32_f32
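
The NEON version above needs no re-ordering step: vmovn_* keeps the low half of every lane, so an all-ones/all-zeros comparison mask narrows to an all-ones/all-zeros byte, and vcombine_* joins the two halves in order. A standalone check (an illustration, not part of the commit; build for AArch64 or with NEON enabled):

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    // two 16-bit comparison masks: one all-ones, one all-zeros per lane
    uint16x8_t a = vceqq_u16(vdupq_n_u16(1), vdupq_n_u16(1)); // 8 x 0xFFFF
    uint16x8_t b = vceqq_u16(vdupq_n_u16(1), vdupq_n_u16(2)); // 8 x 0x0000
    // exactly what npyv_pack_b8_b16 does above: narrow then combine
    uint8x16_t packed = vcombine_u8(vmovn_u16(a), vmovn_u16(b));
    uint8_t out[16];
    vst1q_u8(out, packed);
    for (int i = 0; i < 16; i++) {
        printf("%02x ", out[i]); // ff x8 then 00 x8
    }
    printf("\n");
    return 0;
}
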
24 changes: 24 additions & 0 deletions numpy/core/src/common/simd/sse/conversion.h
@@ -59,6 +59,30 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
    return r;
}

// pack two 16-bit boolean into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    return _mm_packs_epi16(a, b);
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    npyv_b16 ab = _mm_packs_epi32(a, b);
    npyv_b16 cd = _mm_packs_epi32(c, d);
    return npyv_pack_b8_b16(ab, cd);
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    npyv_b32 ab = _mm_packs_epi32(a, b);
    npyv_b32 cd = _mm_packs_epi32(c, d);
    npyv_b32 ef = _mm_packs_epi32(e, f);
    npyv_b32 gh = _mm_packs_epi32(g, h);
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
23 changes: 23 additions & 0 deletions numpy/core/src/common/simd/vsx/conversion.h
@@ -48,6 +48,23 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
    return r;
}

// pack two 16-bit boolean into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    return vec_pack(a, b);
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    npyv_b16 ab = vec_pack(a, b);
    npyv_b16 cd = vec_pack(c, d);
    return npyv_pack_b8_b16(ab, cd);
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    npyv_b32 ab = vec_pack(a, b);
    npyv_b32 cd = vec_pack(c, d);
    npyv_b32 ef = vec_pack(e, f);
    npyv_b32 gh = vec_pack(g, h);
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// convert boolean vector to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
