Skip to content

Commit

Permalink
SIMD: Use universal intrinsics to implement comparison functions
Browse files Browse the repository at this point in the history
  • Loading branch information
rafaelcfsousa committed May 20, 2022
1 parent ae8b9ce commit 2336869
Show file tree
Hide file tree
Showing 16 changed files with 804 additions and 200 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ numpy/core/src/umath/loops_exponent_log.dispatch.c
numpy/core/src/umath/loops_umath_fp.dispatch.c
numpy/core/src/umath/loops_hyperbolic.dispatch.c
numpy/core/src/umath/loops_modulo.dispatch.c
numpy/core/src/umath/loops_comparison.dispatch.c
# npysort module
numpy/core/src/npysort/x86-qsort.dispatch.c
numpy/core/src/npysort/x86-qsort.dispatch.*.cpp
Expand Down
19 changes: 18 additions & 1 deletion benchmarks/benchmarks/bench_ufunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,25 @@ def time_divide_scalar2(self, dtype):
def time_divide_scalar2_inplace(self, dtype):
    # In-place scalar divide: `out=self.d` writes the result back into the
    # operand, so the benchmark excludes output-allocation cost.
    np.divide(self.d, 1, out=self.d)


class CustomComparison(Benchmark):
    """Benchmark elementwise comparison (``<``) for array-array and
    array-scalar operands across every dtype handled by the SIMD
    comparison loops (signed/unsigned ints, floats, bool)."""

    params = (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16,
              np.uint32, np.uint64, np.float32, np.float64, np.bool_)
    param_names = ['dtype']

    def setup(self, dtype):
        # 50k elements: large enough that the vectorized inner loop
        # dominates over ufunc dispatch overhead.
        self.x = np.ones(50000, dtype=dtype)
        self.y = np.ones(50000, dtype=dtype)
        self.s = np.ones(1, dtype=dtype)

    def time_less_than_binary(self, dtype):
        (self.x < self.y)

    def time_less_than_scalar1(self, dtype):
        (self.s < self.x)

    def time_less_than_scalar2(self, dtype):
        # NOTE(review): removed a stray `(self.d < 1)` statement here —
        # `self.d` is never defined in setup() and would raise
        # AttributeError on the first timed call.
        (self.x < self.s)


class CustomScalarFloorDivideInt(Benchmark):
Expand Down
12 changes: 6 additions & 6 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,47 +445,47 @@ def english_upper(s):
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'greater_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'less':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'less_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
'not_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.not_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
TD(all, out='?', simd=[('avx2', ints)]),
TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
[TypeDescription('O', FullTypeDescr, 'OO', 'O')],
TD('O', out='?'),
),
Expand Down
1 change: 1 addition & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,7 @@ def generate_umath_doc_header(ext, build_dir):
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
join('src', 'umath', 'loops_modulo.dispatch.c.src'),
join('src', 'umath', 'loops_comparison.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h'),
Expand Down
9 changes: 9 additions & 0 deletions numpy/core/src/_simd/_simd.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,10 @@ SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/

SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16)
SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32)
SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64,
vb64, vb64, vb64, vb64)

//#########################################################################
//## Attach module functions
Expand Down Expand Up @@ -716,6 +720,11 @@ SIMD_INTRIN_DEF(not_@bsfx@)
SIMD_INTRIN_DEF(tobits_@bsfx@)
/**end repeat**/

// Pack multiple vectors into one
SIMD_INTRIN_DEF(pack_b8_b16)
SIMD_INTRIN_DEF(pack_b8_b32)
SIMD_INTRIN_DEF(pack_b8_b64)

/************************************************************************/
{NULL, NULL, 0, NULL}
}; // PyMethodDef
Expand Down
44 changes: 44 additions & 0 deletions numpy/core/src/_simd/_simd_easyintrin.inc
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,50 @@
return simd_arg_to_obj(&ret); \
}

/*
 * SIMD_IMPL_INTRIN_8(NAME, RET, IN0..IN7)
 *
 * Defines a CPython wrapper `simd__intrin_NAME` around the eight-operand
 * universal intrinsic `npyv_NAME`:
 *   - parses eight Python arguments, each converted to the declared
 *     simd_data type via simd_arg_converter,
 *   - invokes npyv_NAME on the converted operands,
 *   - frees every parsed argument, then converts the RET-typed result
 *     back to a Python object.
 * Mirrors the lower-arity helpers (e.g. SIMD_IMPL_INTRIN_2/_4) used
 * elsewhere in this header; added for npyv_pack_b8_b64 which takes
 * eight boolean vectors.
 */
#define SIMD_IMPL_INTRIN_8(NAME, RET, IN0, IN1, IN2, IN3, \
                           IN4, IN5, IN6, IN7)            \
    static PyObject *simd__intrin_##NAME                  \
    (PyObject* NPY_UNUSED(self), PyObject *args)          \
    {                                                     \
        simd_arg arg1 = {.dtype = simd_data_##IN0};       \
        simd_arg arg2 = {.dtype = simd_data_##IN1};       \
        simd_arg arg3 = {.dtype = simd_data_##IN2};       \
        simd_arg arg4 = {.dtype = simd_data_##IN3};       \
        simd_arg arg5 = {.dtype = simd_data_##IN4};       \
        simd_arg arg6 = {.dtype = simd_data_##IN5};       \
        simd_arg arg7 = {.dtype = simd_data_##IN6};       \
        simd_arg arg8 = {.dtype = simd_data_##IN7};       \
        if (!PyArg_ParseTuple(                            \
            args, "O&O&O&O&O&O&O&O&:"NPY_TOSTRING(NAME),  \
            simd_arg_converter, &arg1,                    \
            simd_arg_converter, &arg2,                    \
            simd_arg_converter, &arg3,                    \
            simd_arg_converter, &arg4,                    \
            simd_arg_converter, &arg5,                    \
            simd_arg_converter, &arg6,                    \
            simd_arg_converter, &arg7,                    \
            simd_arg_converter, &arg8                     \
        )) return NULL;                                   \
        simd_data data = {.RET = npyv_##NAME(             \
            arg1.data.IN0, arg2.data.IN1,                 \
            arg3.data.IN2, arg4.data.IN3,                 \
            arg5.data.IN4, arg6.data.IN5,                 \
            arg7.data.IN6, arg8.data.IN7                  \
        )};                                               \
        simd_arg_free(&arg1);                             \
        simd_arg_free(&arg2);                             \
        simd_arg_free(&arg3);                             \
        simd_arg_free(&arg4);                             \
        simd_arg_free(&arg5);                             \
        simd_arg_free(&arg6);                             \
        simd_arg_free(&arg7);                             \
        simd_arg_free(&arg8);                             \
        simd_arg ret = {                                  \
            .data = data, .dtype = simd_data_##RET        \
        };                                                \
        return simd_arg_to_obj(&ret);                     \
    }

/**
* Helper macros for repeating and expand a certain macro.
* Mainly used for converting a scalar to an immediate constant.
Expand Down
28 changes: 28 additions & 0 deletions numpy/core/src/common/simd/avx2/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,34 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}

// Pack multiple boolean vectors into one.
// Combines two 16-bit boolean masks (a, b) into a single 8-bit boolean mask.
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    // The saturating pack operates per 128-bit lane, leaving qwords in the
    // order [a_lo, b_lo, a_hi, b_hi].
    __m256i r = _mm256_packs_epi16(a, b);
    // Restore the logical order [a_lo, a_hi, b_lo, b_hi]:
    // mask = 0b11011000, i.e. select qwords (0, 2, 1, 3).
    return _mm256_permute4x64_epi64(r, 216);
}

// Pack four 32-bit boolean masks (a..d) into a single 8-bit boolean mask.
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    // Dword permutation that undoes the per-128-bit-lane interleaving left
    // by the two-level saturating pack below, restoring the order a|b|c|d.
    const __m256i idx = _mm256_setr_epi32(
        /* a */0, 4, /* b */1, 5, /* c */2, 6, /* d */3, 7);
    __m256i p1 = _mm256_packs_epi32(a, b);   // narrow 32 -> 16 (per lane)
    __m256i p2 = _mm256_packs_epi32(c, d);
    __m256i r = _mm256_packs_epi16(p1, p2);  // narrow 16 -> 8 (per lane)
    return _mm256_permutevar8x32_epi32(r, idx);
}

// Pack eight 64-bit boolean masks (a..h) into a single 8-bit boolean mask.
// NOTE(review): lanes are presumed to be full boolean masks
// (all-ones/all-zeros, as npyv comparison results produce), so the
// saturating packs preserve them exactly — confirm against callers.
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    // Narrow each pair 64 -> 32 bits: pack per 128-bit lane, then fix the
    // qword order with mask = 0b11011000 (select qwords 0, 2, 1, 3).
    __m256i p1 = _mm256_permute4x64_epi64(_mm256_packs_epi32(a, b), 216);
    __m256i p2 = _mm256_permute4x64_epi64(_mm256_packs_epi32(c, d), 216);
    __m256i p3 = _mm256_permute4x64_epi64(_mm256_packs_epi32(e, f), 216);
    __m256i p4 = _mm256_permute4x64_epi64(_mm256_packs_epi32(g, h), 216);
    // Reuse the 32-bit packer for the final 32 -> 8 reduction.
    return npyv_pack_b8_b32(p1, p2, p3, p4);
}

// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm256_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
Expand Down
90 changes: 90 additions & 0 deletions numpy/core/src/common/simd/avx512/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,96 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
return r;
}

// Helper for the non-AVX512BW fallback paths below: splits the 512-bit
// input IN into its 256-bit halves (via the caller-declared scratch
// locals `lo`/`hi`) and narrows them with the saturating AVX2 pack
// _mm256_packs_epi{LEN} into R.
#define NPYV__PACK_LO_HI(LEN, IN, R) \
    lo = npyv512_lower_si256(IN);    \
    hi = npyv512_higher_si256(IN);   \
    R = _mm256_packs_epi##LEN(lo, hi);

// Pack multiple boolean vectors into one.
// Combines two 16-bit boolean masks (a, b) into a single 8-bit boolean mask.
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
#ifdef NPY_HAVE_AVX512BW
    // _mm512_packs_epi16 packs within each 128-bit lane; the qword
    // permutation restores the logical a|b order afterwards.
    const __m512i idx = npyv__setr_epi64(/* a */0, 2, 4, 6, /* b */1, 3, 5, 7);
    __m512i r = _mm512_packs_epi16(npyv_cvt_u16_b16(a), npyv_cvt_u16_b16(b));
    return npyv_cvt_b8_u8(_mm512_permutexvar_epi64(idx, r));
#else
    // Without AVX512BW: pack each input's 256-bit halves with AVX2, then
    // recombine and repair the qword order (layout differs, hence new idx).
    const __m512i idx = npyv__setr_epi64(/* a */0, 2, 1, 3, /* b */4, 6, 5, 7);
    __m256i ta, tb, lo, hi;  // lo/hi: scratch used by NPYV__PACK_LO_HI
    NPYV__PACK_LO_HI(16, npyv_cvt_u16_b16(a), ta);
    NPYV__PACK_LO_HI(16, npyv_cvt_u16_b16(b), tb);
    __m512i r = npyv512_combine_si256(ta, tb);
    return npyv_cvt_b8_u8(_mm512_permutexvar_epi64(idx, r));
#endif
}

// Pack four 32-bit boolean masks (a..d) into a single 8-bit boolean mask.
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
#ifdef NPY_HAVE_AVX512BW
    // Two-level saturating pack (32->16, then 16->8) interleaves results
    // per 128-bit lane; the dword permutation restores the order a|b|c|d.
    const __m512i idx = npyv__setr_epi32(
        /* a */0, 4, 8, 12, /* b */1, 5, 9, 13,
        /* c */2, 6, 10, 14, /* d */3, 7, 11, 15);
    __m512i p1 = _mm512_packs_epi32(npyv_cvt_u32_b32(a), npyv_cvt_u32_b32(b));
    __m512i p2 = _mm512_packs_epi32(npyv_cvt_u32_b32(c), npyv_cvt_u32_b32(d));
    __m512i r = _mm512_packs_epi16(p1, p2);
    return npyv_cvt_b8_u8(_mm512_permutexvar_epi32(idx, r));
#else
    // Without AVX512BW: narrow each input's 256-bit halves via AVX2 packs,
    // recombine into one 512-bit vector, then fix the ordering.
    const __m512i idx = npyv__setr_epi32(
        /* a */0, 4, 1, 5, /* b */2, 6, 3, 7,
        /* c */8, 12, 9, 13, /* d */10, 14, 11, 15);
    __m256i ta, tb, tc, td, lo, hi;  // lo/hi: scratch for NPYV__PACK_LO_HI
    NPYV__PACK_LO_HI(32, npyv_cvt_u32_b32(a), ta);
    NPYV__PACK_LO_HI(32, npyv_cvt_u32_b32(b), tb);
    NPYV__PACK_LO_HI(32, npyv_cvt_u32_b32(c), tc);
    NPYV__PACK_LO_HI(32, npyv_cvt_u32_b32(d), td);
    __m256i p1 = _mm256_packs_epi16(ta, tb);
    __m256i p2 = _mm256_packs_epi16(tc, td);
    __m512i r = npyv512_combine_si256(p1, p2);
    return npyv_cvt_b8_u8(_mm512_permutexvar_epi32(idx, r));
#endif
}

// Pack eight 64-bit boolean masks (a..h) into a single 8-bit boolean mask.
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
#ifdef NPY_HAVE_AVX512BW
    // Three levels of saturating packs (64->32->16->8, each per 128-bit
    // lane) followed by a single word permutation that restores the
    // logical order a|b|c|d|e|f|g|h.
    const __m512i idx = npyv__setr_epi16(
        /* a */0, 8, 16, 24, /* b */1, 9, 17, 25, /* c */2, 10, 18, 26,
        /* d */3, 11, 19, 27, /* e */4, 12, 20, 28, /* f */5, 13, 21, 29,
        /* g */6, 14, 22, 30, /* h */7, 15, 23, 31);
    __m512i p1 = _mm512_packs_epi32(npyv_cvt_u64_b64(a), npyv_cvt_u64_b64(b));
    __m512i p2 = _mm512_packs_epi32(npyv_cvt_u64_b64(c), npyv_cvt_u64_b64(d));
    __m512i p3 = _mm512_packs_epi32(npyv_cvt_u64_b64(e), npyv_cvt_u64_b64(f));
    __m512i p4 = _mm512_packs_epi32(npyv_cvt_u64_b64(g), npyv_cvt_u64_b64(h));
    __m512i p1_2 = _mm512_packs_epi32(p1, p2);
    __m512i p2_2 = _mm512_packs_epi32(p3, p4);
    __m512i r = _mm512_packs_epi16(p1_2, p2_2);
    return npyv_cvt_b8_u8(_mm512_permutexvar_epi16(idx, r));
#else
    // Without AVX512BW: narrow the 256-bit halves via AVX2, repairing the
    // per-lane interleave after each pack level (idx within 256-bit
    // vectors, idx2 for the final 512-bit qword order).
    const __m256i idx = _mm256_setr_epi32(
        /* a|c|e|g */0, 4, 1, 5, /* b|d|f|h */2, 6, 3, 7);
    const __m512i idx2 = npyv__setr_epi64(
        /* a,b,c,d */0, 2, 1, 3, /* e,f,g,h */4, 6, 5, 7);
    __m256i ta, tb, tc, td, te, tf, tg, th, lo, hi;  // lo/hi: macro scratch
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(a), ta);
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(b), tb);
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(c), tc);
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(d), td);
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(e), te);
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(f), tf);
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(g), tg);
    NPYV__PACK_LO_HI(32, npyv_cvt_u64_b64(h), th);
    __m256i p1 = _mm256_permutevar8x32_epi32(_mm256_packs_epi32(ta, tb), idx);
    __m256i p2 = _mm256_permutevar8x32_epi32(_mm256_packs_epi32(tc, td), idx);
    __m256i p3 = _mm256_permutevar8x32_epi32(_mm256_packs_epi32(te, tf), idx);
    __m256i p4 = _mm256_permutevar8x32_epi32(_mm256_packs_epi32(tg, th), idx);
    __m256i p1_2 = _mm256_packs_epi16(p1, p2);
    __m256i p2_2 = _mm256_packs_epi16(p3, p4);
    __m512i r = npyv512_combine_si256(p1_2, p2_2);
    return npyv_cvt_b8_u8(_mm512_permutexvar_epi64(idx2, r));
#endif
}

#undef NPYV__PACK_LO_HI

// convert boolean vectors to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
Expand Down
22 changes: 22 additions & 0 deletions numpy/core/src/common/simd/neon/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,28 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}

// Pack multiple boolean vectors into one:
// narrow two 16-bit boolean masks to bytes and join the halves.
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    const uint8x8_t lo_half = vmovn_u16(a);
    const uint8x8_t hi_half = vmovn_u16(b);
    return vcombine_u8(lo_half, hi_half);
}

// Pack four 32-bit boolean masks into one 8-bit boolean mask:
// narrow 32 -> 16 on each pair, then 16 -> 8 across the two halves.
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    const npyv_b16 ab = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
    const npyv_b16 cd = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
    return vcombine_u8(vmovn_u16(ab), vmovn_u16(cd));
}

// Pack eight 64-bit boolean masks into one 8-bit boolean mask:
// narrow 64 -> 32 on each adjacent pair, then reuse the 32-bit packer
// for the remaining 32 -> 8 reduction.
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    const npyv_b32 ab = vcombine_u32(vmovn_u64(a), vmovn_u64(b));
    const npyv_b32 cd = vcombine_u32(vmovn_u64(c), vmovn_u64(d));
    const npyv_b32 ef = vcombine_u32(vmovn_u64(e), vmovn_u64(f));
    const npyv_b32 gh = vcombine_u32(vmovn_u64(g), vmovn_u64(h));
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// round to nearest integer
#if NPY_SIMD_F64
#define npyv_round_s32_f32 vcvtnq_s32_f32
Expand Down
22 changes: 22 additions & 0 deletions numpy/core/src/common/simd/sse/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,28 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) {
return r;
}

// Pack multiple boolean vectors into one:
// saturating pack narrows each 16-bit lane of a and b to 8 bits.
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    const npyv_b8 packed = _mm_packs_epi16(a, b);
    return packed;
}

// Pack four 32-bit boolean masks into one 8-bit boolean mask:
// narrow 32 -> 16 on each pair, then 16 -> 8 across the halves.
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    const npyv_b16 ab = _mm_packs_epi32(a, b);
    const npyv_b16 cd = _mm_packs_epi32(c, d);
    return _mm_packs_epi16(ab, cd);
}

// Pack eight 64-bit boolean masks into one 8-bit boolean mask:
// narrow 64 -> 32 on each adjacent pair, then hand off to the
// 32-bit packer for the remaining 32 -> 8 reduction.
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    const npyv_b32 ab = _mm_packs_epi32(a, b);
    const npyv_b32 cd = _mm_packs_epi32(c, d);
    const npyv_b32 ef = _mm_packs_epi32(e, f);
    const npyv_b32 gh = _mm_packs_epi32(g, h);
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// round to nearest integer (assuming even)
#define npyv_round_s32_f32 _mm_cvtps_epi32
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
Expand Down
21 changes: 21 additions & 0 deletions numpy/core/src/common/simd/vsx/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,27 @@ NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
return r;
}

// Pack multiple boolean vectors into one:
// vec_pack narrows each 16-bit lane of a and b to 8 bits.
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    const npyv_b8 packed = vec_pack(a, b);
    return packed;
}

// Pack four 32-bit boolean masks into one 8-bit boolean mask:
// narrow 32 -> 16 on each pair, then 16 -> 8 across the halves.
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    return vec_pack(vec_pack(a, b), vec_pack(c, d));
}

// Pack eight 64-bit boolean masks into one 8-bit boolean mask:
// narrow 64 -> 32 on each adjacent pair, then reuse the 32-bit packer
// for the remaining 32 -> 8 reduction.
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    const npyv_b32 ab = vec_pack(a, b);
    const npyv_b32 cd = vec_pack(c, d);
    const npyv_b32 ef = vec_pack(e, f);
    const npyv_b32 gh = vec_pack(g, h);
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}

// convert boolean vector to integer bitfield
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
Expand Down

0 comments on commit 2336869

Please sign in to comment.