Skip to content

Commit

Permalink
SIMD, ENH: Add universal intrinsic andc8 and use it to remove ifneq
Browse files Browse the repository at this point in the history
This commit also applies some techniques to reduce the size of the binary generated from the source loops_comparison.dispatch.c.src
  • Loading branch information
rafaelcfsousa committed May 30, 2022
1 parent 09b22a1 commit d5d6eb5
Show file tree
Hide file tree
Showing 8 changed files with 150 additions and 116 deletions.
22 changes: 16 additions & 6 deletions numpy/core/src/_simd/_simd.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
* #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
*/
#if @simd_sup@
/***************************
Expand Down Expand Up @@ -332,6 +333,13 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
/**end repeat1**/

/*
 * ANDC / ORC / XNOR wrappers, generated only when the bitw8b_sup template
 * flag is nonzero. The bitw8b_sup row visible in this hunk is 1 for the first
 * repeat column and 0 for the remaining nine, so this group is emitted once.
 * Each wrapper takes two vectors and returns one of the same vector type:
 * one andc on the data-vector suffix, then andc/orc/xnor on the boolean suffix.
 * NOTE(review): the first column is presumably the u8/b8 pair -- confirm
 * against the sfx/bsfx rows of the repeat header, which are outside this view.
 */
#if @bitw8b_sup@
SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(andc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
#endif

/***************************
* Conversion
***************************/
Expand Down Expand Up @@ -462,9 +470,6 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
/**end repeat**/
SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8)
SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8)
SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8)
/***************************
* Conversions
***************************/
Expand Down Expand Up @@ -503,6 +508,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
* #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
*/
#if @simd_sup@

Expand Down Expand Up @@ -584,6 +590,13 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

/*
 * Method-table entries for the ANDC / ORC / XNOR intrinsics, generated only
 * when the bitw8b_sup template flag is nonzero. The bitw8b_sup row visible in
 * this hunk is 1 for the first repeat column only, so the four entries are
 * registered once.
 * NOTE(review): these names are expected to match wrappers defined elsewhere
 * in the same template -- confirm the two gated groups stay in sync.
 */
#if @bitw8b_sup@
SIMD_INTRIN_DEF(andc_@sfx@)
SIMD_INTRIN_DEF(andc_@bsfx@)
SIMD_INTRIN_DEF(orc_@bsfx@)
SIMD_INTRIN_DEF(xnor_@bsfx@)
#endif

/***************************
* Conversion
***************************/
Expand Down Expand Up @@ -713,9 +726,6 @@ SIMD_INTRIN_DEF(or_@bsfx@)
SIMD_INTRIN_DEF(xor_@bsfx@)
SIMD_INTRIN_DEF(not_@bsfx@)
/**end repeat**/
SIMD_INTRIN_DEF(andc_b8)
SIMD_INTRIN_DEF(orc_b8)
SIMD_INTRIN_DEF(xnor_b8)
/***************************
* Conversions
***************************/
Expand Down
7 changes: 4 additions & 3 deletions numpy/core/src/common/simd/avx2/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_not_b64 npyv_not_u8

// ANDC, ORC and XNOR (AVX2)
// NOTE(review): this diff view lists two generations of these macros back to
// back with no +/- markers; presumably the first three definitions are the
// removed ones and the four after them are their replacements -- confirm
// against the actual header before relying on either set.
#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B)
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
// Operands are handed to andnot in swapped order (B, A). Intel documents
// _mm256_andnot_si256(X, Y) as (~X) & Y, so these evaluate A & ~B.
#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A)
#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A)
// ORC applies the NOT to B instead of A, i.e. A | ~B.
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
// XNOR is aliased to a byte-wise equality compare.
// NOTE(review): this matches ~(A ^ B) only if every b8 lane is all-zeros or
// all-ones -- confirm that invariant holds for npyv_b8 on AVX2.
#define npyv_xnor_b8 _mm256_cmpeq_epi8

/***************************
* Comparison
Expand Down
4 changes: 2 additions & 2 deletions numpy/core/src/common/simd/avx512/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
NPY_FINLINE npyv_b8
npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
#ifdef NPY_HAVE_AVX512BW
__mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a);
__mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c);
__mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a);
__mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c);
return npyv_pack_b8_b16(ab, cd);
#else
const __m512i idx = _mm512_setr_epi32(
Expand Down
15 changes: 9 additions & 6 deletions numpy/core/src/common/simd/avx512/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@
#define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
#endif

// ANDC (AVX512): the operands are passed to andnot in swapped order (B, A).
// NOTE(review): Intel documents _mm512_andnot_si512(X, Y) as (~X) & Y, which
// makes npyv_andc_u8(A, B) evaluate A & ~B -- confirm against the intrinsics guide.
#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A)

/***************************
* Logical (boolean)
***************************/
Expand All @@ -152,8 +155,8 @@
#define npyv_xor_b16 _kxor_mask32
#define npyv_not_b8 _knot_mask64
#define npyv_not_b16 _knot_mask32
// ANDC / ORC / XNOR on 64-bit mask registers.
// NOTE(review): the #if that opens this branch is outside the visible hunk.
// NOTE(review): the diff view shows two generations of andc/orc without +/-
// markers; presumably the first two definitions are the removed ones -- confirm.
#define npyv_andc_b8 _kandn_mask64
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
// Function-like forms: andc forwards its operands swapped (B, A) to the
// mask and-not, and orc negates B instead of A.
#define npyv_andc_b8(A, B) _kandn_mask64(B, A)
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
// XNOR maps directly onto the mask xnor intrinsic.
#define npyv_xnor_b8 _kxnor_mask64
#elif defined(NPY_HAVE_AVX512BW)
NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
Expand All @@ -173,9 +176,9 @@
// Bitwise complement of a 16-bit-lane boolean value using plain integer ops.
NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
{ return ~a; }
// ANDC on 8-bit-lane booleans.
// NOTE(review): two brace bodies follow the signature because this diff view
// interleaves lines without +/- markers; presumably `(~a) & b` is the removed
// body and `a & (~b)` is its replacement -- confirm against the real header.
NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b)
{ return (~a) & b; }
{ return a & (~b); }
// ORC on 8-bit-lane booleans; same two-body diff residue as above, with
// `(~a) | b` presumably replaced by `a | (~b)`.
NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b)
{ return (~a) | b; }
{ return a | (~b); }
// XNOR: complement of the exclusive-or of both operands.
NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b)
{ return ~(a ^ b); }
#else
Expand All @@ -187,8 +190,8 @@
#define npyv_xor_b16 _mm512_xor_si512
#define npyv_not_b8 npyv_not_u8
#define npyv_not_b16 npyv_not_u8
// ANDC / ORC / XNOR for the branch where booleans are full 512-bit vectors.
// NOTE(review): the diff view shows two generations of andc/orc without +/-
// markers; presumably the first two definitions are the removed ones -- confirm.
#define npyv_andc_b8 _mm512_andnot_si512
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
// Function-like forms: andc forwards its operands swapped (B, A) to andnot
// (documented by Intel as (~X) & Y), and orc negates B instead of A.
#define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A)
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
// XNOR is composed as NOT of XOR.
#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
#endif

Expand Down
5 changes: 3 additions & 2 deletions numpy/core/src/common/simd/neon/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,9 @@
#define npyv_not_b64 npyv_not_u64

// ANDC, ORC and XNOR (NEON)
// NOTE(review): this diff view lists two generations of andc/orc without +/-
// markers; presumably the two function-like macros that swap their operands
// to (B, A) are the removed ones -- confirm against the actual header.
#define npyv_andc_b8(A, B) vbicq_u8(B, A)
#define npyv_orc_b8(A, B) vornq_u8(B, A)
// Plain aliases: arguments reach the NEON intrinsic in caller order.
// NOTE(review): relies on vbicq_u8(a, b) being a & ~b and vornq_u8(a, b)
// being a | ~b -- confirm against the Arm intrinsics reference.
#define npyv_andc_u8 vbicq_u8
#define npyv_andc_b8 vbicq_u8
#define npyv_orc_b8 vornq_u8
// XNOR is aliased to a byte-wise equality compare.
#define npyv_xnor_b8 vceqq_u8

/***************************
Expand Down
7 changes: 4 additions & 3 deletions numpy/core/src/common/simd/sse/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_not_b64 npyv_not_u8

// ANDC, ORC and XNOR (SSE)
// NOTE(review): this diff view lists two generations of these macros back to
// back with no +/- markers; presumably the first three definitions are the
// removed ones and the four after them are their replacements -- confirm
// against the actual header before relying on either set.
#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B)
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
// Operands are handed to andnot in swapped order (B, A). Intel documents
// _mm_andnot_si128(X, Y) as (~X) & Y, so these evaluate A & ~B.
#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A)
#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A)
// ORC applies the NOT to B instead of A, i.e. A | ~B.
#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
// XNOR is aliased to a byte-wise equality compare.
// NOTE(review): this matches ~(A ^ B) only if every b8 lane is all-zeros or
// all-ones -- confirm that invariant holds for npyv_b8 on SSE.
#define npyv_xnor_b8 _mm_cmpeq_epi8

/***************************
* Comparison
Expand Down
5 changes: 3 additions & 2 deletions numpy/core/src/common/simd/vsx/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
{ return vec_nor(a, a); }

// ANDC, ORC and XNOR (VSX)
// NOTE(review): this diff view lists two generations of andc/orc without +/-
// markers; presumably the two function-like macros that swap their operands
// to (B, A) are the removed ones -- confirm against the actual header.
#define npyv_andc_b8(A, B) vec_andc(B, A)
#define npyv_orc_b8(A, B) vec_orc(B, A)
// Plain aliases: arguments reach the vector built-in in caller order.
// NOTE(review): relies on vec_andc(a, b) being a & ~b and vec_orc(a, b)
// being a | ~b -- confirm against the AltiVec/VSX built-in documentation.
#define npyv_andc_u8 vec_andc
#define npyv_andc_b8 vec_andc
#define npyv_orc_b8 vec_orc
// XNOR is aliased to the vector equivalence built-in.
#define npyv_xnor_b8 vec_eqv

/***************************
Expand Down

0 comments on commit d5d6eb5

Please sign in to comment.