ENH, SIMD: update dispatch-able sources to support IBM ZArch SIMD

numpy · Jan 27, 2022 · 73c476d · 73c476d
1 parent 3025bb3
commit 73c476d
Show file tree

Hide file tree

Showing 6 changed files with 19 additions and 11 deletions.
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -68,7 +68,7 @@
  *            0*3#
  * #NPYV_CHK = 0*5,
  *             0*5,
- *             0, NPY_SIMD, NPY_SIMD_F64, 0,
+ *             0, NPY_SIMD_F32, NPY_SIMD_F64, 0,
  *             0*3#
  */
 

diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,7 @@
 /*@targets
  ** $maxopt baseline
  ** sse2 avx2 avx512f
+ ** vx vxe
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -364,7 +365,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
  *  #type = npy_float, npy_double#
  *  #TYPE = FLOAT, DOUBLE#
  *  #sfx = f32, f64#
- *  #CHK =    , _F64#
+ *  #CHK = _F32, _F64#
  */
 #if NPY_SIMD@CHK@
 /**begin repeat1
@@ -444,7 +445,7 @@ simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
  *  #type = npy_float, npy_double, npy_longdouble#
  *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  *  #vector = 1, 1, 0#
- *  #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
+ *  #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 #
  */
 /**begin repeat1
  * Arithmetic

diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -3,6 +3,7 @@
  ** sse2 sse41 avx2 avx512f avx512_skx
  ** vsx2
  ** neon
+ ** vx
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE

diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -3,6 +3,7 @@
  ** neon asimd
  ** sse2 avx2 avx512_skx
  ** vsx2
+ ** vx vxe
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -142,7 +143,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
 /**begin repeat
  * #sfx = f32, f64#
  * #bsfx = b32, b64#
- * #simd_chk = NPY_SIMD, NPY_SIMD_F64#
+ * #simd_chk = NPY_SIMD_F32, NPY_SIMD_F64#
  * #scalar_sfx = f, d#
  */
 #if @simd_chk@
@@ -194,7 +195,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
  ******************************************************************************/
 /**begin repeat
  * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
- * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
  * #is_fp = 0*8, 1, 1#
  * #scalar_sfx = i*8, f, d#
  */
@@ -393,6 +394,9 @@ simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
 #elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
     #if @is_fp@
         #define TO_SIMD_SFX(X) X##_f@len@
+        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
         #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
             #undef TO_SIMD_SFX
         #endif

diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -3,6 +3,7 @@
  ** (avx2 fma3) avx512f
  ** vsx2
  ** neon_vfpv4
+ ** vxe vxe2
  **/
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
@@ -13,7 +14,7 @@
  * - use vectorized version of Payne-Hanek style reduction for large elements or
  *   when there's no native FUSED support instead of fallback to libc
  */
-#if NPY_SIMD_FMA3 // native support
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support
 /*
  * Vectorized Cody-Waite range reduction technique
  * Performs the reduction step x* = x - y*C in three steps:
@@ -210,7 +211,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
     const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
     assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-#if NPY_SIMD_FMA3
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3
     if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
         !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
     ) {

diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -3,6 +3,7 @@
  ** sse2 sse41
  ** vsx2
  ** neon asimd
+ ** vx vxe
  **/
 /**
  * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
@@ -18,7 +19,7 @@
 /**********************************************************
  ** Scalars
  **********************************************************/
-#if !NPY_SIMD
+#if !NPY_SIMD_F32
 NPY_FINLINE float c_recip_f32(float a)
 { return 1.0f / a; }
 NPY_FINLINE float c_abs_f32(float a)
@@ -29,7 +30,7 @@ NPY_FINLINE float c_abs_f32(float a)
 }
 NPY_FINLINE float c_square_f32(float a)
 { return a * a; }
-#endif // !NPY_SIMD
+#endif // !NPY_SIMD_F32
 
 #if !NPY_SIMD_F64
 NPY_FINLINE double c_recip_f64(double a)
@@ -144,7 +145,7 @@ NPY_FINLINE double c_square_f64(double a)
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  * #sfx  = f32, f64#
- * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
  */
 #if @VCHK@
 /**begin repeat1
@@ -256,7 +257,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  * #sfx  = f32, f64#
- * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
  */
 /**begin repeat1
  * #kind  = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#