From ea75651ac938befbd227fea9dc9021ac6b50ad78 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Tue, 17 May 2022 20:39:55 -0300 Subject: [PATCH 01/15] ENH: adding casting option to numpy.stack. See #20959 --- numpy/core/shape_base.py | 16 +- numpy/core/shape_base.pyi | 16 +- numpy/core/src/umath/loops_modulo.dispatch.c | 5714 ++++++++++++++++++ 3 files changed, 5740 insertions(+), 6 deletions(-) create mode 100644 numpy/core/src/umath/loops_modulo.dispatch.c diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py index 1a4198c5f8e9..e70c322cfd78 100644 --- a/numpy/core/shape_base.py +++ b/numpy/core/shape_base.py @@ -345,7 +345,7 @@ def hstack(tup): return _nx.concatenate(arrs, 1) -def _stack_dispatcher(arrays, axis=None, out=None): +def _stack_dispatcher(arrays, axis=None, out=None, casting=None): arrays = _arrays_for_stack_dispatcher(arrays, stacklevel=6) if out is not None: # optimize for the typical case where only arrays is provided @@ -355,7 +355,7 @@ def _stack_dispatcher(arrays, axis=None, out=None): @array_function_dispatch(_stack_dispatcher) -def stack(arrays, axis=0, out=None): +def stack(arrays, axis=0, out=None, casting='same_kind'): """ Join a sequence of arrays along a new axis. @@ -378,6 +378,10 @@ def stack(arrays, axis=0, out=None): correct, matching that of what stack would have returned if no out argument were specified. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Defaults to 'same_kind'. + + Returns ------- stacked : ndarray @@ -430,7 +434,8 @@ def stack(arrays, axis=0, out=None): sl = (slice(None),) * axis + (_nx.newaxis,) expanded_arrays = [arr[sl] for arr in arrays] - return _nx.concatenate(expanded_arrays, axis=axis, out=out) + return _nx.concatenate(expanded_arrays, axis=axis, out=out, + casting=casting) # Internal functions to eliminate the overhead of repeated dispatch in one of @@ -438,7 +443,8 @@ def stack(arrays, axis=0, out=None): # Use getattr to protect against __array_function__ being disabled. _size = getattr(_from_nx.size, '__wrapped__', _from_nx.size) _ndim = getattr(_from_nx.ndim, '__wrapped__', _from_nx.ndim) -_concatenate = getattr(_from_nx.concatenate, '__wrapped__', _from_nx.concatenate) +_concatenate = getattr(_from_nx.concatenate, + '__wrapped__', _from_nx.concatenate) def _block_format_index(index): @@ -539,7 +545,7 @@ def _concatenate_shapes(shapes, axis): """Given array shapes, return the resulting shape and slices prefixes. These help in nested concatenation. - + Returns ------- shape: tuple of int diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi index cea355d443c0..da02a60aee13 100644 --- a/numpy/core/shape_base.pyi +++ b/numpy/core/shape_base.pyi @@ -2,13 +2,14 @@ from collections.abc import Sequence from typing import TypeVar, overload, Any, SupportsIndex from numpy import generic -from numpy._typing import ArrayLike, NDArray, _ArrayLike +from numpy._typing import ArrayLike, NDArray, _ArrayLike, _CastingKind _SCT = TypeVar("_SCT", bound=generic) _ArrayType = TypeVar("_ArrayType", bound=NDArray[Any]) __all__: list[str] + @overload def atleast_1d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... @overload @@ -16,6 +17,7 @@ def atleast_1d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_1d(*arys: ArrayLike) -> list[NDArray[Any]]: ... + @overload def atleast_2d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... @overload @@ -23,6 +25,7 @@ def atleast_2d(arys: ArrayLike, /) -> NDArray[Any]: ... 
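The new ``casting`` argument is forwarded unchanged to ``np.concatenate``, so the
default 'same_kind' rule still rejects, for example, stacking float64 inputs into
an integer ``out``. A minimal usage sketch, assuming a NumPy build with this patch
applied:

    >>> import numpy as np
    >>> a = np.zeros(3)                               # float64
    >>> out = np.empty((2, 3), dtype=np.int64)
    >>> np.stack((a, a), out=out, casting='unsafe')   # explicit cast: succeeds
    >>> np.stack((a, a), out=out)                     # default 'same_kind': TypeError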
@overload def atleast_2d(*arys: ArrayLike) -> list[NDArray[Any]]: ... + @overload def atleast_3d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... @overload @@ -30,35 +33,46 @@ def atleast_3d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_3d(*arys: ArrayLike) -> list[NDArray[Any]]: ... + @overload def vstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... @overload def vstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... + @overload def hstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... @overload def hstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... + @overload def stack( arrays: Sequence[_ArrayLike[_SCT]], axis: SupportsIndex = ..., out: None = ..., + casting: None | _CastingKind = ... ) -> NDArray[_SCT]: ... + + @overload def stack( arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: None = ..., + casting: None | _CastingKind = ... ) -> NDArray[Any]: ... + + @overload def stack( arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: _ArrayType = ..., + casting: None | _CastingKind = ... ) -> _ArrayType: ... + @overload def block(arrays: _ArrayLike[_SCT]) -> NDArray[_SCT]: ... @overload diff --git a/numpy/core/src/umath/loops_modulo.dispatch.c b/numpy/core/src/umath/loops_modulo.dispatch.c new file mode 100644 index 000000000000..d29a0179560f --- /dev/null +++ b/numpy/core/src/umath/loops_modulo.dispatch.c @@ -0,0 +1,5714 @@ +#line 1 "numpy/core/src/umath/loops_modulo.dispatch.c.src" + +/* + ***************************************************************************** + ** This file was autogenerated from a template DO NOT EDIT!!!! ** + ** Changes should be made to the original source (.src) file ** + ***************************************************************************** + */ + +#line 1 +/*@targets + ** baseline vsx4 + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +#if NPY_SIMD && defined(NPY_HAVE_VSX4) +typedef struct { + npyv_u32x2 hi; + npyv_u32x2 lo; +} vsx4_u32x4; + +typedef struct { + npyv_s32x2 hi; + npyv_s32x2 lo; +} vsx4_s32x4; + +// Converts 1 8-bit vector into 2 16-bit vectors +NPY_FINLINE npyv_s16x2 +vsx4_expand_s16_s8(npyv_s8 data) +{ + npyv_s16x2 r; + r.val[0] = vec_unpackh(data); + r.val[1] = vec_unpackl(data); + return r; +} + +// Converts 1 16-bit vector into 2 32-bit vectors +NPY_FINLINE npyv_s32x2 +vsx4_expand_s32_s16(npyv_s16 data) +{ + npyv_s32x2 r; + r.val[0] = vec_unpackh(data); + r.val[1] = vec_unpackl(data); + return r; +} + +#line 50 +// Converts 1 8-bit vector into 4 32-bit vectors +NPY_FINLINE vsx4_u32x4 +vsx4_expand_u32_u8(npyv_u8 data) +{ + vsx4_u32x4 r; + npyv_u16x2 expand = npyv_expand_u16_u8(data); + r.hi = npyv_expand_u32_u16(expand.val[0]); + r.lo = npyv_expand_u32_u16(expand.val[1]); + return r; +} + +#line 64 +/* + * Computes division/modulo of 2 8-bit signed/unsigned integer vectors + * + * As Power10 only supports integer vector division/modulo for data of 32 bits + * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer + * vector division/modulo instruction, and then, convert the result back to + * npyv_u8. 
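+ *
+ * For example (unsigned): a u8 lane holding 200 divided by a lane holding 7 is
+ * widened to u32, vec_div yields 28, and two vec_pack steps (u32->u16->u8)
+ * narrow the result back. An unsigned u8/u8 quotient or remainder always fits
+ * in 8 bits, so the narrowing loses no information.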
+ */ +NPY_FINLINE npyv_u8 +vsx4_div_u8(npyv_u8 a, npyv_u8 b) +{ + vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); + vsx4_u32x4 b_expand = vsx4_expand_u32_u8(b); + npyv_u32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_u32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_u32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_u32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_u16 hi = vec_pack(v1, v2); + npyv_u16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_u8 +vsx4_div_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand) +{ + vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); + npyv_u32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_u32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_u32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_u32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_u16 hi = vec_pack(v1, v2); + npyv_u16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_u16 +vsx4_div_u16(npyv_u16 a, npyv_u16 b) +{ + npyv_u32x2 a_expand = npyv_expand_u32_u16(a); + npyv_u32x2 b_expand = npyv_expand_u32_u16(b); + npyv_u32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); + npyv_u32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +NPY_FINLINE npyv_u16 +vsx4_div_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand) +{ + npyv_u32x2 a_expand = npyv_expand_u32_u16(a); + npyv_u32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); + npyv_u32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +#define vsx4_div_u32 vec_div +#define vsx4_div_u64 vec_div +#define vsx4_div_scalar_u32 vec_div +#define vsx4_div_scalar_u64 vec_div + +#line 64 +/* + * Computes division/modulo of 2 8-bit signed/unsigned integer vectors + * + * As Power10 only supports integer vector division/modulo for data of 32 bits + * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer + * vector division/modulo instruction, and then, convert the result back to + * npyv_u8. 
+ */ +NPY_FINLINE npyv_u8 +vsx4_mod_u8(npyv_u8 a, npyv_u8 b) +{ + vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); + vsx4_u32x4 b_expand = vsx4_expand_u32_u8(b); + npyv_u32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_u32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_u32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_u32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_u16 hi = vec_pack(v1, v2); + npyv_u16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_u8 +vsx4_mod_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand) +{ + vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); + npyv_u32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_u32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_u32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_u32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_u16 hi = vec_pack(v1, v2); + npyv_u16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_u16 +vsx4_mod_u16(npyv_u16 a, npyv_u16 b) +{ + npyv_u32x2 a_expand = npyv_expand_u32_u16(a); + npyv_u32x2 b_expand = npyv_expand_u32_u16(b); + npyv_u32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); + npyv_u32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +NPY_FINLINE npyv_u16 +vsx4_mod_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand) +{ + npyv_u32x2 a_expand = npyv_expand_u32_u16(a); + npyv_u32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); + npyv_u32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +#define vsx4_mod_u32 vec_mod +#define vsx4_mod_u64 vec_mod +#define vsx4_mod_scalar_u32 vec_mod +#define vsx4_mod_scalar_u64 vec_mod + + +#line 50 +// Converts 1 8-bit vector into 4 32-bit vectors +NPY_FINLINE vsx4_s32x4 +vsx4_expand_s32_s8(npyv_s8 data) +{ + vsx4_s32x4 r; + npyv_s16x2 expand = vsx4_expand_s16_s8(data); + r.hi = vsx4_expand_s32_s16(expand.val[0]); + r.lo = vsx4_expand_s32_s16(expand.val[1]); + return r; +} + +#line 64 +/* + * Computes division/modulo of 2 8-bit signed/unsigned integer vectors + * + * As Power10 only supports integer vector division/modulo for data of 32 bits + * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer + * vector division/modulo instruction, and then, convert the result back to + * npyv_u8. 
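+ *
+ * For the signed variants the same scheme applies, but vec_unpackh/vec_unpackl
+ * sign-extend each lane while widening. Every s8 quotient and remainder fits
+ * back into 8 bits except NPY_MIN_INT8 / -1, which overflows; the divmod loops
+ * below detect that pair separately, set the divide-by-zero FP status and zero
+ * the outputs.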
+ */ +NPY_FINLINE npyv_s8 +vsx4_div_s8(npyv_s8 a, npyv_s8 b) +{ + vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); + vsx4_s32x4 b_expand = vsx4_expand_s32_s8(b); + npyv_s32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_s32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_s32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_s32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_s16 hi = vec_pack(v1, v2); + npyv_s16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_s8 +vsx4_div_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand) +{ + vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); + npyv_s32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_s32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_s32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_s32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_s16 hi = vec_pack(v1, v2); + npyv_s16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_s16 +vsx4_div_s16(npyv_s16 a, npyv_s16 b) +{ + npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); + npyv_s32x2 b_expand = vsx4_expand_s32_s16(b); + npyv_s32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); + npyv_s32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +NPY_FINLINE npyv_s16 +vsx4_div_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand) +{ + npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); + npyv_s32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); + npyv_s32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +#define vsx4_div_s32 vec_div +#define vsx4_div_s64 vec_div +#define vsx4_div_scalar_s32 vec_div +#define vsx4_div_scalar_s64 vec_div + +#line 64 +/* + * Computes division/modulo of 2 8-bit signed/unsigned integer vectors + * + * As Power10 only supports integer vector division/modulo for data of 32 bits + * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer + * vector division/modulo instruction, and then, convert the result back to + * npyv_u8. 
+ */ +NPY_FINLINE npyv_s8 +vsx4_mod_s8(npyv_s8 a, npyv_s8 b) +{ + vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); + vsx4_s32x4 b_expand = vsx4_expand_s32_s8(b); + npyv_s32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_s32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_s32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_s32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_s16 hi = vec_pack(v1, v2); + npyv_s16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_s8 +vsx4_mod_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand) +{ + vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); + npyv_s32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); + npyv_s32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); + npyv_s32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); + npyv_s32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); + npyv_s16 hi = vec_pack(v1, v2); + npyv_s16 lo = vec_pack(v3, v4); + return vec_pack(hi, lo); +} + +NPY_FINLINE npyv_s16 +vsx4_mod_s16(npyv_s16 a, npyv_s16 b) +{ + npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); + npyv_s32x2 b_expand = vsx4_expand_s32_s16(b); + npyv_s32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); + npyv_s32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +NPY_FINLINE npyv_s16 +vsx4_mod_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand) +{ + npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); + npyv_s32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); + npyv_s32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); + return vec_pack(v1, v2); +} + +#define vsx4_mod_s32 vec_mod +#define vsx4_mod_s64 vec_mod +#define vsx4_mod_scalar_s32 vec_mod +#define vsx4_mod_scalar_s64 vec_mod + + + +#line 131 +// Generates the divisor for the division/modulo operations +NPY_FINLINE vsx4_u32x4 +vsx4_divisor_u8(const npyv_u8 vscalar) +{ + return vsx4_expand_u32_u8(vscalar); +} + +#line 131 +// Generates the divisor for the division/modulo operations +NPY_FINLINE npyv_u32x2 +vsx4_divisor_u16(const npyv_u16 vscalar) +{ + return npyv_expand_u32_u16(vscalar); +} + +#line 131 +// Generates the divisor for the division/modulo operations +NPY_FINLINE vsx4_s32x4 +vsx4_divisor_s8(const npyv_s8 vscalar) +{ + return vsx4_expand_s32_s8(vscalar); +} + +#line 131 +// Generates the divisor for the division/modulo operations +NPY_FINLINE npyv_s32x2 +vsx4_divisor_s16(const npyv_s16 vscalar) +{ + return vsx4_expand_s32_s16(vscalar); +} + + +#line 142 +NPY_FINLINE npyv_u32 +vsx4_divisor_u32(const npyv_u32 vscalar) +{ + return vscalar; +} + +#line 142 +NPY_FINLINE npyv_u64 +vsx4_divisor_u64(const npyv_u64 vscalar) +{ + return vscalar; +} + +#line 142 +NPY_FINLINE npyv_s32 +vsx4_divisor_s32(const npyv_s32 vscalar) +{ + return vscalar; +} + +#line 142 +NPY_FINLINE npyv_s64 +vsx4_divisor_s64(const npyv_s64 vscalar) +{ + return vscalar; +} + + +#line 155 +#line 159 +static NPY_INLINE void +vsx4_simd_fmod_contig_u8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 vzero = npyv_zero_u8(); + const int vstep = npyv_nlanes_u8; +#if 0 == 2 /* divmod */ + npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; + const npyv_u8 vneg_one = npyv_setall_u8(-1); + npyv_b8 warn = npyv_cvt_b8_u8(npyv_zero_u8()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 
b = npyv_load_u8(src2); + npyv_u8 quo = vsx4_div_u8(a, b); + npyv_u8 rem = npyv_sub_u8(a, vec_mul(b, quo)); + npyv_b8 bzero = npyv_cmpeq_u8(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u8 cvtozero = npyv_select_u8(bzero, vzero, vneg_one); + warn = npyv_or_u8(bzero, warn); + npyv_store_u8(dst1, quo); + npyv_store_u8(dst2, npyv_and_u8(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u8 a = *src1; + const npyv_lanetype_u8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 b = npyv_load_u8(src2); + npyv_u8 c = vsx4_mod_u8(a, b); + npyv_store_u8(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u8 a = *src1; + const npyv_lanetype_u8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_u8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; + const int vstep = npyv_nlanes_u8; + const npyv_u8 vscalar = npyv_setall_u8(scalar); + const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar); +#if 0 == 2 /* divmod */ + npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 quo = vsx4_div_scalar_u8(a, divisor); + npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo)); + npyv_store_u8(dst1, quo); + npyv_store_u8(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u8 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 c = vsx4_mod_scalar_u8(a, divisor); + npyv_store_u8(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u8 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_remainder_contig_u8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 vzero = npyv_zero_u8(); + const int vstep = npyv_nlanes_u8; +#if 1 == 2 /* divmod */ + npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; + const npyv_u8 vneg_one = npyv_setall_u8(-1); + npyv_b8 warn = npyv_cvt_b8_u8(npyv_zero_u8()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 b = npyv_load_u8(src2); + npyv_u8 quo = vsx4_div_u8(a, b); + npyv_u8 rem = npyv_sub_u8(a, vec_mul(b, quo)); + npyv_b8 bzero = npyv_cmpeq_u8(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u8 cvtozero = npyv_select_u8(bzero, vzero, vneg_one); + warn = 
npyv_or_u8(bzero, warn); + npyv_store_u8(dst1, quo); + npyv_store_u8(dst2, npyv_and_u8(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u8 a = *src1; + const npyv_lanetype_u8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 b = npyv_load_u8(src2); + npyv_u8 c = vsx4_mod_u8(a, b); + npyv_store_u8(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u8 a = *src1; + const npyv_lanetype_u8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_remainder_by_scalar_contig_u8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; + const int vstep = npyv_nlanes_u8; + const npyv_u8 vscalar = npyv_setall_u8(scalar); + const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar); +#if 1 == 2 /* divmod */ + npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 quo = vsx4_div_scalar_u8(a, divisor); + npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo)); + npyv_store_u8(dst1, quo); + npyv_store_u8(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u8 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 c = vsx4_mod_scalar_u8(a, divisor); + npyv_store_u8(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u8 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_divmod_contig_u8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; + const npyv_u8 vzero = npyv_zero_u8(); + const int vstep = npyv_nlanes_u8; +#if 2 == 2 /* divmod */ + npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; + const npyv_u8 vneg_one = npyv_setall_u8(-1); + npyv_b8 warn = npyv_cvt_b8_u8(npyv_zero_u8()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 b = npyv_load_u8(src2); + npyv_u8 quo = vsx4_div_u8(a, b); + npyv_u8 rem = npyv_sub_u8(a, vec_mul(b, quo)); + npyv_b8 bzero = npyv_cmpeq_u8(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u8 cvtozero = npyv_select_u8(bzero, vzero, vneg_one); + warn = npyv_or_u8(bzero, warn); + npyv_store_u8(dst1, quo); + npyv_store_u8(dst2, npyv_and_u8(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u8 a = *src1; + const 
npyv_lanetype_u8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 b = npyv_load_u8(src2); + npyv_u8 c = vsx4_mod_u8(a, b); + npyv_store_u8(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u8 a = *src1; + const npyv_lanetype_u8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_u8(char **args, npy_intp len) +{ + npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; + npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; + npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; + const int vstep = npyv_nlanes_u8; + const npyv_u8 vscalar = npyv_setall_u8(scalar); + const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar); +#if 2 == 2 /* divmod */ + npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 quo = vsx4_div_scalar_u8(a, divisor); + npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo)); + npyv_store_u8(dst1, quo); + npyv_store_u8(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u8 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u8 a = npyv_load_u8(src1); + npyv_u8 c = vsx4_mod_scalar_u8(a, divisor); + npyv_store_u8(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u8 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + + +#line 155 +#line 159 +static NPY_INLINE void +vsx4_simd_fmod_contig_u16(char **args, npy_intp len) +{ + npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; + npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; + npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; + const npyv_u16 vzero = npyv_zero_u16(); + const int vstep = npyv_nlanes_u16; +#if 0 == 2 /* divmod */ + npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; + const npyv_u16 vneg_one = npyv_setall_u16(-1); + npyv_b16 warn = npyv_cvt_b16_u16(npyv_zero_u16()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 b = npyv_load_u16(src2); + npyv_u16 quo = vsx4_div_u16(a, b); + npyv_u16 rem = npyv_sub_u16(a, vec_mul(b, quo)); + npyv_b16 bzero = npyv_cmpeq_u16(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u16 cvtozero = npyv_select_u16(bzero, vzero, vneg_one); + warn = npyv_or_u16(bzero, warn); + npyv_store_u16(dst1, quo); + npyv_store_u16(dst2, npyv_and_u16(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u16 a = *src1; + const npyv_lanetype_u16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, 
src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 b = npyv_load_u16(src2); + npyv_u16 c = vsx4_mod_u16(a, b); + npyv_store_u16(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u16 a = *src1; + const npyv_lanetype_u16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_u16(char **args, npy_intp len) +{ + npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; + npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; + npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; + const int vstep = npyv_nlanes_u16; + const npyv_u16 vscalar = npyv_setall_u16(scalar); + const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar); +#if 0 == 2 /* divmod */ + npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 quo = vsx4_div_scalar_u16(a, divisor); + npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo)); + npyv_store_u16(dst1, quo); + npyv_store_u16(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u16 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 c = vsx4_mod_scalar_u16(a, divisor); + npyv_store_u16(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u16 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_remainder_contig_u16(char **args, npy_intp len) +{ + npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; + npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; + npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; + const npyv_u16 vzero = npyv_zero_u16(); + const int vstep = npyv_nlanes_u16; +#if 1 == 2 /* divmod */ + npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; + const npyv_u16 vneg_one = npyv_setall_u16(-1); + npyv_b16 warn = npyv_cvt_b16_u16(npyv_zero_u16()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 b = npyv_load_u16(src2); + npyv_u16 quo = vsx4_div_u16(a, b); + npyv_u16 rem = npyv_sub_u16(a, vec_mul(b, quo)); + npyv_b16 bzero = npyv_cmpeq_u16(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u16 cvtozero = npyv_select_u16(bzero, vzero, vneg_one); + warn = npyv_or_u16(bzero, warn); + npyv_store_u16(dst1, quo); + npyv_store_u16(dst2, npyv_and_u16(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u16 a = *src1; + const npyv_lanetype_u16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 b = npyv_load_u16(src2); + npyv_u16 c = vsx4_mod_u16(a, b); + npyv_store_u16(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, 
vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u16 a = *src1; + const npyv_lanetype_u16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_remainder_by_scalar_contig_u16(char **args, npy_intp len) +{ + npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; + npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; + npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; + const int vstep = npyv_nlanes_u16; + const npyv_u16 vscalar = npyv_setall_u16(scalar); + const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar); +#if 1 == 2 /* divmod */ + npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 quo = vsx4_div_scalar_u16(a, divisor); + npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo)); + npyv_store_u16(dst1, quo); + npyv_store_u16(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u16 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 c = vsx4_mod_scalar_u16(a, divisor); + npyv_store_u16(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u16 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_divmod_contig_u16(char **args, npy_intp len) +{ + npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; + npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; + npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; + const npyv_u16 vzero = npyv_zero_u16(); + const int vstep = npyv_nlanes_u16; +#if 2 == 2 /* divmod */ + npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; + const npyv_u16 vneg_one = npyv_setall_u16(-1); + npyv_b16 warn = npyv_cvt_b16_u16(npyv_zero_u16()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 b = npyv_load_u16(src2); + npyv_u16 quo = vsx4_div_u16(a, b); + npyv_u16 rem = npyv_sub_u16(a, vec_mul(b, quo)); + npyv_b16 bzero = npyv_cmpeq_u16(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u16 cvtozero = npyv_select_u16(bzero, vzero, vneg_one); + warn = npyv_or_u16(bzero, warn); + npyv_store_u16(dst1, quo); + npyv_store_u16(dst2, npyv_and_u16(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u16 a = *src1; + const npyv_lanetype_u16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 b = npyv_load_u16(src2); + npyv_u16 c = vsx4_mod_u16(a, b); + npyv_store_u16(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u16 a = *src1; + const npyv_lanetype_u16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + 
npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_u16(char **args, npy_intp len) +{ + npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; + npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; + npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; + const int vstep = npyv_nlanes_u16; + const npyv_u16 vscalar = npyv_setall_u16(scalar); + const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar); +#if 2 == 2 /* divmod */ + npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 quo = vsx4_div_scalar_u16(a, divisor); + npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo)); + npyv_store_u16(dst1, quo); + npyv_store_u16(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u16 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u16 a = npyv_load_u16(src1); + npyv_u16 c = vsx4_mod_scalar_u16(a, divisor); + npyv_store_u16(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u16 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + + +#line 155 +#line 159 +static NPY_INLINE void +vsx4_simd_fmod_contig_u32(char **args, npy_intp len) +{ + npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; + npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; + npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; + const npyv_u32 vzero = npyv_zero_u32(); + const int vstep = npyv_nlanes_u32; +#if 0 == 2 /* divmod */ + npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; + const npyv_u32 vneg_one = npyv_setall_u32(-1); + npyv_b32 warn = npyv_cvt_b32_u32(npyv_zero_u32()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 b = npyv_load_u32(src2); + npyv_u32 quo = vsx4_div_u32(a, b); + npyv_u32 rem = npyv_sub_u32(a, vec_mul(b, quo)); + npyv_b32 bzero = npyv_cmpeq_u32(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u32 cvtozero = npyv_select_u32(bzero, vzero, vneg_one); + warn = npyv_or_u32(bzero, warn); + npyv_store_u32(dst1, quo); + npyv_store_u32(dst2, npyv_and_u32(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u32 a = *src1; + const npyv_lanetype_u32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 b = npyv_load_u32(src2); + npyv_u32 c = vsx4_mod_u32(a, b); + npyv_store_u32(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u32 a = *src1; + const npyv_lanetype_u32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_u32(char **args, npy_intp len) +{ + 
npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; + npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; + npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; + const int vstep = npyv_nlanes_u32; + const npyv_u32 vscalar = npyv_setall_u32(scalar); + const npyv_u32 divisor = vsx4_divisor_u32(vscalar); +#if 0 == 2 /* divmod */ + npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 quo = vsx4_div_scalar_u32(a, divisor); + npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo)); + npyv_store_u32(dst1, quo); + npyv_store_u32(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u32 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 c = vsx4_mod_scalar_u32(a, divisor); + npyv_store_u32(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u32 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_remainder_contig_u32(char **args, npy_intp len) +{ + npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; + npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; + npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; + const npyv_u32 vzero = npyv_zero_u32(); + const int vstep = npyv_nlanes_u32; +#if 1 == 2 /* divmod */ + npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; + const npyv_u32 vneg_one = npyv_setall_u32(-1); + npyv_b32 warn = npyv_cvt_b32_u32(npyv_zero_u32()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 b = npyv_load_u32(src2); + npyv_u32 quo = vsx4_div_u32(a, b); + npyv_u32 rem = npyv_sub_u32(a, vec_mul(b, quo)); + npyv_b32 bzero = npyv_cmpeq_u32(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u32 cvtozero = npyv_select_u32(bzero, vzero, vneg_one); + warn = npyv_or_u32(bzero, warn); + npyv_store_u32(dst1, quo); + npyv_store_u32(dst2, npyv_and_u32(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u32 a = *src1; + const npyv_lanetype_u32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 b = npyv_load_u32(src2); + npyv_u32 c = vsx4_mod_u32(a, b); + npyv_store_u32(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u32 a = *src1; + const npyv_lanetype_u32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_remainder_by_scalar_contig_u32(char **args, npy_intp len) +{ + npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; + npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; + npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; + const int vstep = npyv_nlanes_u32; 
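+    // Broadcast the scalar divisor once, outside the loop. For 32-bit lanes
+    // vsx4_divisor_u32 is an identity; unlike the u8/u16 paths, no pre-widening
+    // is needed.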
+ const npyv_u32 vscalar = npyv_setall_u32(scalar); + const npyv_u32 divisor = vsx4_divisor_u32(vscalar); +#if 1 == 2 /* divmod */ + npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 quo = vsx4_div_scalar_u32(a, divisor); + npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo)); + npyv_store_u32(dst1, quo); + npyv_store_u32(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u32 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 c = vsx4_mod_scalar_u32(a, divisor); + npyv_store_u32(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u32 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_divmod_contig_u32(char **args, npy_intp len) +{ + npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; + npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; + npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; + const npyv_u32 vzero = npyv_zero_u32(); + const int vstep = npyv_nlanes_u32; +#if 2 == 2 /* divmod */ + npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; + const npyv_u32 vneg_one = npyv_setall_u32(-1); + npyv_b32 warn = npyv_cvt_b32_u32(npyv_zero_u32()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 b = npyv_load_u32(src2); + npyv_u32 quo = vsx4_div_u32(a, b); + npyv_u32 rem = npyv_sub_u32(a, vec_mul(b, quo)); + npyv_b32 bzero = npyv_cmpeq_u32(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u32 cvtozero = npyv_select_u32(bzero, vzero, vneg_one); + warn = npyv_or_u32(bzero, warn); + npyv_store_u32(dst1, quo); + npyv_store_u32(dst2, npyv_and_u32(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u32 a = *src1; + const npyv_lanetype_u32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 b = npyv_load_u32(src2); + npyv_u32 c = vsx4_mod_u32(a, b); + npyv_store_u32(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u32 a = *src1; + const npyv_lanetype_u32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_u32(char **args, npy_intp len) +{ + npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; + npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; + npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; + const int vstep = npyv_nlanes_u32; + const npyv_u32 vscalar = npyv_setall_u32(scalar); + const npyv_u32 divisor = vsx4_divisor_u32(vscalar); +#if 2 == 2 /* divmod */ + npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; + + for (; len >= vstep; len 
-= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 quo = vsx4_div_scalar_u32(a, divisor); + npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo)); + npyv_store_u32(dst1, quo); + npyv_store_u32(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u32 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u32 a = npyv_load_u32(src1); + npyv_u32 c = vsx4_mod_scalar_u32(a, divisor); + npyv_store_u32(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u32 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + + +#line 155 +#line 159 +static NPY_INLINE void +vsx4_simd_fmod_contig_u64(char **args, npy_intp len) +{ + npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; + npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1]; + npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; + const npyv_u64 vzero = npyv_zero_u64(); + const int vstep = npyv_nlanes_u64; +#if 0 == 2 /* divmod */ + npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; + const npyv_u64 vneg_one = npyv_setall_u64(-1); + npyv_b64 warn = npyv_cvt_b64_u64(npyv_zero_u64()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 b = npyv_load_u64(src2); + npyv_u64 quo = vsx4_div_u64(a, b); + npyv_u64 rem = npyv_sub_u64(a, vec_mul(b, quo)); + npyv_b64 bzero = npyv_cmpeq_u64(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u64 cvtozero = npyv_select_u64(bzero, vzero, vneg_one); + warn = npyv_or_u64(bzero, warn); + npyv_store_u64(dst1, quo); + npyv_store_u64(dst2, npyv_and_u64(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u64 a = *src1; + const npyv_lanetype_u64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 b = npyv_load_u64(src2); + npyv_u64 c = vsx4_mod_u64(a, b); + npyv_store_u64(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u64 a = *src1; + const npyv_lanetype_u64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_u64(char **args, npy_intp len) +{ + npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; + npyv_lanetype_u64 scalar = *(npyv_lanetype_u64 *) args[1]; + npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; + const int vstep = npyv_nlanes_u64; + const npyv_u64 vscalar = npyv_setall_u64(scalar); + const npyv_u64 divisor = vsx4_divisor_u64(vscalar); +#if 0 == 2 /* divmod */ + npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 quo = vsx4_div_scalar_u64(a, divisor); + npyv_u64 rem = npyv_sub_u64(a, vec_mul(vscalar, quo)); + 
npyv_store_u64(dst1, quo); + npyv_store_u64(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u64 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 c = vsx4_mod_scalar_u64(a, divisor); + npyv_store_u64(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u64 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_remainder_contig_u64(char **args, npy_intp len) +{ + npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; + npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1]; + npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; + const npyv_u64 vzero = npyv_zero_u64(); + const int vstep = npyv_nlanes_u64; +#if 1 == 2 /* divmod */ + npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; + const npyv_u64 vneg_one = npyv_setall_u64(-1); + npyv_b64 warn = npyv_cvt_b64_u64(npyv_zero_u64()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 b = npyv_load_u64(src2); + npyv_u64 quo = vsx4_div_u64(a, b); + npyv_u64 rem = npyv_sub_u64(a, vec_mul(b, quo)); + npyv_b64 bzero = npyv_cmpeq_u64(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u64 cvtozero = npyv_select_u64(bzero, vzero, vneg_one); + warn = npyv_or_u64(bzero, warn); + npyv_store_u64(dst1, quo); + npyv_store_u64(dst2, npyv_and_u64(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u64 a = *src1; + const npyv_lanetype_u64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 b = npyv_load_u64(src2); + npyv_u64 c = vsx4_mod_u64(a, b); + npyv_store_u64(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u64 a = *src1; + const npyv_lanetype_u64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_remainder_by_scalar_contig_u64(char **args, npy_intp len) +{ + npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; + npyv_lanetype_u64 scalar = *(npyv_lanetype_u64 *) args[1]; + npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; + const int vstep = npyv_nlanes_u64; + const npyv_u64 vscalar = npyv_setall_u64(scalar); + const npyv_u64 divisor = vsx4_divisor_u64(vscalar); +#if 1 == 2 /* divmod */ + npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 quo = vsx4_div_scalar_u64(a, divisor); + npyv_u64 rem = npyv_sub_u64(a, vec_mul(vscalar, quo)); + npyv_store_u64(dst1, quo); + npyv_store_u64(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u64 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod 
and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 c = vsx4_mod_scalar_u64(a, divisor); + npyv_store_u64(dst1, c); + } + + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_u64 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + +#line 159 +static NPY_INLINE void +vsx4_simd_divmod_contig_u64(char **args, npy_intp len) +{ + npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; + npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1]; + npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; + const npyv_u64 vzero = npyv_zero_u64(); + const int vstep = npyv_nlanes_u64; +#if 2 == 2 /* divmod */ + npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; + const npyv_u64 vneg_one = npyv_setall_u64(-1); + npyv_b64 warn = npyv_cvt_b64_u64(npyv_zero_u64()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 b = npyv_load_u64(src2); + npyv_u64 quo = vsx4_div_u64(a, b); + npyv_u64 rem = npyv_sub_u64(a, vec_mul(b, quo)); + npyv_b64 bzero = npyv_cmpeq_u64(b, vzero); + // when b is 0, 'cvtozero' forces the modulo to be 0 too + npyv_u64 cvtozero = npyv_select_u64(bzero, vzero, vneg_one); + warn = npyv_or_u64(bzero, warn); + npyv_store_u64(dst1, quo); + npyv_store_u64(dst2, npyv_and_u64(cvtozero, rem)); + } + + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_u64 a = *src1; + const npyv_lanetype_u64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } else{ + *dst1 = a / b; + *dst2 = a % b; + } + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 b = npyv_load_u64(src2); + npyv_u64 c = vsx4_mod_u64(a, b); + npyv_store_u64(dst1, c); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_u64 a = *src1; + const npyv_lanetype_u64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_u64(char **args, npy_intp len) +{ + npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; + npyv_lanetype_u64 scalar = *(npyv_lanetype_u64 *) args[1]; + npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; + const int vstep = npyv_nlanes_u64; + const npyv_u64 vscalar = npyv_setall_u64(scalar); + const npyv_u64 divisor = vsx4_divisor_u64(vscalar); +#if 2 == 2 /* divmod */ + npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 quo = vsx4_div_scalar_u64(a, divisor); + npyv_u64 rem = npyv_sub_u64(a, vec_mul(vscalar, quo)); + npyv_store_u64(dst1, quo); + npyv_store_u64(dst2, rem); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_u64 a = *src1; + *dst1 = a / scalar; + *dst2 = a % scalar; + } +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { + npyv_u64 a = npyv_load_u64(src1); + npyv_u64 c = vsx4_mod_scalar_u64(a, divisor); + npyv_store_u64(dst1, c); + } + + for (; len > 
0; --len, ++src1, ++dst1) { + const npyv_lanetype_u64 a = *src1; + *dst1 = a % scalar; + } +#endif + npyv_cleanup(); +} + + + +#line 277 +#line 281 +static NPY_INLINE void +vsx4_simd_fmod_contig_s8(char **args, npy_intp len) +{ + npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; + npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1]; + npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2]; + const npyv_s8 vzero = npyv_zero_s8(); + const int vstep = npyv_nlanes_s8; +#if 0 == 2 /* divmod */ + npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3]; + const npyv_s8 vneg_one = npyv_setall_s8(-1); + const npyv_s8 vmin = npyv_setall_s8(NPY_MIN_INT8); + npyv_b8 warn = npyv_cvt_b8_s8(npyv_zero_s8()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s8 a = npyv_load_s8(src1); + npyv_s8 b = npyv_load_s8(src2); +#if 0 <= 1 /* fmod and remainder */ + npyv_s8 rem = vsx4_mod_s8(a, b); +#else /* divmod */ + npyv_s8 quo = vsx4_div_s8(a, b); + npyv_s8 rem = npyv_sub_s8(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT8 && b == -1)) + npyv_b8 bzero = npyv_cmpeq_s8(b, vzero); + npyv_b8 amin = npyv_cmpeq_s8(a, vmin); + npyv_b8 bneg_one = npyv_cmpeq_s8(b, vneg_one); + npyv_b8 overflow = npyv_and_s8(bneg_one, amin); + npyv_b8 error = npyv_or_s8(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s8 cvtozero = npyv_select_s8(error, vzero, vneg_one); + warn = npyv_or_s8(error, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero); + npyv_b8 b_gt_zero = npyv_cmpgt_s8(b, vzero); + npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero); + npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero); + npyv_b8 or = npyv_or_s8(ab_eq_cond, rem_zero); + npyv_s8 to_add = npyv_select_s8(or, vzero, b); + rem = npyv_add_s8(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one); + quo = npyv_add_s8(quo, to_sub); + npyv_store_s8(dst1, npyv_and_s8(cvtozero, quo)); + npyv_store_s8(dst2, npyv_and_s8(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s8(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s8 a = *src1; + const npyv_lanetype_s8 b = *src2; + if (b == 0 || (a == NPY_MIN_INT8 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s8 a = *src1; + const npyv_lanetype_s8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_s8(char **args, npy_intp len) +{ + npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; + npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1]; + 
npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2]; + const npyv_s8 vscalar = npyv_setall_s8(scalar); + const vsx4_s32x4 divisor = vsx4_divisor_s8(vscalar); + const int vstep = npyv_nlanes_s8; +#if 0 >= 1 /* remainder and divmod */ + const npyv_s8 vzero = npyv_zero_s8(); + npyv_b8 b_gt_zero = npyv_cmpgt_s8(vscalar, vzero); +#endif +#if 0 == 2 /* divmod */ + npyv_b8 warn = npyv_cvt_b8_s8(npyv_zero_s8()); + const npyv_s8 vmin = npyv_setall_s8(NPY_MIN_INT8); + const npyv_s8 vneg_one = npyv_setall_s8(-1); + npyv_b8 bneg_one = npyv_cmpeq_s8(vscalar, vneg_one); + npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s8 a = npyv_load_s8(src1); +#if 0 <= 1 /* fmod and remainder */ + npyv_s8 rem = vsx4_mod_scalar_s8(a, divisor); +#else /* divmod */ + npyv_s8 quo = vsx4_div_scalar_s8(a, divisor); + npyv_s8 rem = npyv_sub_s8(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT8 && b == -1) + npyv_b8 amin = npyv_cmpeq_s8(a, vmin); + npyv_b8 overflow = npyv_and_s8(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s8 cvtozero = npyv_select_s8(overflow, vzero, vneg_one); + warn = npyv_or_s8(overflow, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero); + npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero); + npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero); + npyv_b8 or = npyv_or_s8(ab_eq_cond, rem_zero); + npyv_s8 to_add = npyv_select_s8(or, vzero, vscalar); + rem = npyv_add_s8(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one); + quo = npyv_add_s8(quo, to_sub); + npyv_store_s8(dst1, npyv_and_s8(cvtozero, quo)); + npyv_store_s8(dst2, npyv_and_s8(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s8(dst1, rem); +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s8 a = *src1; + if (a == NPY_MIN_INT8 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s8 a = *src1; + *dst1 = a % scalar; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_remainder_contig_s8(char **args, npy_intp len) +{ + npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; + npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1]; + npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2]; + const npyv_s8 vzero = npyv_zero_s8(); + const int vstep = npyv_nlanes_s8; +#if 1 == 2 /* divmod */ + npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3]; + const npyv_s8 vneg_one = npyv_setall_s8(-1); + const npyv_s8 vmin = npyv_setall_s8(NPY_MIN_INT8); + npyv_b8 warn = npyv_cvt_b8_s8(npyv_zero_s8()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 
+= vstep, + dst1 += vstep) { +#endif + npyv_s8 a = npyv_load_s8(src1); + npyv_s8 b = npyv_load_s8(src2); +#if 1 <= 1 /* fmod and remainder */ + npyv_s8 rem = vsx4_mod_s8(a, b); +#else /* divmod */ + npyv_s8 quo = vsx4_div_s8(a, b); + npyv_s8 rem = npyv_sub_s8(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT8 && b == -1)) + npyv_b8 bzero = npyv_cmpeq_s8(b, vzero); + npyv_b8 amin = npyv_cmpeq_s8(a, vmin); + npyv_b8 bneg_one = npyv_cmpeq_s8(b, vneg_one); + npyv_b8 overflow = npyv_and_s8(bneg_one, amin); + npyv_b8 error = npyv_or_s8(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s8 cvtozero = npyv_select_s8(error, vzero, vneg_one); + warn = npyv_or_s8(error, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero); + npyv_b8 b_gt_zero = npyv_cmpgt_s8(b, vzero); + npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero); + npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero); + npyv_b8 or = npyv_or_s8(ab_eq_cond, rem_zero); + npyv_s8 to_add = npyv_select_s8(or, vzero, b); + rem = npyv_add_s8(rem, to_add); +#endif +#if 1 == 2 /* divmod */ + npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one); + quo = npyv_add_s8(quo, to_sub); + npyv_store_s8(dst1, npyv_and_s8(cvtozero, quo)); + npyv_store_s8(dst2, npyv_and_s8(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s8(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s8 a = *src1; + const npyv_lanetype_s8 b = *src2; + if (b == 0 || (a == NPY_MIN_INT8 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s8 a = *src1; + const npyv_lanetype_s8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_remainder_by_scalar_contig_s8(char **args, npy_intp len) +{ + npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; + npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1]; + npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2]; + const npyv_s8 vscalar = npyv_setall_s8(scalar); + const vsx4_s32x4 divisor = vsx4_divisor_s8(vscalar); + const int vstep = npyv_nlanes_s8; +#if 1 >= 1 /* remainder and divmod */ + const npyv_s8 vzero = npyv_zero_s8(); + npyv_b8 b_gt_zero = npyv_cmpgt_s8(vscalar, vzero); +#endif +#if 1 == 2 /* divmod */ + npyv_b8 warn = npyv_cvt_b8_s8(npyv_zero_s8()); + const npyv_s8 vmin = npyv_setall_s8(NPY_MIN_INT8); + const npyv_s8 vneg_one = npyv_setall_s8(-1); + npyv_b8 bneg_one = npyv_cmpeq_s8(vscalar, vneg_one); + npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s8 a = npyv_load_s8(src1); +#if 1 <= 1 /* fmod and remainder */ + npyv_s8 
rem = vsx4_mod_scalar_s8(a, divisor); +#else /* divmod */ + npyv_s8 quo = vsx4_div_scalar_s8(a, divisor); + npyv_s8 rem = npyv_sub_s8(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT8 && b == -1) + npyv_b8 amin = npyv_cmpeq_s8(a, vmin); + npyv_b8 overflow = npyv_and_s8(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s8 cvtozero = npyv_select_s8(overflow, vzero, vneg_one); + warn = npyv_or_s8(overflow, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero); + npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero); + npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero); + npyv_b8 or = npyv_or_s8(ab_eq_cond, rem_zero); + npyv_s8 to_add = npyv_select_s8(or, vzero, vscalar); + rem = npyv_add_s8(rem, to_add); +#endif +#if 1 == 2 /* divmod */ + npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one); + quo = npyv_add_s8(quo, to_sub); + npyv_store_s8(dst1, npyv_and_s8(cvtozero, quo)); + npyv_store_s8(dst2, npyv_and_s8(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s8(dst1, rem); +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s8 a = *src1; + if (a == NPY_MIN_INT8 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s8 a = *src1; + *dst1 = a % scalar; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_divmod_contig_s8(char **args, npy_intp len) +{ + npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; + npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1]; + npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2]; + const npyv_s8 vzero = npyv_zero_s8(); + const int vstep = npyv_nlanes_s8; +#if 2 == 2 /* divmod */ + npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3]; + const npyv_s8 vneg_one = npyv_setall_s8(-1); + const npyv_s8 vmin = npyv_setall_s8(NPY_MIN_INT8); + npyv_b8 warn = npyv_cvt_b8_s8(npyv_zero_s8()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s8 a = npyv_load_s8(src1); + npyv_s8 b = npyv_load_s8(src2); +#if 2 <= 1 /* fmod and remainder */ + npyv_s8 rem = vsx4_mod_s8(a, b); +#else /* divmod */ + npyv_s8 quo = vsx4_div_s8(a, b); + npyv_s8 rem = npyv_sub_s8(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT8 && b == -1)) + npyv_b8 bzero = npyv_cmpeq_s8(b, vzero); + npyv_b8 amin = npyv_cmpeq_s8(a, vmin); + npyv_b8 bneg_one = npyv_cmpeq_s8(b, vneg_one); + npyv_b8 overflow = npyv_and_s8(bneg_one, amin); + npyv_b8 error = npyv_or_s8(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s8 cvtozero = npyv_select_s8(error, vzero, vneg_one); + warn = npyv_or_s8(error, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero); + 
npyv_b8 b_gt_zero = npyv_cmpgt_s8(b, vzero); + npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero); + npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero); + npyv_b8 or = npyv_or_s8(ab_eq_cond, rem_zero); + npyv_s8 to_add = npyv_select_s8(or, vzero, b); + rem = npyv_add_s8(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one); + quo = npyv_add_s8(quo, to_sub); + npyv_store_s8(dst1, npyv_and_s8(cvtozero, quo)); + npyv_store_s8(dst2, npyv_and_s8(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s8(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s8 a = *src1; + const npyv_lanetype_s8 b = *src2; + if (b == 0 || (a == NPY_MIN_INT8 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s8 a = *src1; + const npyv_lanetype_s8 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_s8(char **args, npy_intp len) +{ + npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; + npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1]; + npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2]; + const npyv_s8 vscalar = npyv_setall_s8(scalar); + const vsx4_s32x4 divisor = vsx4_divisor_s8(vscalar); + const int vstep = npyv_nlanes_s8; +#if 2 >= 1 /* remainder and divmod */ + const npyv_s8 vzero = npyv_zero_s8(); + npyv_b8 b_gt_zero = npyv_cmpgt_s8(vscalar, vzero); +#endif +#if 2 == 2 /* divmod */ + npyv_b8 warn = npyv_cvt_b8_s8(npyv_zero_s8()); + const npyv_s8 vmin = npyv_setall_s8(NPY_MIN_INT8); + const npyv_s8 vneg_one = npyv_setall_s8(-1); + npyv_b8 bneg_one = npyv_cmpeq_s8(vscalar, vneg_one); + npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s8 a = npyv_load_s8(src1); +#if 2 <= 1 /* fmod and remainder */ + npyv_s8 rem = vsx4_mod_scalar_s8(a, divisor); +#else /* divmod */ + npyv_s8 quo = vsx4_div_scalar_s8(a, divisor); + npyv_s8 rem = npyv_sub_s8(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT8 && b == -1) + npyv_b8 amin = npyv_cmpeq_s8(a, vmin); + npyv_b8 overflow = npyv_and_s8(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s8 cvtozero = npyv_select_s8(overflow, vzero, vneg_one); + warn = npyv_or_s8(overflow, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero); + npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero); + npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero); + npyv_b8 or = npyv_or_s8(ab_eq_cond, rem_zero); + npyv_s8 to_add = npyv_select_s8(or, vzero, vscalar); + rem = npyv_add_s8(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + 
npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one); + quo = npyv_add_s8(quo, to_sub); + npyv_store_s8(dst1, npyv_and_s8(cvtozero, quo)); + npyv_store_s8(dst2, npyv_and_s8(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s8(dst1, rem); +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s8 a = *src1; + if (a == NPY_MIN_INT8 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s8 a = *src1; + *dst1 = a % scalar; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + + +#line 277 +#line 281 +static NPY_INLINE void +vsx4_simd_fmod_contig_s16(char **args, npy_intp len) +{ + npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0]; + npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1]; + npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2]; + const npyv_s16 vzero = npyv_zero_s16(); + const int vstep = npyv_nlanes_s16; +#if 0 == 2 /* divmod */ + npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3]; + const npyv_s16 vneg_one = npyv_setall_s16(-1); + const npyv_s16 vmin = npyv_setall_s16(NPY_MIN_INT16); + npyv_b16 warn = npyv_cvt_b16_s16(npyv_zero_s16()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s16 a = npyv_load_s16(src1); + npyv_s16 b = npyv_load_s16(src2); +#if 0 <= 1 /* fmod and remainder */ + npyv_s16 rem = vsx4_mod_s16(a, b); +#else /* divmod */ + npyv_s16 quo = vsx4_div_s16(a, b); + npyv_s16 rem = npyv_sub_s16(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT16 && b == -1)) + npyv_b16 bzero = npyv_cmpeq_s16(b, vzero); + npyv_b16 amin = npyv_cmpeq_s16(a, vmin); + npyv_b16 bneg_one = npyv_cmpeq_s16(b, vneg_one); + npyv_b16 overflow = npyv_and_s16(bneg_one, amin); + npyv_b16 error = npyv_or_s16(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s16 cvtozero = npyv_select_s16(error, vzero, vneg_one); + warn = npyv_or_s16(error, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b16 a_gt_zero = npyv_cmpgt_s16(a, vzero); + npyv_b16 b_gt_zero = npyv_cmpgt_s16(b, vzero); + npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero); + npyv_b16 rem_zero = npyv_cmpeq_s16(rem, vzero); + npyv_b16 or = npyv_or_s16(ab_eq_cond, rem_zero); + npyv_s16 to_add = npyv_select_s16(or, vzero, b); + rem = npyv_add_s16(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one); + quo = npyv_add_s16(quo, to_sub); + npyv_store_s16(dst1, npyv_and_s16(cvtozero, quo)); + npyv_store_s16(dst2, npyv_and_s16(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s16(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + 
const npyv_lanetype_s16 a = *src1; + const npyv_lanetype_s16 b = *src2; + if (b == 0 || (a == NPY_MIN_INT16 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s16 a = *src1; + const npyv_lanetype_s16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_s16(char **args, npy_intp len) +{ + npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0]; + npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1]; + npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2]; + const npyv_s16 vscalar = npyv_setall_s16(scalar); + const npyv_s32x2 divisor = vsx4_divisor_s16(vscalar); + const int vstep = npyv_nlanes_s16; +#if 0 >= 1 /* remainder and divmod */ + const npyv_s16 vzero = npyv_zero_s16(); + npyv_b16 b_gt_zero = npyv_cmpgt_s16(vscalar, vzero); +#endif +#if 0 == 2 /* divmod */ + npyv_b16 warn = npyv_cvt_b16_s16(npyv_zero_s16()); + const npyv_s16 vmin = npyv_setall_s16(NPY_MIN_INT16); + const npyv_s16 vneg_one = npyv_setall_s16(-1); + npyv_b16 bneg_one = npyv_cmpeq_s16(vscalar, vneg_one); + npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s16 a = npyv_load_s16(src1); +#if 0 <= 1 /* fmod and remainder */ + npyv_s16 rem = vsx4_mod_scalar_s16(a, divisor); +#else /* divmod */ + npyv_s16 quo = vsx4_div_scalar_s16(a, divisor); + npyv_s16 rem = npyv_sub_s16(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT16 && b == -1) + npyv_b16 amin = npyv_cmpeq_s16(a, vmin); + npyv_b16 overflow = npyv_and_s16(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s16 cvtozero = npyv_select_s16(overflow, vzero, vneg_one); + warn = npyv_or_s16(overflow, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b16 a_gt_zero = npyv_cmpgt_s16(a, vzero); + npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero); + npyv_b16 rem_zero = npyv_cmpeq_s16(rem, vzero); + npyv_b16 or = npyv_or_s16(ab_eq_cond, rem_zero); + npyv_s16 to_add = npyv_select_s16(or, vzero, vscalar); + rem = npyv_add_s16(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one); + quo = npyv_add_s16(quo, to_sub); + npyv_store_s16(dst1, npyv_and_s16(cvtozero, quo)); + npyv_store_s16(dst2, npyv_and_s16(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s16(dst1, rem); +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s16 a = *src1; + if (a == NPY_MIN_INT16 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; 
--len, ++src1, ++dst1) { + const npyv_lanetype_s16 a = *src1; + *dst1 = a % scalar; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_remainder_contig_s16(char **args, npy_intp len) +{ + npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0]; + npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1]; + npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2]; + const npyv_s16 vzero = npyv_zero_s16(); + const int vstep = npyv_nlanes_s16; +#if 1 == 2 /* divmod */ + npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3]; + const npyv_s16 vneg_one = npyv_setall_s16(-1); + const npyv_s16 vmin = npyv_setall_s16(NPY_MIN_INT16); + npyv_b16 warn = npyv_cvt_b16_s16(npyv_zero_s16()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s16 a = npyv_load_s16(src1); + npyv_s16 b = npyv_load_s16(src2); +#if 1 <= 1 /* fmod and remainder */ + npyv_s16 rem = vsx4_mod_s16(a, b); +#else /* divmod */ + npyv_s16 quo = vsx4_div_s16(a, b); + npyv_s16 rem = npyv_sub_s16(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT16 && b == -1)) + npyv_b16 bzero = npyv_cmpeq_s16(b, vzero); + npyv_b16 amin = npyv_cmpeq_s16(a, vmin); + npyv_b16 bneg_one = npyv_cmpeq_s16(b, vneg_one); + npyv_b16 overflow = npyv_and_s16(bneg_one, amin); + npyv_b16 error = npyv_or_s16(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s16 cvtozero = npyv_select_s16(error, vzero, vneg_one); + warn = npyv_or_s16(error, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b16 a_gt_zero = npyv_cmpgt_s16(a, vzero); + npyv_b16 b_gt_zero = npyv_cmpgt_s16(b, vzero); + npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero); + npyv_b16 rem_zero = npyv_cmpeq_s16(rem, vzero); + npyv_b16 or = npyv_or_s16(ab_eq_cond, rem_zero); + npyv_s16 to_add = npyv_select_s16(or, vzero, b); + rem = npyv_add_s16(rem, to_add); +#endif +#if 1 == 2 /* divmod */ + npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one); + quo = npyv_add_s16(quo, to_sub); + npyv_store_s16(dst1, npyv_and_s16(cvtozero, quo)); + npyv_store_s16(dst2, npyv_and_s16(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s16(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s16 a = *src1; + const npyv_lanetype_s16 b = *src2; + if (b == 0 || (a == NPY_MIN_INT16 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s16 a = *src1; + const npyv_lanetype_s16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void 
+vsx4_simd_remainder_by_scalar_contig_s16(char **args, npy_intp len) +{ + npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0]; + npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1]; + npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2]; + const npyv_s16 vscalar = npyv_setall_s16(scalar); + const npyv_s32x2 divisor = vsx4_divisor_s16(vscalar); + const int vstep = npyv_nlanes_s16; +#if 1 >= 1 /* remainder and divmod */ + const npyv_s16 vzero = npyv_zero_s16(); + npyv_b16 b_gt_zero = npyv_cmpgt_s16(vscalar, vzero); +#endif +#if 1 == 2 /* divmod */ + npyv_b16 warn = npyv_cvt_b16_s16(npyv_zero_s16()); + const npyv_s16 vmin = npyv_setall_s16(NPY_MIN_INT16); + const npyv_s16 vneg_one = npyv_setall_s16(-1); + npyv_b16 bneg_one = npyv_cmpeq_s16(vscalar, vneg_one); + npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s16 a = npyv_load_s16(src1); +#if 1 <= 1 /* fmod and remainder */ + npyv_s16 rem = vsx4_mod_scalar_s16(a, divisor); +#else /* divmod */ + npyv_s16 quo = vsx4_div_scalar_s16(a, divisor); + npyv_s16 rem = npyv_sub_s16(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT16 && b == -1) + npyv_b16 amin = npyv_cmpeq_s16(a, vmin); + npyv_b16 overflow = npyv_and_s16(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s16 cvtozero = npyv_select_s16(overflow, vzero, vneg_one); + warn = npyv_or_s16(overflow, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b16 a_gt_zero = npyv_cmpgt_s16(a, vzero); + npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero); + npyv_b16 rem_zero = npyv_cmpeq_s16(rem, vzero); + npyv_b16 or = npyv_or_s16(ab_eq_cond, rem_zero); + npyv_s16 to_add = npyv_select_s16(or, vzero, vscalar); + rem = npyv_add_s16(rem, to_add); +#endif +#if 1 == 2 /* divmod */ + npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one); + quo = npyv_add_s16(quo, to_sub); + npyv_store_s16(dst1, npyv_and_s16(cvtozero, quo)); + npyv_store_s16(dst2, npyv_and_s16(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s16(dst1, rem); +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s16 a = *src1; + if (a == NPY_MIN_INT16 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s16 a = *src1; + *dst1 = a % scalar; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_divmod_contig_s16(char **args, npy_intp len) +{ + npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0]; + npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1]; + npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2]; + const npyv_s16 vzero = npyv_zero_s16(); + const int vstep = npyv_nlanes_s16; +#if 2 == 2 /* divmod */ + npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3]; + const npyv_s16 vneg_one = npyv_setall_s16(-1); + const npyv_s16 vmin = 
npyv_setall_s16(NPY_MIN_INT16); + npyv_b16 warn = npyv_cvt_b16_s16(npyv_zero_s16()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s16 a = npyv_load_s16(src1); + npyv_s16 b = npyv_load_s16(src2); +#if 2 <= 1 /* fmod and remainder */ + npyv_s16 rem = vsx4_mod_s16(a, b); +#else /* divmod */ + npyv_s16 quo = vsx4_div_s16(a, b); + npyv_s16 rem = npyv_sub_s16(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT16 && b == -1)) + npyv_b16 bzero = npyv_cmpeq_s16(b, vzero); + npyv_b16 amin = npyv_cmpeq_s16(a, vmin); + npyv_b16 bneg_one = npyv_cmpeq_s16(b, vneg_one); + npyv_b16 overflow = npyv_and_s16(bneg_one, amin); + npyv_b16 error = npyv_or_s16(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s16 cvtozero = npyv_select_s16(error, vzero, vneg_one); + warn = npyv_or_s16(error, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b16 a_gt_zero = npyv_cmpgt_s16(a, vzero); + npyv_b16 b_gt_zero = npyv_cmpgt_s16(b, vzero); + npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero); + npyv_b16 rem_zero = npyv_cmpeq_s16(rem, vzero); + npyv_b16 or = npyv_or_s16(ab_eq_cond, rem_zero); + npyv_s16 to_add = npyv_select_s16(or, vzero, b); + rem = npyv_add_s16(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one); + quo = npyv_add_s16(quo, to_sub); + npyv_store_s16(dst1, npyv_and_s16(cvtozero, quo)); + npyv_store_s16(dst2, npyv_and_s16(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s16(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s16 a = *src1; + const npyv_lanetype_s16 b = *src2; + if (b == 0 || (a == NPY_MIN_INT16 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s16 a = *src1; + const npyv_lanetype_s16 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_s16(char **args, npy_intp len) +{ + npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0]; + npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1]; + npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2]; + const npyv_s16 vscalar = npyv_setall_s16(scalar); + const npyv_s32x2 divisor = vsx4_divisor_s16(vscalar); + const int vstep = npyv_nlanes_s16; +#if 2 >= 1 /* remainder and divmod */ + const npyv_s16 vzero = npyv_zero_s16(); + npyv_b16 b_gt_zero = npyv_cmpgt_s16(vscalar, vzero); +#endif +#if 2 == 2 /* divmod */ + npyv_b16 warn = npyv_cvt_b16_s16(npyv_zero_s16()); + const npyv_s16 vmin = npyv_setall_s16(NPY_MIN_INT16); + const npyv_s16 vneg_one = npyv_setall_s16(-1); + npyv_b16 bneg_one = 
npyv_cmpeq_s16(vscalar, vneg_one); + npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s16 a = npyv_load_s16(src1); +#if 2 <= 1 /* fmod and remainder */ + npyv_s16 rem = vsx4_mod_scalar_s16(a, divisor); +#else /* divmod */ + npyv_s16 quo = vsx4_div_scalar_s16(a, divisor); + npyv_s16 rem = npyv_sub_s16(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT16 && b == -1) + npyv_b16 amin = npyv_cmpeq_s16(a, vmin); + npyv_b16 overflow = npyv_and_s16(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s16 cvtozero = npyv_select_s16(overflow, vzero, vneg_one); + warn = npyv_or_s16(overflow, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b16 a_gt_zero = npyv_cmpgt_s16(a, vzero); + npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero); + npyv_b16 rem_zero = npyv_cmpeq_s16(rem, vzero); + npyv_b16 or = npyv_or_s16(ab_eq_cond, rem_zero); + npyv_s16 to_add = npyv_select_s16(or, vzero, vscalar); + rem = npyv_add_s16(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one); + quo = npyv_add_s16(quo, to_sub); + npyv_store_s16(dst1, npyv_and_s16(cvtozero, quo)); + npyv_store_s16(dst2, npyv_and_s16(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s16(dst1, rem); +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s16 a = *src1; + if (a == NPY_MIN_INT16 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s16 a = *src1; + *dst1 = a % scalar; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + + +#line 277 +#line 281 +static NPY_INLINE void +vsx4_simd_fmod_contig_s32(char **args, npy_intp len) +{ + npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0]; + npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1]; + npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2]; + const npyv_s32 vzero = npyv_zero_s32(); + const int vstep = npyv_nlanes_s32; +#if 0 == 2 /* divmod */ + npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3]; + const npyv_s32 vneg_one = npyv_setall_s32(-1); + const npyv_s32 vmin = npyv_setall_s32(NPY_MIN_INT32); + npyv_b32 warn = npyv_cvt_b32_s32(npyv_zero_s32()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s32 a = npyv_load_s32(src1); + npyv_s32 b = npyv_load_s32(src2); +#if 0 <= 1 /* fmod and remainder */ + npyv_s32 rem = vsx4_mod_s32(a, b); +#else /* divmod */ + npyv_s32 quo = vsx4_div_s32(a, b); + npyv_s32 rem = npyv_sub_s32(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT32 && b == -1)) + npyv_b32 bzero = npyv_cmpeq_s32(b, vzero); + npyv_b32 amin = npyv_cmpeq_s32(a, vmin); + npyv_b32 bneg_one = 
npyv_cmpeq_s32(b, vneg_one); + npyv_b32 overflow = npyv_and_s32(bneg_one, amin); + npyv_b32 error = npyv_or_s32(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s32 cvtozero = npyv_select_s32(error, vzero, vneg_one); + warn = npyv_or_s32(error, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b32 a_gt_zero = npyv_cmpgt_s32(a, vzero); + npyv_b32 b_gt_zero = npyv_cmpgt_s32(b, vzero); + npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero); + npyv_b32 rem_zero = npyv_cmpeq_s32(rem, vzero); + npyv_b32 or = npyv_or_s32(ab_eq_cond, rem_zero); + npyv_s32 to_add = npyv_select_s32(or, vzero, b); + rem = npyv_add_s32(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one); + quo = npyv_add_s32(quo, to_sub); + npyv_store_s32(dst1, npyv_and_s32(cvtozero, quo)); + npyv_store_s32(dst2, npyv_and_s32(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s32(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s32 a = *src1; + const npyv_lanetype_s32 b = *src2; + if (b == 0 || (a == NPY_MIN_INT32 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s32 a = *src1; + const npyv_lanetype_s32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_s32(char **args, npy_intp len) +{ + npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0]; + npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1]; + npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2]; + const npyv_s32 vscalar = npyv_setall_s32(scalar); + const npyv_s32 divisor = vsx4_divisor_s32(vscalar); + const int vstep = npyv_nlanes_s32; +#if 0 >= 1 /* remainder and divmod */ + const npyv_s32 vzero = npyv_zero_s32(); + npyv_b32 b_gt_zero = npyv_cmpgt_s32(vscalar, vzero); +#endif +#if 0 == 2 /* divmod */ + npyv_b32 warn = npyv_cvt_b32_s32(npyv_zero_s32()); + const npyv_s32 vmin = npyv_setall_s32(NPY_MIN_INT32); + const npyv_s32 vneg_one = npyv_setall_s32(-1); + npyv_b32 bneg_one = npyv_cmpeq_s32(vscalar, vneg_one); + npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s32 a = npyv_load_s32(src1); +#if 0 <= 1 /* fmod and remainder */ + npyv_s32 rem = vsx4_mod_scalar_s32(a, divisor); +#else /* divmod */ + npyv_s32 quo = vsx4_div_scalar_s32(a, divisor); + npyv_s32 rem = npyv_sub_s32(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT32 && b == -1) + npyv_b32 amin = npyv_cmpeq_s32(a, vmin); + npyv_b32 overflow = npyv_and_s32(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s32 cvtozero 
= npyv_select_s32(overflow, vzero, vneg_one); + warn = npyv_or_s32(overflow, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b32 a_gt_zero = npyv_cmpgt_s32(a, vzero); + npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero); + npyv_b32 rem_zero = npyv_cmpeq_s32(rem, vzero); + npyv_b32 or = npyv_or_s32(ab_eq_cond, rem_zero); + npyv_s32 to_add = npyv_select_s32(or, vzero, vscalar); + rem = npyv_add_s32(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one); + quo = npyv_add_s32(quo, to_sub); + npyv_store_s32(dst1, npyv_and_s32(cvtozero, quo)); + npyv_store_s32(dst2, npyv_and_s32(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s32(dst1, rem); +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s32 a = *src1; + if (a == NPY_MIN_INT32 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s32 a = *src1; + *dst1 = a % scalar; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_remainder_contig_s32(char **args, npy_intp len) +{ + npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0]; + npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1]; + npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2]; + const npyv_s32 vzero = npyv_zero_s32(); + const int vstep = npyv_nlanes_s32; +#if 1 == 2 /* divmod */ + npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3]; + const npyv_s32 vneg_one = npyv_setall_s32(-1); + const npyv_s32 vmin = npyv_setall_s32(NPY_MIN_INT32); + npyv_b32 warn = npyv_cvt_b32_s32(npyv_zero_s32()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s32 a = npyv_load_s32(src1); + npyv_s32 b = npyv_load_s32(src2); +#if 1 <= 1 /* fmod and remainder */ + npyv_s32 rem = vsx4_mod_s32(a, b); +#else /* divmod */ + npyv_s32 quo = vsx4_div_s32(a, b); + npyv_s32 rem = npyv_sub_s32(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT32 && b == -1)) + npyv_b32 bzero = npyv_cmpeq_s32(b, vzero); + npyv_b32 amin = npyv_cmpeq_s32(a, vmin); + npyv_b32 bneg_one = npyv_cmpeq_s32(b, vneg_one); + npyv_b32 overflow = npyv_and_s32(bneg_one, amin); + npyv_b32 error = npyv_or_s32(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s32 cvtozero = npyv_select_s32(error, vzero, vneg_one); + warn = npyv_or_s32(error, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b32 a_gt_zero = npyv_cmpgt_s32(a, vzero); + npyv_b32 b_gt_zero = npyv_cmpgt_s32(b, vzero); + npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero); + npyv_b32 rem_zero = npyv_cmpeq_s32(rem, vzero); + npyv_b32 or = npyv_or_s32(ab_eq_cond, rem_zero); + npyv_s32 to_add = npyv_select_s32(or, vzero, b); + rem = npyv_add_s32(rem, to_add); 
+#endif +#if 1 == 2 /* divmod */ + npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one); + quo = npyv_add_s32(quo, to_sub); + npyv_store_s32(dst1, npyv_and_s32(cvtozero, quo)); + npyv_store_s32(dst2, npyv_and_s32(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s32(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s32 a = *src1; + const npyv_lanetype_s32 b = *src2; + if (b == 0 || (a == NPY_MIN_INT32 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s32 a = *src1; + const npyv_lanetype_s32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_remainder_by_scalar_contig_s32(char **args, npy_intp len) +{ + npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0]; + npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1]; + npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2]; + const npyv_s32 vscalar = npyv_setall_s32(scalar); + const npyv_s32 divisor = vsx4_divisor_s32(vscalar); + const int vstep = npyv_nlanes_s32; +#if 1 >= 1 /* remainder and divmod */ + const npyv_s32 vzero = npyv_zero_s32(); + npyv_b32 b_gt_zero = npyv_cmpgt_s32(vscalar, vzero); +#endif +#if 1 == 2 /* divmod */ + npyv_b32 warn = npyv_cvt_b32_s32(npyv_zero_s32()); + const npyv_s32 vmin = npyv_setall_s32(NPY_MIN_INT32); + const npyv_s32 vneg_one = npyv_setall_s32(-1); + npyv_b32 bneg_one = npyv_cmpeq_s32(vscalar, vneg_one); + npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s32 a = npyv_load_s32(src1); +#if 1 <= 1 /* fmod and remainder */ + npyv_s32 rem = vsx4_mod_scalar_s32(a, divisor); +#else /* divmod */ + npyv_s32 quo = vsx4_div_scalar_s32(a, divisor); + npyv_s32 rem = npyv_sub_s32(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT32 && b == -1) + npyv_b32 amin = npyv_cmpeq_s32(a, vmin); + npyv_b32 overflow = npyv_and_s32(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s32 cvtozero = npyv_select_s32(overflow, vzero, vneg_one); + warn = npyv_or_s32(overflow, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b32 a_gt_zero = npyv_cmpgt_s32(a, vzero); + npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero); + npyv_b32 rem_zero = npyv_cmpeq_s32(rem, vzero); + npyv_b32 or = npyv_or_s32(ab_eq_cond, rem_zero); + npyv_s32 to_add = npyv_select_s32(or, vzero, vscalar); + rem = npyv_add_s32(rem, to_add); +#endif +#if 1 == 2 /* divmod */ + npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one); + quo = npyv_add_s32(quo, to_sub); + npyv_store_s32(dst1, npyv_and_s32(cvtozero, quo)); + npyv_store_s32(dst2, npyv_and_s32(cvtozero, rem)); +#else /* fmod and 
remainder */ + npyv_store_s32(dst1, rem); +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s32 a = *src1; + if (a == NPY_MIN_INT32 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s32 a = *src1; + *dst1 = a % scalar; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_divmod_contig_s32(char **args, npy_intp len) +{ + npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0]; + npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1]; + npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2]; + const npyv_s32 vzero = npyv_zero_s32(); + const int vstep = npyv_nlanes_s32; +#if 2 == 2 /* divmod */ + npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3]; + const npyv_s32 vneg_one = npyv_setall_s32(-1); + const npyv_s32 vmin = npyv_setall_s32(NPY_MIN_INT32); + npyv_b32 warn = npyv_cvt_b32_s32(npyv_zero_s32()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s32 a = npyv_load_s32(src1); + npyv_s32 b = npyv_load_s32(src2); +#if 2 <= 1 /* fmod and remainder */ + npyv_s32 rem = vsx4_mod_s32(a, b); +#else /* divmod */ + npyv_s32 quo = vsx4_div_s32(a, b); + npyv_s32 rem = npyv_sub_s32(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT32 && b == -1)) + npyv_b32 bzero = npyv_cmpeq_s32(b, vzero); + npyv_b32 amin = npyv_cmpeq_s32(a, vmin); + npyv_b32 bneg_one = npyv_cmpeq_s32(b, vneg_one); + npyv_b32 overflow = npyv_and_s32(bneg_one, amin); + npyv_b32 error = npyv_or_s32(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s32 cvtozero = npyv_select_s32(error, vzero, vneg_one); + warn = npyv_or_s32(error, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b32 a_gt_zero = npyv_cmpgt_s32(a, vzero); + npyv_b32 b_gt_zero = npyv_cmpgt_s32(b, vzero); + npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero); + npyv_b32 rem_zero = npyv_cmpeq_s32(rem, vzero); + npyv_b32 or = npyv_or_s32(ab_eq_cond, rem_zero); + npyv_s32 to_add = npyv_select_s32(or, vzero, b); + rem = npyv_add_s32(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one); + quo = npyv_add_s32(quo, to_sub); + npyv_store_s32(dst1, npyv_and_s32(cvtozero, quo)); + npyv_store_s32(dst2, npyv_and_s32(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s32(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s32 a = *src1; + const npyv_lanetype_s32 b = *src2; + if (b == 0 || (a == NPY_MIN_INT32 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + 
*dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s32 a = *src1; + const npyv_lanetype_s32 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_s32(char **args, npy_intp len) +{ + npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0]; + npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1]; + npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2]; + const npyv_s32 vscalar = npyv_setall_s32(scalar); + const npyv_s32 divisor = vsx4_divisor_s32(vscalar); + const int vstep = npyv_nlanes_s32; +#if 2 >= 1 /* remainder and divmod */ + const npyv_s32 vzero = npyv_zero_s32(); + npyv_b32 b_gt_zero = npyv_cmpgt_s32(vscalar, vzero); +#endif +#if 2 == 2 /* divmod */ + npyv_b32 warn = npyv_cvt_b32_s32(npyv_zero_s32()); + const npyv_s32 vmin = npyv_setall_s32(NPY_MIN_INT32); + const npyv_s32 vneg_one = npyv_setall_s32(-1); + npyv_b32 bneg_one = npyv_cmpeq_s32(vscalar, vneg_one); + npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s32 a = npyv_load_s32(src1); +#if 2 <= 1 /* fmod and remainder */ + npyv_s32 rem = vsx4_mod_scalar_s32(a, divisor); +#else /* divmod */ + npyv_s32 quo = vsx4_div_scalar_s32(a, divisor); + npyv_s32 rem = npyv_sub_s32(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT32 && b == -1) + npyv_b32 amin = npyv_cmpeq_s32(a, vmin); + npyv_b32 overflow = npyv_and_s32(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s32 cvtozero = npyv_select_s32(overflow, vzero, vneg_one); + warn = npyv_or_s32(overflow, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b32 a_gt_zero = npyv_cmpgt_s32(a, vzero); + npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero); + npyv_b32 rem_zero = npyv_cmpeq_s32(rem, vzero); + npyv_b32 or = npyv_or_s32(ab_eq_cond, rem_zero); + npyv_s32 to_add = npyv_select_s32(or, vzero, vscalar); + rem = npyv_add_s32(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one); + quo = npyv_add_s32(quo, to_sub); + npyv_store_s32(dst1, npyv_and_s32(cvtozero, quo)); + npyv_store_s32(dst2, npyv_and_s32(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s32(dst1, rem); +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s32 a = *src1; + if (a == NPY_MIN_INT32 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s32 a = *src1; + *dst1 = a % scalar; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + 
npyv_cleanup(); +} + + +#line 277 +#line 281 +static NPY_INLINE void +vsx4_simd_fmod_contig_s64(char **args, npy_intp len) +{ + npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0]; + npyv_lanetype_s64 *src2 = (npyv_lanetype_s64 *) args[1]; + npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2]; + const npyv_s64 vzero = npyv_zero_s64(); + const int vstep = npyv_nlanes_s64; +#if 0 == 2 /* divmod */ + npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3]; + const npyv_s64 vneg_one = npyv_setall_s64(-1); + const npyv_s64 vmin = npyv_setall_s64(NPY_MIN_INT64); + npyv_b64 warn = npyv_cvt_b64_s64(npyv_zero_s64()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s64 a = npyv_load_s64(src1); + npyv_s64 b = npyv_load_s64(src2); +#if 0 <= 1 /* fmod and remainder */ + npyv_s64 rem = vsx4_mod_s64(a, b); +#else /* divmod */ + npyv_s64 quo = vsx4_div_s64(a, b); + npyv_s64 rem = npyv_sub_s64(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT64 && b == -1)) + npyv_b64 bzero = npyv_cmpeq_s64(b, vzero); + npyv_b64 amin = npyv_cmpeq_s64(a, vmin); + npyv_b64 bneg_one = npyv_cmpeq_s64(b, vneg_one); + npyv_b64 overflow = npyv_and_s64(bneg_one, amin); + npyv_b64 error = npyv_or_s64(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s64 cvtozero = npyv_select_s64(error, vzero, vneg_one); + warn = npyv_or_s64(error, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b64 a_gt_zero = npyv_cmpgt_s64(a, vzero); + npyv_b64 b_gt_zero = npyv_cmpgt_s64(b, vzero); + npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero); + npyv_b64 rem_zero = npyv_cmpeq_s64(rem, vzero); + npyv_b64 or = npyv_or_s64(ab_eq_cond, rem_zero); + npyv_s64 to_add = npyv_select_s64(or, vzero, b); + rem = npyv_add_s64(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one); + quo = npyv_add_s64(quo, to_sub); + npyv_store_s64(dst1, npyv_and_s64(cvtozero, quo)); + npyv_store_s64(dst2, npyv_and_s64(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s64(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s64 a = *src1; + const npyv_lanetype_s64 b = *src2; + if (b == 0 || (a == NPY_MIN_INT64 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s64 a = *src1; + const npyv_lanetype_s64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_fmod_by_scalar_contig_s64(char **args, npy_intp len) +{ + npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0]; + npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1]; + npyv_lanetype_s64 *dst1 = 
(npyv_lanetype_s64 *) args[2]; + const npyv_s64 vscalar = npyv_setall_s64(scalar); + const npyv_s64 divisor = vsx4_divisor_s64(vscalar); + const int vstep = npyv_nlanes_s64; +#if 0 >= 1 /* remainder and divmod */ + const npyv_s64 vzero = npyv_zero_s64(); + npyv_b64 b_gt_zero = npyv_cmpgt_s64(vscalar, vzero); +#endif +#if 0 == 2 /* divmod */ + npyv_b64 warn = npyv_cvt_b64_s64(npyv_zero_s64()); + const npyv_s64 vmin = npyv_setall_s64(NPY_MIN_INT64); + const npyv_s64 vneg_one = npyv_setall_s64(-1); + npyv_b64 bneg_one = npyv_cmpeq_s64(vscalar, vneg_one); + npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s64 a = npyv_load_s64(src1); +#if 0 <= 1 /* fmod and remainder */ + npyv_s64 rem = vsx4_mod_scalar_s64(a, divisor); +#else /* divmod */ + npyv_s64 quo = vsx4_div_scalar_s64(a, divisor); + npyv_s64 rem = npyv_sub_s64(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT64 && b == -1) + npyv_b64 amin = npyv_cmpeq_s64(a, vmin); + npyv_b64 overflow = npyv_and_s64(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s64 cvtozero = npyv_select_s64(overflow, vzero, vneg_one); + warn = npyv_or_s64(overflow, warn); +#endif +#if 0 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b64 a_gt_zero = npyv_cmpgt_s64(a, vzero); + npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero); + npyv_b64 rem_zero = npyv_cmpeq_s64(rem, vzero); + npyv_b64 or = npyv_or_s64(ab_eq_cond, rem_zero); + npyv_s64 to_add = npyv_select_s64(or, vzero, vscalar); + rem = npyv_add_s64(rem, to_add); +#endif +#if 0 == 2 /* divmod */ + npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one); + quo = npyv_add_s64(quo, to_sub); + npyv_store_s64(dst1, npyv_and_s64(cvtozero, quo)); + npyv_store_s64(dst2, npyv_and_s64(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s64(dst1, rem); +#endif + } + +#if 0 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s64 a = *src1; + if (a == NPY_MIN_INT64 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s64 a = *src1; + *dst1 = a % scalar; +#if 0 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_remainder_contig_s64(char **args, npy_intp len) +{ + npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0]; + npyv_lanetype_s64 *src2 = (npyv_lanetype_s64 *) args[1]; + npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2]; + const npyv_s64 vzero = npyv_zero_s64(); + const int vstep = npyv_nlanes_s64; +#if 1 == 2 /* divmod */ + npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3]; + const npyv_s64 vneg_one = npyv_setall_s64(-1); + const npyv_s64 vmin = npyv_setall_s64(NPY_MIN_INT64); + npyv_b64 warn = npyv_cvt_b64_s64(npyv_zero_s64()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + 
for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s64 a = npyv_load_s64(src1); + npyv_s64 b = npyv_load_s64(src2); +#if 1 <= 1 /* fmod and remainder */ + npyv_s64 rem = vsx4_mod_s64(a, b); +#else /* divmod */ + npyv_s64 quo = vsx4_div_s64(a, b); + npyv_s64 rem = npyv_sub_s64(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT64 && b == -1)) + npyv_b64 bzero = npyv_cmpeq_s64(b, vzero); + npyv_b64 amin = npyv_cmpeq_s64(a, vmin); + npyv_b64 bneg_one = npyv_cmpeq_s64(b, vneg_one); + npyv_b64 overflow = npyv_and_s64(bneg_one, amin); + npyv_b64 error = npyv_or_s64(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s64 cvtozero = npyv_select_s64(error, vzero, vneg_one); + warn = npyv_or_s64(error, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b64 a_gt_zero = npyv_cmpgt_s64(a, vzero); + npyv_b64 b_gt_zero = npyv_cmpgt_s64(b, vzero); + npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero); + npyv_b64 rem_zero = npyv_cmpeq_s64(rem, vzero); + npyv_b64 or = npyv_or_s64(ab_eq_cond, rem_zero); + npyv_s64 to_add = npyv_select_s64(or, vzero, b); + rem = npyv_add_s64(rem, to_add); +#endif +#if 1 == 2 /* divmod */ + npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one); + quo = npyv_add_s64(quo, to_sub); + npyv_store_s64(dst1, npyv_and_s64(cvtozero, quo)); + npyv_store_s64(dst2, npyv_and_s64(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s64(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s64 a = *src1; + const npyv_lanetype_s64 b = *src2; + if (b == 0 || (a == NPY_MIN_INT64 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s64 a = *src1; + const npyv_lanetype_s64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_remainder_by_scalar_contig_s64(char **args, npy_intp len) +{ + npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0]; + npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1]; + npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2]; + const npyv_s64 vscalar = npyv_setall_s64(scalar); + const npyv_s64 divisor = vsx4_divisor_s64(vscalar); + const int vstep = npyv_nlanes_s64; +#if 1 >= 1 /* remainder and divmod */ + const npyv_s64 vzero = npyv_zero_s64(); + npyv_b64 b_gt_zero = npyv_cmpgt_s64(vscalar, vzero); +#endif +#if 1 == 2 /* divmod */ + npyv_b64 warn = npyv_cvt_b64_s64(npyv_zero_s64()); + const npyv_s64 vmin = npyv_setall_s64(NPY_MIN_INT64); + const npyv_s64 vneg_one = npyv_setall_s64(-1); + npyv_b64 bneg_one = npyv_cmpeq_s64(vscalar, vneg_one); + npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= 
vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s64 a = npyv_load_s64(src1); +#if 1 <= 1 /* fmod and remainder */ + npyv_s64 rem = vsx4_mod_scalar_s64(a, divisor); +#else /* divmod */ + npyv_s64 quo = vsx4_div_scalar_s64(a, divisor); + npyv_s64 rem = npyv_sub_s64(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT64 && b == -1) + npyv_b64 amin = npyv_cmpeq_s64(a, vmin); + npyv_b64 overflow = npyv_and_s64(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s64 cvtozero = npyv_select_s64(overflow, vzero, vneg_one); + warn = npyv_or_s64(overflow, warn); +#endif +#if 1 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b64 a_gt_zero = npyv_cmpgt_s64(a, vzero); + npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero); + npyv_b64 rem_zero = npyv_cmpeq_s64(rem, vzero); + npyv_b64 or = npyv_or_s64(ab_eq_cond, rem_zero); + npyv_s64 to_add = npyv_select_s64(or, vzero, vscalar); + rem = npyv_add_s64(rem, to_add); +#endif +#if 1 == 2 /* divmod */ + npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one); + quo = npyv_add_s64(quo, to_sub); + npyv_store_s64(dst1, npyv_and_s64(cvtozero, quo)); + npyv_store_s64(dst2, npyv_and_s64(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s64(dst1, rem); +#endif + } + +#if 1 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s64 a = *src1; + if (a == NPY_MIN_INT64 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s64 a = *src1; + *dst1 = a % scalar; +#if 1 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + +#line 281 +static NPY_INLINE void +vsx4_simd_divmod_contig_s64(char **args, npy_intp len) +{ + npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0]; + npyv_lanetype_s64 *src2 = (npyv_lanetype_s64 *) args[1]; + npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2]; + const npyv_s64 vzero = npyv_zero_s64(); + const int vstep = npyv_nlanes_s64; +#if 2 == 2 /* divmod */ + npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3]; + const npyv_s64 vneg_one = npyv_setall_s64(-1); + const npyv_s64 vmin = npyv_setall_s64(NPY_MIN_INT64); + npyv_b64 warn = npyv_cvt_b64_s64(npyv_zero_s64()); + + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep, dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, + dst1 += vstep) { +#endif + npyv_s64 a = npyv_load_s64(src1); + npyv_s64 b = npyv_load_s64(src2); +#if 2 <= 1 /* fmod and remainder */ + npyv_s64 rem = vsx4_mod_s64(a, b); +#else /* divmod */ + npyv_s64 quo = vsx4_div_s64(a, b); + npyv_s64 rem = npyv_sub_s64(a, vec_mul(b, quo)); + // (b == 0 || (a == NPY_MIN_INT64 && b == -1)) + npyv_b64 bzero = npyv_cmpeq_s64(b, vzero); + npyv_b64 amin = npyv_cmpeq_s64(a, vmin); + npyv_b64 bneg_one = npyv_cmpeq_s64(b, vneg_one); + npyv_b64 overflow = npyv_and_s64(bneg_one, amin); + npyv_b64 error = npyv_or_s64(bzero, overflow); + // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 + npyv_s64 cvtozero = npyv_select_s64(error, vzero, vneg_one); + 
warn = npyv_or_s64(error, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b64 a_gt_zero = npyv_cmpgt_s64(a, vzero); + npyv_b64 b_gt_zero = npyv_cmpgt_s64(b, vzero); + npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero); + npyv_b64 rem_zero = npyv_cmpeq_s64(rem, vzero); + npyv_b64 or = npyv_or_s64(ab_eq_cond, rem_zero); + npyv_s64 to_add = npyv_select_s64(or, vzero, b); + rem = npyv_add_s64(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one); + quo = npyv_add_s64(quo, to_sub); + npyv_store_s64(dst1, npyv_and_s64(cvtozero, quo)); + npyv_store_s64(dst2, npyv_and_s64(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s64(dst1, rem); + if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { + npy_set_floatstatus_divbyzero(); + } +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { + const npyv_lanetype_s64 a = *src1; + const npyv_lanetype_s64 b = *src2; + if (b == 0 || (a == NPY_MIN_INT64 && b == -1)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / b; + *dst2 = a % b; + if (!((a > 0) == (b > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += b; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++src2, ++dst1) { + const npyv_lanetype_s64 a = *src1; + const npyv_lanetype_s64 b = *src2; + if (NPY_UNLIKELY(b == 0)) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + } else{ + *dst1 = a % b; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (b > 0) || *dst1 == 0)) { + *dst1 += b; + } +#endif + } + } +#endif + npyv_cleanup(); +} + +static NPY_INLINE void +vsx4_simd_divmod_by_scalar_contig_s64(char **args, npy_intp len) +{ + npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0]; + npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1]; + npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2]; + const npyv_s64 vscalar = npyv_setall_s64(scalar); + const npyv_s64 divisor = vsx4_divisor_s64(vscalar); + const int vstep = npyv_nlanes_s64; +#if 2 >= 1 /* remainder and divmod */ + const npyv_s64 vzero = npyv_zero_s64(); + npyv_b64 b_gt_zero = npyv_cmpgt_s64(vscalar, vzero); +#endif +#if 2 == 2 /* divmod */ + npyv_b64 warn = npyv_cvt_b64_s64(npyv_zero_s64()); + const npyv_s64 vmin = npyv_setall_s64(NPY_MIN_INT64); + const npyv_s64 vneg_one = npyv_setall_s64(-1); + npyv_b64 bneg_one = npyv_cmpeq_s64(vscalar, vneg_one); + npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3]; + + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, + dst2 += vstep) { +#else /* fmod and remainder */ + for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { +#endif + npyv_s64 a = npyv_load_s64(src1); +#if 2 <= 1 /* fmod and remainder */ + npyv_s64 rem = vsx4_mod_scalar_s64(a, divisor); +#else /* divmod */ + npyv_s64 quo = vsx4_div_scalar_s64(a, divisor); + npyv_s64 rem = npyv_sub_s64(a, vec_mul(vscalar, quo)); + // (a == NPY_MIN_INT64 && b == -1) + npyv_b64 amin = npyv_cmpeq_s64(a, vmin); + npyv_b64 overflow = npyv_and_s64(bneg_one, amin); + // in case of overflow, 'cvtozero' forces quo/rem to be 0 + npyv_s64 cvtozero = npyv_select_s64(overflow, vzero, vneg_one); + warn = npyv_or_s64(overflow, warn); +#endif +#if 2 >= 1 /* remainder and divmod */ + // handle mixed case the way Python does + // ((a > 0) == (b > 0) || rem == 0) + npyv_b64 a_gt_zero = npyv_cmpgt_s64(a, vzero); + 
npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero); + npyv_b64 rem_zero = npyv_cmpeq_s64(rem, vzero); + npyv_b64 or = npyv_or_s64(ab_eq_cond, rem_zero); + npyv_s64 to_add = npyv_select_s64(or, vzero, vscalar); + rem = npyv_add_s64(rem, to_add); +#endif +#if 2 == 2 /* divmod */ + npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one); + quo = npyv_add_s64(quo, to_sub); + npyv_store_s64(dst1, npyv_and_s64(cvtozero, quo)); + npyv_store_s64(dst2, npyv_and_s64(cvtozero, rem)); +#else /* fmod and remainder */ + npyv_store_s64(dst1, rem); +#endif + } + +#if 2 == 2 /* divmod */ + if (!vec_all_eq(warn, vzero)) { + npy_set_floatstatus_divbyzero(); + } + + for (; len > 0; --len, ++src1, ++dst1, ++dst2) { + const npyv_lanetype_s64 a = *src1; + if (a == NPY_MIN_INT64 && scalar == -1) { + npy_set_floatstatus_divbyzero(); + *dst1 = 0; + *dst2 = 0; + } + else { + *dst1 = a / scalar; + *dst2 = a % scalar; + if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { + *dst1 -= 1; + *dst2 += scalar; + } + } + } +#else /* fmod and remainder */ + for (; len > 0; --len, ++src1, ++dst1) { + const npyv_lanetype_s64 a = *src1; + *dst1 = a % scalar; +#if 2 == 1 /* remainder */ + if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { + *dst1 += scalar; + } +#endif + } +#endif + npyv_cleanup(); +} + + +#endif // NPY_SIMD && defined(NPY_HAVE_VSX4) + +/***************************************************************************** + ** Defining ufunc inner functions + *****************************************************************************/ + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_BYTE == 8 + #if 0 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_BYTE == 16 + #if 0 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_BYTE == 32 + #if 0 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_BYTE == 64 + #if 0 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) && + (*(npy_ubyte *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ubyte in1 = *(npy_ubyte *)ip1; + const npy_ubyte in2 = *(npy_ubyte *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ubyte *)op1) = 0; + } else{ + *((npy_ubyte *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if 
(IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) && + (*(npy_ubyte *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ubyte in1 = *(npy_ubyte *)ip1; + const npy_ubyte in2 = *(npy_ubyte *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ubyte *)op1) = 0; + } else{ +#if 0 + /* handle mixed case the way Python does */ + const npy_ubyte rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ubyte *)op1) = rem; + } + else { + *((npy_ubyte *)op1) = rem + in2; + } +#else + *((npy_ubyte *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) && + (*(npy_ubyte *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 0 + BINARY_LOOP_TWO_OUT { + const npy_ubyte in1 = *(npy_ubyte *)ip1; + const npy_ubyte in2 = *(npy_ubyte *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_UBYTE && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_ubyte *)op1) = 0; + *((npy_ubyte *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_ubyte quo = in1 / in2; + const npy_ubyte rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ubyte *)op1) = quo; + *((npy_ubyte *)op2) = rem; + } + else { + *((npy_ubyte *)op1) = quo - 1; + *((npy_ubyte *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_ubyte in1 = *(npy_ubyte *)ip1; + const npy_ubyte in2 = *(npy_ubyte *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ubyte *)op1) = 0; + *((npy_ubyte *)op2) = 0; + } + else { + *((npy_ubyte *)op1)= in1/in2; + *((npy_ubyte *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_SHORT == 8 + #if 0 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_SHORT == 16 + #if 0 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_SHORT == 32 + #if 0 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_SHORT == 64 + #if 0 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) && + (*(npy_ushort *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, 
dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ushort in1 = *(npy_ushort *)ip1; + const npy_ushort in2 = *(npy_ushort *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ushort *)op1) = 0; + } else{ + *((npy_ushort *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) && + (*(npy_ushort *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ushort in1 = *(npy_ushort *)ip1; + const npy_ushort in2 = *(npy_ushort *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ushort *)op1) = 0; + } else{ +#if 0 + /* handle mixed case the way Python does */ + const npy_ushort rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ushort *)op1) = rem; + } + else { + *((npy_ushort *)op1) = rem + in2; + } +#else + *((npy_ushort *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) && + (*(npy_ushort *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 0 + BINARY_LOOP_TWO_OUT { + const npy_ushort in1 = *(npy_ushort *)ip1; + const npy_ushort in2 = *(npy_ushort *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_USHORT && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_ushort *)op1) = 0; + *((npy_ushort *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_ushort quo = in1 / in2; + const npy_ushort rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ushort *)op1) = quo; + *((npy_ushort *)op2) = rem; + } + else { + *((npy_ushort *)op1) = quo - 1; + *((npy_ushort *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_ushort in1 = *(npy_ushort *)ip1; + const npy_ushort in2 = *(npy_ushort *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ushort *)op1) = 0; + *((npy_ushort *)op2) = 0; + } + else { + *((npy_ushort *)op1)= in1/in2; + *((npy_ushort *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_INT == 8 + #if 0 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_INT == 16 + #if 0 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_INT == 32 + #if 0 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define 
TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_INT == 64 + #if 0 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) && + (*(npy_uint *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_uint in1 = *(npy_uint *)ip1; + const npy_uint in2 = *(npy_uint *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_uint *)op1) = 0; + } else{ + *((npy_uint *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) && + (*(npy_uint *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_uint in1 = *(npy_uint *)ip1; + const npy_uint in2 = *(npy_uint *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_uint *)op1) = 0; + } else{ +#if 0 + /* handle mixed case the way Python does */ + const npy_uint rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_uint *)op1) = rem; + } + else { + *((npy_uint *)op1) = rem + in2; + } +#else + *((npy_uint *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) && + (*(npy_uint *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 0 + BINARY_LOOP_TWO_OUT { + const npy_uint in1 = *(npy_uint *)ip1; + const npy_uint in2 = *(npy_uint *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_UINT && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_uint *)op1) = 0; + *((npy_uint *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_uint quo = in1 / in2; + const npy_uint rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_uint *)op1) = quo; + *((npy_uint *)op2) = rem; + } + else { + *((npy_uint *)op1) = quo - 1; + *((npy_uint *)op2) = rem + in2; + } + } + } +#else + 
BINARY_LOOP_TWO_OUT { + const npy_uint in1 = *(npy_uint *)ip1; + const npy_uint in2 = *(npy_uint *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_uint *)op1) = 0; + *((npy_uint *)op2) = 0; + } + else { + *((npy_uint *)op1)= in1/in2; + *((npy_uint *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_LONG == 8 + #if 0 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_LONG == 16 + #if 0 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_LONG == 32 + #if 0 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_LONG == 64 + #if 0 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) && + (*(npy_ulong *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ulong in1 = *(npy_ulong *)ip1; + const npy_ulong in2 = *(npy_ulong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ulong *)op1) = 0; + } else{ + *((npy_ulong *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) && + (*(npy_ulong *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ulong in1 = *(npy_ulong *)ip1; + const npy_ulong in2 = *(npy_ulong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ulong *)op1) = 0; + } else{ +#if 0 + /* handle mixed case the way Python does */ + const npy_ulong rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ulong *)op1) = rem; + } + else { + *((npy_ulong *)op1) = rem + in2; + } +#else + *((npy_ulong *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) && + (*(npy_ulong *)args[1]) 
!= 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 0 + BINARY_LOOP_TWO_OUT { + const npy_ulong in1 = *(npy_ulong *)ip1; + const npy_ulong in2 = *(npy_ulong *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_ULONG && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_ulong *)op1) = 0; + *((npy_ulong *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_ulong quo = in1 / in2; + const npy_ulong rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ulong *)op1) = quo; + *((npy_ulong *)op2) = rem; + } + else { + *((npy_ulong *)op1) = quo - 1; + *((npy_ulong *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_ulong in1 = *(npy_ulong *)ip1; + const npy_ulong in2 = *(npy_ulong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ulong *)op1) = 0; + *((npy_ulong *)op2) = 0; + } + else { + *((npy_ulong *)op1)= in1/in2; + *((npy_ulong *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_LONGLONG == 8 + #if 0 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_LONGLONG == 16 + #if 0 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_LONGLONG == 32 + #if 0 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_LONGLONG == 64 + #if 0 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) && + (*(npy_ulonglong *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ulonglong in1 = *(npy_ulonglong *)ip1; + const npy_ulonglong in2 = *(npy_ulonglong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ulonglong *)op1) = 0; + } else{ + *((npy_ulonglong *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) && + (*(npy_ulonglong *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_ulonglong in1 = *(npy_ulonglong *)ip1; + const npy_ulonglong in2 = *(npy_ulonglong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ulonglong *)op1) = 0; + } 
else{ +#if 0 + /* handle mixed case the way Python does */ + const npy_ulonglong rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ulonglong *)op1) = rem; + } + else { + *((npy_ulonglong *)op1) = rem + in2; + } +#else + *((npy_ulonglong *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) && + (*(npy_ulonglong *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 0 + BINARY_LOOP_TWO_OUT { + const npy_ulonglong in1 = *(npy_ulonglong *)ip1; + const npy_ulonglong in2 = *(npy_ulonglong *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_ULONGLONG && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_ulonglong *)op1) = 0; + *((npy_ulonglong *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_ulonglong quo = in1 / in2; + const npy_ulonglong rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_ulonglong *)op1) = quo; + *((npy_ulonglong *)op2) = rem; + } + else { + *((npy_ulonglong *)op1) = quo - 1; + *((npy_ulonglong *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_ulonglong in1 = *(npy_ulonglong *)ip1; + const npy_ulonglong in2 = *(npy_ulonglong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_ulonglong *)op1) = 0; + *((npy_ulonglong *)op2) = 0; + } + else { + *((npy_ulonglong *)op1)= in1/in2; + *((npy_ulonglong *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_BYTE == 8 + #if 1 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_BYTE == 16 + #if 1 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_BYTE == 32 + #if 1 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_BYTE == 64 + #if 1 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) && + (*(npy_byte *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_byte in1 = *(npy_byte *)ip1; + const npy_byte in2 = *(npy_byte *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_byte *)op1) = 0; + } else{ + *((npy_byte *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT 
void NPY_CPU_DISPATCH_CURFX(BYTE_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) && + (*(npy_byte *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_byte in1 = *(npy_byte *)ip1; + const npy_byte in2 = *(npy_byte *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_byte *)op1) = 0; + } else{ +#if 1 + /* handle mixed case the way Python does */ + const npy_byte rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_byte *)op1) = rem; + } + else { + *((npy_byte *)op1) = rem + in2; + } +#else + *((npy_byte *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) && + (*(npy_byte *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 1 + BINARY_LOOP_TWO_OUT { + const npy_byte in1 = *(npy_byte *)ip1; + const npy_byte in2 = *(npy_byte *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_BYTE && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_byte *)op1) = 0; + *((npy_byte *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_byte quo = in1 / in2; + const npy_byte rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_byte *)op1) = quo; + *((npy_byte *)op2) = rem; + } + else { + *((npy_byte *)op1) = quo - 1; + *((npy_byte *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_byte in1 = *(npy_byte *)ip1; + const npy_byte in2 = *(npy_byte *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_byte *)op1) = 0; + *((npy_byte *)op2) = 0; + } + else { + *((npy_byte *)op1)= in1/in2; + *((npy_byte *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_SHORT == 8 + #if 1 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_SHORT == 16 + #if 1 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_SHORT == 32 + #if 1 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_SHORT == 64 + #if 1 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && 
defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) && + (*(npy_short *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_short in1 = *(npy_short *)ip1; + const npy_short in2 = *(npy_short *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_short *)op1) = 0; + } else{ + *((npy_short *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) && + (*(npy_short *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_short in1 = *(npy_short *)ip1; + const npy_short in2 = *(npy_short *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_short *)op1) = 0; + } else{ +#if 1 + /* handle mixed case the way Python does */ + const npy_short rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_short *)op1) = rem; + } + else { + *((npy_short *)op1) = rem + in2; + } +#else + *((npy_short *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) && + (*(npy_short *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 1 + BINARY_LOOP_TWO_OUT { + const npy_short in1 = *(npy_short *)ip1; + const npy_short in2 = *(npy_short *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_SHORT && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_short *)op1) = 0; + *((npy_short *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_short quo = in1 / in2; + const npy_short rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_short *)op1) = quo; + *((npy_short *)op2) = rem; + } + else { + *((npy_short *)op1) = quo - 1; + *((npy_short *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_short in1 = *(npy_short *)ip1; + const npy_short in2 = *(npy_short *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_short *)op1) = 0; + *((npy_short *)op2) = 0; + } + else { + *((npy_short *)op1)= in1/in2; + *((npy_short *)op2) = in1 % in2; + } + } +#endif +} + +#line 
494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_INT == 8 + #if 1 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_INT == 16 + #if 1 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_INT == 32 + #if 1 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_INT == 64 + #if 1 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) && + (*(npy_int *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_int in1 = *(npy_int *)ip1; + const npy_int in2 = *(npy_int *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_int *)op1) = 0; + } else{ + *((npy_int *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) && + (*(npy_int *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_int in1 = *(npy_int *)ip1; + const npy_int in2 = *(npy_int *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_int *)op1) = 0; + } else{ +#if 1 + /* handle mixed case the way Python does */ + const npy_int rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_int *)op1) = rem; + } + else { + *((npy_int *)op1) = rem + in2; + } +#else + *((npy_int *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) && + (*(npy_int *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 1 + BINARY_LOOP_TWO_OUT { + const npy_int in1 = *(npy_int *)ip1; + const npy_int in2 = *(npy_int *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_INT && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_int *)op1) = 0; + 
*((npy_int *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_int quo = in1 / in2; + const npy_int rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_int *)op1) = quo; + *((npy_int *)op2) = rem; + } + else { + *((npy_int *)op1) = quo - 1; + *((npy_int *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_int in1 = *(npy_int *)ip1; + const npy_int in2 = *(npy_int *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_int *)op1) = 0; + *((npy_int *)op2) = 0; + } + else { + *((npy_int *)op1)= in1/in2; + *((npy_int *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_LONG == 8 + #if 1 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_LONG == 16 + #if 1 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_LONG == 32 + #if 1 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_LONG == 64 + #if 1 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) && + (*(npy_long *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_long in1 = *(npy_long *)ip1; + const npy_long in2 = *(npy_long *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_long *)op1) = 0; + } else{ + *((npy_long *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) && + (*(npy_long *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_long in1 = *(npy_long *)ip1; + const npy_long in2 = *(npy_long *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_long *)op1) = 0; + } else{ +#if 1 + /* handle mixed case the way Python does */ + const npy_long rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_long *)op1) = rem; + } + else { + *((npy_long *)op1) = rem + in2; + } +#else + *((npy_long *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same 
size + if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) && + (*(npy_long *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 1 + BINARY_LOOP_TWO_OUT { + const npy_long in1 = *(npy_long *)ip1; + const npy_long in2 = *(npy_long *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_LONG && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_long *)op1) = 0; + *((npy_long *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_long quo = in1 / in2; + const npy_long rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_long *)op1) = quo; + *((npy_long *)op2) = rem; + } + else { + *((npy_long *)op1) = quo - 1; + *((npy_long *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_long in1 = *(npy_long *)ip1; + const npy_long in2 = *(npy_long *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_long *)op1) = 0; + *((npy_long *)op2) = 0; + } + else { + *((npy_long *)op1)= in1/in2; + *((npy_long *)op2) = in1 % in2; + } + } +#endif +} + +#line 494 +#undef TO_SIMD_SFX +#if 0 +#line 499 +#elif NPY_BITSOF_LONGLONG == 8 + #if 1 + #define TO_SIMD_SFX(X) X##_s8 + #else + #define TO_SIMD_SFX(X) X##_u8 + #endif + +#line 499 +#elif NPY_BITSOF_LONGLONG == 16 + #if 1 + #define TO_SIMD_SFX(X) X##_s16 + #else + #define TO_SIMD_SFX(X) X##_u16 + #endif + +#line 499 +#elif NPY_BITSOF_LONGLONG == 32 + #if 1 + #define TO_SIMD_SFX(X) X##_s32 + #else + #define TO_SIMD_SFX(X) X##_u32 + #endif + +#line 499 +#elif NPY_BITSOF_LONGLONG == 64 + #if 1 + #define TO_SIMD_SFX(X) X##_s64 + #else + #define TO_SIMD_SFX(X) X##_u64 + #endif + +#endif + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) && + (*(npy_longlong *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_longlong in1 = *(npy_longlong *)ip1; + const npy_longlong in2 = *(npy_longlong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_longlong *)op1) = 0; + } else{ + *((npy_longlong *)op1)= in1 % in2; + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_remainder) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) && + (*(npy_longlong *)args[1]) != 0) { + 
TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif + BINARY_LOOP { + const npy_longlong in1 = *(npy_longlong *)ip1; + const npy_longlong in2 = *(npy_longlong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_longlong *)op1) = 0; + } else{ +#if 1 + /* handle mixed case the way Python does */ + const npy_longlong rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_longlong *)op1) = rem; + } + else { + *((npy_longlong *)op1) = rem + in2; + } +#else + *((npy_longlong *)op1)= in1 % in2; +#endif + } + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_divmod) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) + // both arguments are arrays of the same size + if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) { + TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); + return; + } + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) && + (*(npy_longlong *)args[1]) != 0) { + TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); + return ; + } +#endif +#if 1 + BINARY_LOOP_TWO_OUT { + const npy_longlong in1 = *(npy_longlong *)ip1; + const npy_longlong in2 = *(npy_longlong *)ip2; + /* see FIXME note for divide above */ + if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_LONGLONG && in2 == -1))) { + npy_set_floatstatus_divbyzero(); + *((npy_longlong *)op1) = 0; + *((npy_longlong *)op2) = 0; + } + else { + /* handle mixed case the way Python does */ + const npy_longlong quo = in1 / in2; + const npy_longlong rem = in1 % in2; + if ((in1 > 0) == (in2 > 0) || rem == 0) { + *((npy_longlong *)op1) = quo; + *((npy_longlong *)op2) = rem; + } + else { + *((npy_longlong *)op1) = quo - 1; + *((npy_longlong *)op2) = rem + in2; + } + } + } +#else + BINARY_LOOP_TWO_OUT { + const npy_longlong in1 = *(npy_longlong *)ip1; + const npy_longlong in2 = *(npy_longlong *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((npy_longlong *)op1) = 0; + *((npy_longlong *)op2) = 0; + } + else { + *((npy_longlong *)op1)= in1/in2; + *((npy_longlong *)op2) = in1 % in2; + } + } +#endif +} + + From 5a252a91f5e8075ab8e375bee240d39f0e33d43f Mon Sep 17 00:00:00 2001 From: JessePires Date: Mon, 30 May 2022 20:58:33 -0300 Subject: [PATCH 02/15] ENH: adding dtype option to numpy.stack. See #20959 --- numpy/core/shape_base.py | 21 ++++++++++++++++----- numpy/core/shape_base.pyi | 21 ++++++++++++++++++++- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py index 1a4198c5f8e9..6544ce68ba8f 100644 --- a/numpy/core/shape_base.py +++ b/numpy/core/shape_base.py @@ -345,7 +345,8 @@ def hstack(tup): return _nx.concatenate(arrs, 1) -def _stack_dispatcher(arrays, axis=None, out=None): +def _stack_dispatcher(arrays, axis=None, out=None, *, + dtype=None, casting=None): arrays = _arrays_for_stack_dispatcher(arrays, stacklevel=6) if out is not None: # optimize for the typical case where only arrays is provided @@ -355,7 +356,7 @@ def _stack_dispatcher(arrays, axis=None, out=None): @array_function_dispatch(_stack_dispatcher) -def stack(arrays, axis=0, out=None): +def stack(arrays, axis=0, out=None, *, dtype=None, casting="same_kind"): """ Join a sequence of arrays along a new axis. 
@@ -378,6 +379,14 @@ def stack(arrays, axis=0, out=None): correct, matching that of what stack would have returned if no out argument were specified. + dtype : str or dtype + If provided, the destination array will have this dtype. Cannot be + provided together with `out`. + + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Defaults to 'same_kind'. + + Returns ------- stacked : ndarray @@ -430,7 +439,8 @@ def stack(arrays, axis=0, out=None): sl = (slice(None),) * axis + (_nx.newaxis,) expanded_arrays = [arr[sl] for arr in arrays] - return _nx.concatenate(expanded_arrays, axis=axis, out=out) + return _nx.concatenate(expanded_arrays, axis=axis, out=out, + dtype=dtype, casting=casting) # Internal functions to eliminate the overhead of repeated dispatch in one of @@ -438,7 +448,8 @@ def stack(arrays, axis=0, out=None): # Use getattr to protect against __array_function__ being disabled. _size = getattr(_from_nx.size, '__wrapped__', _from_nx.size) _ndim = getattr(_from_nx.ndim, '__wrapped__', _from_nx.ndim) -_concatenate = getattr(_from_nx.concatenate, '__wrapped__', _from_nx.concatenate) +_concatenate = getattr(_from_nx.concatenate, + '__wrapped__', _from_nx.concatenate) def _block_format_index(index): @@ -539,7 +550,7 @@ def _concatenate_shapes(shapes, axis): """Given array shapes, return the resulting shape and slices prefixes. These help in nested concatenation. - + Returns ------- shape: tuple of int diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi index cea355d443c0..b27db30ea95d 100644 --- a/numpy/core/shape_base.pyi +++ b/numpy/core/shape_base.pyi @@ -2,7 +2,7 @@ from collections.abc import Sequence from typing import TypeVar, overload, Any, SupportsIndex from numpy import generic -from numpy._typing import ArrayLike, NDArray, _ArrayLike +from numpy._typing import ArrayLike, NDArray, _ArrayLike, _CastingKind _SCT = TypeVar("_SCT", bound=generic) _ArrayType = TypeVar("_ArrayType", bound=NDArray[Any]) @@ -16,6 +16,7 @@ def atleast_1d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_1d(*arys: ArrayLike) -> list[NDArray[Any]]: ... + @overload def atleast_2d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... @overload @@ -23,6 +24,7 @@ def atleast_2d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_2d(*arys: ArrayLike) -> list[NDArray[Any]]: ... + @overload def atleast_3d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... @overload @@ -30,35 +32,52 @@ def atleast_3d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_3d(*arys: ArrayLike) -> list[NDArray[Any]]: ... + @overload def vstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... @overload def vstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... + @overload def hstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... @overload def hstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... + @overload def stack( arrays: Sequence[_ArrayLike[_SCT]], axis: SupportsIndex = ..., out: None = ..., + *, + dtype: None = ..., + casting: None | _CastingKind = ... ) -> NDArray[_SCT]: ... + + @overload def stack( arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: None = ..., + *, + dtype: None = ..., + casting: None | _CastingKind = ... ) -> NDArray[Any]: ... + + @overload def stack( arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: _ArrayType = ..., + *, + dtype: None = ..., + casting: None | _CastingKind = ... ) -> _ArrayType: ... + @overload def block(arrays: _ArrayLike[_SCT]) -> NDArray[_SCT]: ... 
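A short usage sketch of the new `dtype` and `casting` keywords introduced by this patch (the arrays are illustrative; the casting behaviour is simply that of the underlying `concatenate` call the new keywords are forwarded to):

    >>> import numpy as np
    >>> a = np.array([1.0, 2.0])
    >>> b = np.array([3.0, 4.0])
    >>> np.stack((a, b), dtype=np.float32).dtype   # same-kind cast, default casting
    dtype('float32')
    >>> # float -> int is not a 'same_kind' cast, so it must be requested explicitly
    >>> np.stack((a, b), dtype=np.int64, casting="unsafe").dtype
    dtype('int64')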
@overload From c6ec33e9d27188506c9eaebb0fde96dd899e9f71 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Mon, 30 May 2022 22:06:29 -0300 Subject: [PATCH 03/15] REV: removing auto-generated file loops_modulo.dispatch.c See numpy#20959 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d905f638f8d8..bfa7ca1128de 100644 --- a/.gitignore +++ b/.gitignore @@ -173,6 +173,7 @@ numpy/core/src/umath/struct_ufunc_test.c numpy/core/src/umath/test_rational.c numpy/core/src/umath/umath_tests.c numpy/core/src/umath/loops_utils.h +numpy/core/src/umath/loops_modulo.dispatch.c numpy/distutils/__config__.py numpy/linalg/umath_linalg.c doc/source/**/generated/ From 48101bc7e5129bfa2994da7dfda92f097a7ace16 Mon Sep 17 00:00:00 2001 From: Jhonatan Cunha Date: Mon, 30 May 2022 22:08:09 -0300 Subject: [PATCH 04/15] REV: removing auto-generated file loops_modulo.dispatch.c See numpy#20959 --- numpy/core/src/umath/loops_modulo.dispatch.c | 5714 ------------------ 1 file changed, 5714 deletions(-) delete mode 100644 numpy/core/src/umath/loops_modulo.dispatch.c diff --git a/numpy/core/src/umath/loops_modulo.dispatch.c b/numpy/core/src/umath/loops_modulo.dispatch.c deleted file mode 100644 index d29a0179560f..000000000000 --- a/numpy/core/src/umath/loops_modulo.dispatch.c +++ /dev/null @@ -1,5714 +0,0 @@ -#line 1 "numpy/core/src/umath/loops_modulo.dispatch.c.src" - -/* - ***************************************************************************** - ** This file was autogenerated from a template DO NOT EDIT!!!! ** - ** Changes should be made to the original source (.src) file ** - ***************************************************************************** - */ - -#line 1 -/*@targets - ** baseline vsx4 - **/ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -#if NPY_SIMD && defined(NPY_HAVE_VSX4) -typedef struct { - npyv_u32x2 hi; - npyv_u32x2 lo; -} vsx4_u32x4; - -typedef struct { - npyv_s32x2 hi; - npyv_s32x2 lo; -} vsx4_s32x4; - -// Converts 1 8-bit vector into 2 16-bit vectors -NPY_FINLINE npyv_s16x2 -vsx4_expand_s16_s8(npyv_s8 data) -{ - npyv_s16x2 r; - r.val[0] = vec_unpackh(data); - r.val[1] = vec_unpackl(data); - return r; -} - -// Converts 1 16-bit vector into 2 32-bit vectors -NPY_FINLINE npyv_s32x2 -vsx4_expand_s32_s16(npyv_s16 data) -{ - npyv_s32x2 r; - r.val[0] = vec_unpackh(data); - r.val[1] = vec_unpackl(data); - return r; -} - -#line 50 -// Converts 1 8-bit vector into 4 32-bit vectors -NPY_FINLINE vsx4_u32x4 -vsx4_expand_u32_u8(npyv_u8 data) -{ - vsx4_u32x4 r; - npyv_u16x2 expand = npyv_expand_u16_u8(data); - r.hi = npyv_expand_u32_u16(expand.val[0]); - r.lo = npyv_expand_u32_u16(expand.val[1]); - return r; -} - -#line 64 -/* - * Computes division/modulo of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division/modulo for data of 32 bits - * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division/modulo instruction, and then, convert the result back to - * npyv_u8. 
- */ -NPY_FINLINE npyv_u8 -vsx4_div_u8(npyv_u8 a, npyv_u8 b) -{ - vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); - vsx4_u32x4 b_expand = vsx4_expand_u32_u8(b); - npyv_u32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_u32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_u32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_u32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_u16 hi = vec_pack(v1, v2); - npyv_u16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_u8 -vsx4_div_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand) -{ - vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); - npyv_u32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_u32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_u32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_u32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_u16 hi = vec_pack(v1, v2); - npyv_u16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_u16 -vsx4_div_u16(npyv_u16 a, npyv_u16 b) -{ - npyv_u32x2 a_expand = npyv_expand_u32_u16(a); - npyv_u32x2 b_expand = npyv_expand_u32_u16(b); - npyv_u32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_u32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -NPY_FINLINE npyv_u16 -vsx4_div_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand) -{ - npyv_u32x2 a_expand = npyv_expand_u32_u16(a); - npyv_u32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_u32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_div_u32 vec_div -#define vsx4_div_u64 vec_div -#define vsx4_div_scalar_u32 vec_div -#define vsx4_div_scalar_u64 vec_div - -#line 64 -/* - * Computes division/modulo of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division/modulo for data of 32 bits - * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division/modulo instruction, and then, convert the result back to - * npyv_u8. 
- */ -NPY_FINLINE npyv_u8 -vsx4_mod_u8(npyv_u8 a, npyv_u8 b) -{ - vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); - vsx4_u32x4 b_expand = vsx4_expand_u32_u8(b); - npyv_u32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_u32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_u32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_u32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_u16 hi = vec_pack(v1, v2); - npyv_u16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_u8 -vsx4_mod_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand) -{ - vsx4_u32x4 a_expand = vsx4_expand_u32_u8(a); - npyv_u32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_u32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_u32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_u32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_u16 hi = vec_pack(v1, v2); - npyv_u16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_u16 -vsx4_mod_u16(npyv_u16 a, npyv_u16 b) -{ - npyv_u32x2 a_expand = npyv_expand_u32_u16(a); - npyv_u32x2 b_expand = npyv_expand_u32_u16(b); - npyv_u32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); - npyv_u32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -NPY_FINLINE npyv_u16 -vsx4_mod_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand) -{ - npyv_u32x2 a_expand = npyv_expand_u32_u16(a); - npyv_u32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); - npyv_u32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_mod_u32 vec_mod -#define vsx4_mod_u64 vec_mod -#define vsx4_mod_scalar_u32 vec_mod -#define vsx4_mod_scalar_u64 vec_mod - - -#line 50 -// Converts 1 8-bit vector into 4 32-bit vectors -NPY_FINLINE vsx4_s32x4 -vsx4_expand_s32_s8(npyv_s8 data) -{ - vsx4_s32x4 r; - npyv_s16x2 expand = vsx4_expand_s16_s8(data); - r.hi = vsx4_expand_s32_s16(expand.val[0]); - r.lo = vsx4_expand_s32_s16(expand.val[1]); - return r; -} - -#line 64 -/* - * Computes division/modulo of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division/modulo for data of 32 bits - * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division/modulo instruction, and then, convert the result back to - * npyv_u8. 
- */ -NPY_FINLINE npyv_s8 -vsx4_div_s8(npyv_s8 a, npyv_s8 b) -{ - vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); - vsx4_s32x4 b_expand = vsx4_expand_s32_s8(b); - npyv_s32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_s32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_s32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_s32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_s16 hi = vec_pack(v1, v2); - npyv_s16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_s8 -vsx4_div_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand) -{ - vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); - npyv_s32 v1 = vec_div(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_s32 v2 = vec_div(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_s32 v3 = vec_div(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_s32 v4 = vec_div(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_s16 hi = vec_pack(v1, v2); - npyv_s16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_s16 -vsx4_div_s16(npyv_s16 a, npyv_s16 b) -{ - npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); - npyv_s32x2 b_expand = vsx4_expand_s32_s16(b); - npyv_s32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_s32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -NPY_FINLINE npyv_s16 -vsx4_div_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand) -{ - npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); - npyv_s32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_s32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_div_s32 vec_div -#define vsx4_div_s64 vec_div -#define vsx4_div_scalar_s32 vec_div -#define vsx4_div_scalar_s64 vec_div - -#line 64 -/* - * Computes division/modulo of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division/modulo for data of 32 bits - * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division/modulo instruction, and then, convert the result back to - * npyv_u8. 
- */ -NPY_FINLINE npyv_s8 -vsx4_mod_s8(npyv_s8 a, npyv_s8 b) -{ - vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); - vsx4_s32x4 b_expand = vsx4_expand_s32_s8(b); - npyv_s32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_s32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_s32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_s32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_s16 hi = vec_pack(v1, v2); - npyv_s16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_s8 -vsx4_mod_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand) -{ - vsx4_s32x4 a_expand = vsx4_expand_s32_s8(a); - npyv_s32 v1 = vec_mod(a_expand.hi.val[0], b_expand.hi.val[0]); - npyv_s32 v2 = vec_mod(a_expand.hi.val[1], b_expand.hi.val[1]); - npyv_s32 v3 = vec_mod(a_expand.lo.val[0], b_expand.lo.val[0]); - npyv_s32 v4 = vec_mod(a_expand.lo.val[1], b_expand.lo.val[1]); - npyv_s16 hi = vec_pack(v1, v2); - npyv_s16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_s16 -vsx4_mod_s16(npyv_s16 a, npyv_s16 b) -{ - npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); - npyv_s32x2 b_expand = vsx4_expand_s32_s16(b); - npyv_s32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); - npyv_s32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -NPY_FINLINE npyv_s16 -vsx4_mod_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand) -{ - npyv_s32x2 a_expand = vsx4_expand_s32_s16(a); - npyv_s32 v1 = vec_mod(a_expand.val[0], b_expand.val[0]); - npyv_s32 v2 = vec_mod(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_mod_s32 vec_mod -#define vsx4_mod_s64 vec_mod -#define vsx4_mod_scalar_s32 vec_mod -#define vsx4_mod_scalar_s64 vec_mod - - - -#line 131 -// Generates the divisor for the division/modulo operations -NPY_FINLINE vsx4_u32x4 -vsx4_divisor_u8(const npyv_u8 vscalar) -{ - return vsx4_expand_u32_u8(vscalar); -} - -#line 131 -// Generates the divisor for the division/modulo operations -NPY_FINLINE npyv_u32x2 -vsx4_divisor_u16(const npyv_u16 vscalar) -{ - return npyv_expand_u32_u16(vscalar); -} - -#line 131 -// Generates the divisor for the division/modulo operations -NPY_FINLINE vsx4_s32x4 -vsx4_divisor_s8(const npyv_s8 vscalar) -{ - return vsx4_expand_s32_s8(vscalar); -} - -#line 131 -// Generates the divisor for the division/modulo operations -NPY_FINLINE npyv_s32x2 -vsx4_divisor_s16(const npyv_s16 vscalar) -{ - return vsx4_expand_s32_s16(vscalar); -} - - -#line 142 -NPY_FINLINE npyv_u32 -vsx4_divisor_u32(const npyv_u32 vscalar) -{ - return vscalar; -} - -#line 142 -NPY_FINLINE npyv_u64 -vsx4_divisor_u64(const npyv_u64 vscalar) -{ - return vscalar; -} - -#line 142 -NPY_FINLINE npyv_s32 -vsx4_divisor_s32(const npyv_s32 vscalar) -{ - return vscalar; -} - -#line 142 -NPY_FINLINE npyv_s64 -vsx4_divisor_s64(const npyv_s64 vscalar) -{ - return vscalar; -} - - -#line 155 -#line 159 -static NPY_INLINE void -vsx4_simd_fmod_contig_u8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 vzero = npyv_zero_u8(); - const int vstep = npyv_nlanes_u8; -#if 0 == 2 /* divmod */ - npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; - const npyv_u8 vneg_one = npyv_setall_u8(-1); - npyv_b8 warn = npyv_cvt_b8_u8(npyv_zero_u8()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 
b = npyv_load_u8(src2); - npyv_u8 quo = vsx4_div_u8(a, b); - npyv_u8 rem = npyv_sub_u8(a, vec_mul(b, quo)); - npyv_b8 bzero = npyv_cmpeq_u8(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u8 cvtozero = npyv_select_u8(bzero, vzero, vneg_one); - warn = npyv_or_u8(bzero, warn); - npyv_store_u8(dst1, quo); - npyv_store_u8(dst2, npyv_and_u8(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u8 a = *src1; - const npyv_lanetype_u8 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 b = npyv_load_u8(src2); - npyv_u8 c = vsx4_mod_u8(a, b); - npyv_store_u8(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u8 a = *src1; - const npyv_lanetype_u8 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_fmod_by_scalar_contig_u8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; - const int vstep = npyv_nlanes_u8; - const npyv_u8 vscalar = npyv_setall_u8(scalar); - const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar); -#if 0 == 2 /* divmod */ - npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 quo = vsx4_div_scalar_u8(a, divisor); - npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo)); - npyv_store_u8(dst1, quo); - npyv_store_u8(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u8 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 c = vsx4_mod_scalar_u8(a, divisor); - npyv_store_u8(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u8 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_remainder_contig_u8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 vzero = npyv_zero_u8(); - const int vstep = npyv_nlanes_u8; -#if 1 == 2 /* divmod */ - npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; - const npyv_u8 vneg_one = npyv_setall_u8(-1); - npyv_b8 warn = npyv_cvt_b8_u8(npyv_zero_u8()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 b = npyv_load_u8(src2); - npyv_u8 quo = vsx4_div_u8(a, b); - npyv_u8 rem = npyv_sub_u8(a, vec_mul(b, quo)); - npyv_b8 bzero = npyv_cmpeq_u8(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u8 cvtozero = npyv_select_u8(bzero, vzero, vneg_one); - warn = 
npyv_or_u8(bzero, warn); - npyv_store_u8(dst1, quo); - npyv_store_u8(dst2, npyv_and_u8(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u8 a = *src1; - const npyv_lanetype_u8 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 b = npyv_load_u8(src2); - npyv_u8 c = vsx4_mod_u8(a, b); - npyv_store_u8(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u8 a = *src1; - const npyv_lanetype_u8 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_remainder_by_scalar_contig_u8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; - const int vstep = npyv_nlanes_u8; - const npyv_u8 vscalar = npyv_setall_u8(scalar); - const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar); -#if 1 == 2 /* divmod */ - npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 quo = vsx4_div_scalar_u8(a, divisor); - npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo)); - npyv_store_u8(dst1, quo); - npyv_store_u8(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u8 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 c = vsx4_mod_scalar_u8(a, divisor); - npyv_store_u8(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u8 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_divmod_contig_u8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; - const npyv_u8 vzero = npyv_zero_u8(); - const int vstep = npyv_nlanes_u8; -#if 2 == 2 /* divmod */ - npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; - const npyv_u8 vneg_one = npyv_setall_u8(-1); - npyv_b8 warn = npyv_cvt_b8_u8(npyv_zero_u8()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 b = npyv_load_u8(src2); - npyv_u8 quo = vsx4_div_u8(a, b); - npyv_u8 rem = npyv_sub_u8(a, vec_mul(b, quo)); - npyv_b8 bzero = npyv_cmpeq_u8(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u8 cvtozero = npyv_select_u8(bzero, vzero, vneg_one); - warn = npyv_or_u8(bzero, warn); - npyv_store_u8(dst1, quo); - npyv_store_u8(dst2, npyv_and_u8(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u8 a = *src1; - const 
npyv_lanetype_u8 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 b = npyv_load_u8(src2); - npyv_u8 c = vsx4_mod_u8(a, b); - npyv_store_u8(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u8 a = *src1; - const npyv_lanetype_u8 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_divmod_by_scalar_contig_u8(char **args, npy_intp len) -{ - npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; - npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; - npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; - const int vstep = npyv_nlanes_u8; - const npyv_u8 vscalar = npyv_setall_u8(scalar); - const vsx4_u32x4 divisor = vsx4_divisor_u8(vscalar); -#if 2 == 2 /* divmod */ - npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 quo = vsx4_div_scalar_u8(a, divisor); - npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo)); - npyv_store_u8(dst1, quo); - npyv_store_u8(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u8 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u8 a = npyv_load_u8(src1); - npyv_u8 c = vsx4_mod_scalar_u8(a, divisor); - npyv_store_u8(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u8 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - - -#line 155 -#line 159 -static NPY_INLINE void -vsx4_simd_fmod_contig_u16(char **args, npy_intp len) -{ - npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; - npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; - npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; - const npyv_u16 vzero = npyv_zero_u16(); - const int vstep = npyv_nlanes_u16; -#if 0 == 2 /* divmod */ - npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; - const npyv_u16 vneg_one = npyv_setall_u16(-1); - npyv_b16 warn = npyv_cvt_b16_u16(npyv_zero_u16()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 b = npyv_load_u16(src2); - npyv_u16 quo = vsx4_div_u16(a, b); - npyv_u16 rem = npyv_sub_u16(a, vec_mul(b, quo)); - npyv_b16 bzero = npyv_cmpeq_u16(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u16 cvtozero = npyv_select_u16(bzero, vzero, vneg_one); - warn = npyv_or_u16(bzero, warn); - npyv_store_u16(dst1, quo); - npyv_store_u16(dst2, npyv_and_u16(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u16 a = *src1; - const npyv_lanetype_u16 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, 
src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 b = npyv_load_u16(src2); - npyv_u16 c = vsx4_mod_u16(a, b); - npyv_store_u16(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u16 a = *src1; - const npyv_lanetype_u16 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_fmod_by_scalar_contig_u16(char **args, npy_intp len) -{ - npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; - npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; - npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; - const int vstep = npyv_nlanes_u16; - const npyv_u16 vscalar = npyv_setall_u16(scalar); - const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar); -#if 0 == 2 /* divmod */ - npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 quo = vsx4_div_scalar_u16(a, divisor); - npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo)); - npyv_store_u16(dst1, quo); - npyv_store_u16(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u16 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 c = vsx4_mod_scalar_u16(a, divisor); - npyv_store_u16(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u16 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_remainder_contig_u16(char **args, npy_intp len) -{ - npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; - npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; - npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; - const npyv_u16 vzero = npyv_zero_u16(); - const int vstep = npyv_nlanes_u16; -#if 1 == 2 /* divmod */ - npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; - const npyv_u16 vneg_one = npyv_setall_u16(-1); - npyv_b16 warn = npyv_cvt_b16_u16(npyv_zero_u16()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 b = npyv_load_u16(src2); - npyv_u16 quo = vsx4_div_u16(a, b); - npyv_u16 rem = npyv_sub_u16(a, vec_mul(b, quo)); - npyv_b16 bzero = npyv_cmpeq_u16(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u16 cvtozero = npyv_select_u16(bzero, vzero, vneg_one); - warn = npyv_or_u16(bzero, warn); - npyv_store_u16(dst1, quo); - npyv_store_u16(dst2, npyv_and_u16(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u16 a = *src1; - const npyv_lanetype_u16 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 b = npyv_load_u16(src2); - npyv_u16 c = vsx4_mod_u16(a, b); - npyv_store_u16(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, 
vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u16 a = *src1; - const npyv_lanetype_u16 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_remainder_by_scalar_contig_u16(char **args, npy_intp len) -{ - npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; - npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; - npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; - const int vstep = npyv_nlanes_u16; - const npyv_u16 vscalar = npyv_setall_u16(scalar); - const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar); -#if 1 == 2 /* divmod */ - npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 quo = vsx4_div_scalar_u16(a, divisor); - npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo)); - npyv_store_u16(dst1, quo); - npyv_store_u16(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u16 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 c = vsx4_mod_scalar_u16(a, divisor); - npyv_store_u16(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u16 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_divmod_contig_u16(char **args, npy_intp len) -{ - npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; - npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; - npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; - const npyv_u16 vzero = npyv_zero_u16(); - const int vstep = npyv_nlanes_u16; -#if 2 == 2 /* divmod */ - npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; - const npyv_u16 vneg_one = npyv_setall_u16(-1); - npyv_b16 warn = npyv_cvt_b16_u16(npyv_zero_u16()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 b = npyv_load_u16(src2); - npyv_u16 quo = vsx4_div_u16(a, b); - npyv_u16 rem = npyv_sub_u16(a, vec_mul(b, quo)); - npyv_b16 bzero = npyv_cmpeq_u16(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u16 cvtozero = npyv_select_u16(bzero, vzero, vneg_one); - warn = npyv_or_u16(bzero, warn); - npyv_store_u16(dst1, quo); - npyv_store_u16(dst2, npyv_and_u16(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u16 a = *src1; - const npyv_lanetype_u16 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 b = npyv_load_u16(src2); - npyv_u16 c = vsx4_mod_u16(a, b); - npyv_store_u16(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u16 a = *src1; - const npyv_lanetype_u16 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - 
npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_divmod_by_scalar_contig_u16(char **args, npy_intp len) -{ - npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; - npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; - npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; - const int vstep = npyv_nlanes_u16; - const npyv_u16 vscalar = npyv_setall_u16(scalar); - const npyv_u32x2 divisor = vsx4_divisor_u16(vscalar); -#if 2 == 2 /* divmod */ - npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 quo = vsx4_div_scalar_u16(a, divisor); - npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo)); - npyv_store_u16(dst1, quo); - npyv_store_u16(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u16 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u16 a = npyv_load_u16(src1); - npyv_u16 c = vsx4_mod_scalar_u16(a, divisor); - npyv_store_u16(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u16 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - - -#line 155 -#line 159 -static NPY_INLINE void -vsx4_simd_fmod_contig_u32(char **args, npy_intp len) -{ - npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; - npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; - npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; - const npyv_u32 vzero = npyv_zero_u32(); - const int vstep = npyv_nlanes_u32; -#if 0 == 2 /* divmod */ - npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; - const npyv_u32 vneg_one = npyv_setall_u32(-1); - npyv_b32 warn = npyv_cvt_b32_u32(npyv_zero_u32()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 b = npyv_load_u32(src2); - npyv_u32 quo = vsx4_div_u32(a, b); - npyv_u32 rem = npyv_sub_u32(a, vec_mul(b, quo)); - npyv_b32 bzero = npyv_cmpeq_u32(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u32 cvtozero = npyv_select_u32(bzero, vzero, vneg_one); - warn = npyv_or_u32(bzero, warn); - npyv_store_u32(dst1, quo); - npyv_store_u32(dst2, npyv_and_u32(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u32 a = *src1; - const npyv_lanetype_u32 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 b = npyv_load_u32(src2); - npyv_u32 c = vsx4_mod_u32(a, b); - npyv_store_u32(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u32 a = *src1; - const npyv_lanetype_u32 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_fmod_by_scalar_contig_u32(char **args, npy_intp len) -{ - 
npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; - npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; - npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; - const int vstep = npyv_nlanes_u32; - const npyv_u32 vscalar = npyv_setall_u32(scalar); - const npyv_u32 divisor = vsx4_divisor_u32(vscalar); -#if 0 == 2 /* divmod */ - npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 quo = vsx4_div_scalar_u32(a, divisor); - npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo)); - npyv_store_u32(dst1, quo); - npyv_store_u32(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u32 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 c = vsx4_mod_scalar_u32(a, divisor); - npyv_store_u32(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u32 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_remainder_contig_u32(char **args, npy_intp len) -{ - npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; - npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; - npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; - const npyv_u32 vzero = npyv_zero_u32(); - const int vstep = npyv_nlanes_u32; -#if 1 == 2 /* divmod */ - npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; - const npyv_u32 vneg_one = npyv_setall_u32(-1); - npyv_b32 warn = npyv_cvt_b32_u32(npyv_zero_u32()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 b = npyv_load_u32(src2); - npyv_u32 quo = vsx4_div_u32(a, b); - npyv_u32 rem = npyv_sub_u32(a, vec_mul(b, quo)); - npyv_b32 bzero = npyv_cmpeq_u32(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u32 cvtozero = npyv_select_u32(bzero, vzero, vneg_one); - warn = npyv_or_u32(bzero, warn); - npyv_store_u32(dst1, quo); - npyv_store_u32(dst2, npyv_and_u32(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u32 a = *src1; - const npyv_lanetype_u32 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 b = npyv_load_u32(src2); - npyv_u32 c = vsx4_mod_u32(a, b); - npyv_store_u32(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u32 a = *src1; - const npyv_lanetype_u32 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_remainder_by_scalar_contig_u32(char **args, npy_intp len) -{ - npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; - npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; - npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; - const int vstep = npyv_nlanes_u32; 
- const npyv_u32 vscalar = npyv_setall_u32(scalar); - const npyv_u32 divisor = vsx4_divisor_u32(vscalar); -#if 1 == 2 /* divmod */ - npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 quo = vsx4_div_scalar_u32(a, divisor); - npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo)); - npyv_store_u32(dst1, quo); - npyv_store_u32(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u32 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 c = vsx4_mod_scalar_u32(a, divisor); - npyv_store_u32(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u32 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_divmod_contig_u32(char **args, npy_intp len) -{ - npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; - npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; - npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; - const npyv_u32 vzero = npyv_zero_u32(); - const int vstep = npyv_nlanes_u32; -#if 2 == 2 /* divmod */ - npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; - const npyv_u32 vneg_one = npyv_setall_u32(-1); - npyv_b32 warn = npyv_cvt_b32_u32(npyv_zero_u32()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 b = npyv_load_u32(src2); - npyv_u32 quo = vsx4_div_u32(a, b); - npyv_u32 rem = npyv_sub_u32(a, vec_mul(b, quo)); - npyv_b32 bzero = npyv_cmpeq_u32(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u32 cvtozero = npyv_select_u32(bzero, vzero, vneg_one); - warn = npyv_or_u32(bzero, warn); - npyv_store_u32(dst1, quo); - npyv_store_u32(dst2, npyv_and_u32(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u32 a = *src1; - const npyv_lanetype_u32 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 b = npyv_load_u32(src2); - npyv_u32 c = vsx4_mod_u32(a, b); - npyv_store_u32(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u32 a = *src1; - const npyv_lanetype_u32 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_divmod_by_scalar_contig_u32(char **args, npy_intp len) -{ - npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; - npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; - npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; - const int vstep = npyv_nlanes_u32; - const npyv_u32 vscalar = npyv_setall_u32(scalar); - const npyv_u32 divisor = vsx4_divisor_u32(vscalar); -#if 2 == 2 /* divmod */ - npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; - - for (; len >= vstep; len 
-= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 quo = vsx4_div_scalar_u32(a, divisor); - npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo)); - npyv_store_u32(dst1, quo); - npyv_store_u32(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u32 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u32 a = npyv_load_u32(src1); - npyv_u32 c = vsx4_mod_scalar_u32(a, divisor); - npyv_store_u32(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u32 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - - -#line 155 -#line 159 -static NPY_INLINE void -vsx4_simd_fmod_contig_u64(char **args, npy_intp len) -{ - npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; - npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1]; - npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; - const npyv_u64 vzero = npyv_zero_u64(); - const int vstep = npyv_nlanes_u64; -#if 0 == 2 /* divmod */ - npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; - const npyv_u64 vneg_one = npyv_setall_u64(-1); - npyv_b64 warn = npyv_cvt_b64_u64(npyv_zero_u64()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 b = npyv_load_u64(src2); - npyv_u64 quo = vsx4_div_u64(a, b); - npyv_u64 rem = npyv_sub_u64(a, vec_mul(b, quo)); - npyv_b64 bzero = npyv_cmpeq_u64(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u64 cvtozero = npyv_select_u64(bzero, vzero, vneg_one); - warn = npyv_or_u64(bzero, warn); - npyv_store_u64(dst1, quo); - npyv_store_u64(dst2, npyv_and_u64(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u64 a = *src1; - const npyv_lanetype_u64 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 b = npyv_load_u64(src2); - npyv_u64 c = vsx4_mod_u64(a, b); - npyv_store_u64(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u64 a = *src1; - const npyv_lanetype_u64 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_fmod_by_scalar_contig_u64(char **args, npy_intp len) -{ - npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; - npyv_lanetype_u64 scalar = *(npyv_lanetype_u64 *) args[1]; - npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; - const int vstep = npyv_nlanes_u64; - const npyv_u64 vscalar = npyv_setall_u64(scalar); - const npyv_u64 divisor = vsx4_divisor_u64(vscalar); -#if 0 == 2 /* divmod */ - npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 quo = vsx4_div_scalar_u64(a, divisor); - npyv_u64 rem = npyv_sub_u64(a, vec_mul(vscalar, quo)); - 
npyv_store_u64(dst1, quo); - npyv_store_u64(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u64 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 c = vsx4_mod_scalar_u64(a, divisor); - npyv_store_u64(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u64 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_remainder_contig_u64(char **args, npy_intp len) -{ - npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; - npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1]; - npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; - const npyv_u64 vzero = npyv_zero_u64(); - const int vstep = npyv_nlanes_u64; -#if 1 == 2 /* divmod */ - npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; - const npyv_u64 vneg_one = npyv_setall_u64(-1); - npyv_b64 warn = npyv_cvt_b64_u64(npyv_zero_u64()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 b = npyv_load_u64(src2); - npyv_u64 quo = vsx4_div_u64(a, b); - npyv_u64 rem = npyv_sub_u64(a, vec_mul(b, quo)); - npyv_b64 bzero = npyv_cmpeq_u64(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u64 cvtozero = npyv_select_u64(bzero, vzero, vneg_one); - warn = npyv_or_u64(bzero, warn); - npyv_store_u64(dst1, quo); - npyv_store_u64(dst2, npyv_and_u64(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u64 a = *src1; - const npyv_lanetype_u64 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 b = npyv_load_u64(src2); - npyv_u64 c = vsx4_mod_u64(a, b); - npyv_store_u64(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u64 a = *src1; - const npyv_lanetype_u64 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_remainder_by_scalar_contig_u64(char **args, npy_intp len) -{ - npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; - npyv_lanetype_u64 scalar = *(npyv_lanetype_u64 *) args[1]; - npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; - const int vstep = npyv_nlanes_u64; - const npyv_u64 vscalar = npyv_setall_u64(scalar); - const npyv_u64 divisor = vsx4_divisor_u64(vscalar); -#if 1 == 2 /* divmod */ - npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 quo = vsx4_div_scalar_u64(a, divisor); - npyv_u64 rem = npyv_sub_u64(a, vec_mul(vscalar, quo)); - npyv_store_u64(dst1, quo); - npyv_store_u64(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u64 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod 
and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 c = vsx4_mod_scalar_u64(a, divisor); - npyv_store_u64(dst1, c); - } - - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_u64 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - -#line 159 -static NPY_INLINE void -vsx4_simd_divmod_contig_u64(char **args, npy_intp len) -{ - npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; - npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1]; - npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; - const npyv_u64 vzero = npyv_zero_u64(); - const int vstep = npyv_nlanes_u64; -#if 2 == 2 /* divmod */ - npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; - const npyv_u64 vneg_one = npyv_setall_u64(-1); - npyv_b64 warn = npyv_cvt_b64_u64(npyv_zero_u64()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 b = npyv_load_u64(src2); - npyv_u64 quo = vsx4_div_u64(a, b); - npyv_u64 rem = npyv_sub_u64(a, vec_mul(b, quo)); - npyv_b64 bzero = npyv_cmpeq_u64(b, vzero); - // when b is 0, 'cvtozero' forces the modulo to be 0 too - npyv_u64 cvtozero = npyv_select_u64(bzero, vzero, vneg_one); - warn = npyv_or_u64(bzero, warn); - npyv_store_u64(dst1, quo); - npyv_store_u64(dst2, npyv_and_u64(cvtozero, rem)); - } - - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_u64 a = *src1; - const npyv_lanetype_u64 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } else{ - *dst1 = a / b; - *dst2 = a % b; - } - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 b = npyv_load_u64(src2); - npyv_u64 c = vsx4_mod_u64(a, b); - npyv_store_u64(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_u64 a = *src1; - const npyv_lanetype_u64 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_divmod_by_scalar_contig_u64(char **args, npy_intp len) -{ - npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; - npyv_lanetype_u64 scalar = *(npyv_lanetype_u64 *) args[1]; - npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; - const int vstep = npyv_nlanes_u64; - const npyv_u64 vscalar = npyv_setall_u64(scalar); - const npyv_u64 divisor = vsx4_divisor_u64(vscalar); -#if 2 == 2 /* divmod */ - npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3]; - - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep, - dst2 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 quo = vsx4_div_scalar_u64(a, divisor); - npyv_u64 rem = npyv_sub_u64(a, vec_mul(vscalar, quo)); - npyv_store_u64(dst1, quo); - npyv_store_u64(dst2, rem); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_u64 a = *src1; - *dst1 = a / scalar; - *dst2 = a % scalar; - } -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { - npyv_u64 a = npyv_load_u64(src1); - npyv_u64 c = vsx4_mod_scalar_u64(a, divisor); - npyv_store_u64(dst1, c); - } - - for (; len > 
0; --len, ++src1, ++dst1) { - const npyv_lanetype_u64 a = *src1; - *dst1 = a % scalar; - } -#endif - npyv_cleanup(); -} - - - -#line 277 -#line 281 -static NPY_INLINE void -vsx4_simd_fmod_contig_s8(char **args, npy_intp len) -{ - npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; - npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1]; - npyv_lanetype_s8 *dst1 = (npyv_lanetype_s8 *) args[2]; - const npyv_s8 vzero = npyv_zero_s8(); - const int vstep = npyv_nlanes_s8; -#if 0 == 2 /* divmod */ - npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3]; - const npyv_s8 vneg_one = npyv_setall_s8(-1); - const npyv_s8 vmin = npyv_setall_s8(NPY_MIN_INT8); - npyv_b8 warn = npyv_cvt_b8_s8(npyv_zero_s8()); - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep, dst2 += vstep) { -#else /* fmod and remainder */ - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { -#endif - npyv_s8 a = npyv_load_s8(src1); - npyv_s8 b = npyv_load_s8(src2); -#if 0 <= 1 /* fmod and remainder */ - npyv_s8 rem = vsx4_mod_s8(a, b); -#else /* divmod */ - npyv_s8 quo = vsx4_div_s8(a, b); - npyv_s8 rem = npyv_sub_s8(a, vec_mul(b, quo)); - // (b == 0 || (a == NPY_MIN_INT8 && b == -1)) - npyv_b8 bzero = npyv_cmpeq_s8(b, vzero); - npyv_b8 amin = npyv_cmpeq_s8(a, vmin); - npyv_b8 bneg_one = npyv_cmpeq_s8(b, vneg_one); - npyv_b8 overflow = npyv_and_s8(bneg_one, amin); - npyv_b8 error = npyv_or_s8(bzero, overflow); - // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0 - npyv_s8 cvtozero = npyv_select_s8(error, vzero, vneg_one); - warn = npyv_or_s8(error, warn); -#endif -#if 0 >= 1 /* remainder and divmod */ - // handle mixed case the way Python does - // ((a > 0) == (b > 0) || rem == 0) - npyv_b8 a_gt_zero = npyv_cmpgt_s8(a, vzero); - npyv_b8 b_gt_zero = npyv_cmpgt_s8(b, vzero); - npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero); - npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero); - npyv_b8 or = npyv_or_s8(ab_eq_cond, rem_zero); - npyv_s8 to_add = npyv_select_s8(or, vzero, b); - rem = npyv_add_s8(rem, to_add); -#endif -#if 0 == 2 /* divmod */ - npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one); - quo = npyv_add_s8(quo, to_sub); - npyv_store_s8(dst1, npyv_and_s8(cvtozero, quo)); - npyv_store_s8(dst2, npyv_and_s8(cvtozero, rem)); -#else /* fmod and remainder */ - npyv_store_s8(dst1, rem); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } -#endif - } - -#if 0 == 2 /* divmod */ - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) { - const npyv_lanetype_s8 a = *src1; - const npyv_lanetype_s8 b = *src2; - if (b == 0 || (a == NPY_MIN_INT8 && b == -1)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } - else { - *dst1 = a / b; - *dst2 = a % b; - if (!((a > 0) == (b > 0) || *dst2 == 0)) { - *dst1 -= 1; - *dst2 += b; - } - } - } -#else /* fmod and remainder */ - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_s8 a = *src1; - const npyv_lanetype_s8 b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a % b; -#if 0 == 1 /* remainder */ - if (!((a > 0) == (b > 0) || *dst1 == 0)) { - *dst1 += b; - } -#endif - } - } -#endif - npyv_cleanup(); -} - -static NPY_INLINE void -vsx4_simd_fmod_by_scalar_contig_s8(char **args, npy_intp len) -{ - npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0]; - npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1]; - 
[Remaining body of the numpy/core/src/umath/loops_modulo.dispatch.c hunk: the template expansions of vsx4_simd_fmod_contig_<T>, vsx4_simd_fmod_by_scalar_contig_<T>, vsx4_simd_remainder_contig_<T>, vsx4_simd_remainder_by_scalar_contig_<T>, vsx4_simd_divmod_contig_<T> and vsx4_simd_divmod_by_scalar_contig_<T> for each lane type T in {s8, s16, s32, s64}. Every expansion follows the same shape: a vectorized main loop stepping npyv_nlanes_<T> elements at a time; in the remainder and divmod variants, a Python-style sign correction gated on ((a > 0) == (b > 0) || rem == 0); in the two-operand fmod and remainder variants, an in-loop vec_any_eq(b, vzero) check that raises npy_set_floatstatus_divbyzero(); in the divmod variants, a cvtozero select mask that zeroes the quotient and remainder in lanes where b == 0 or a == NPY_MIN_INT<N> with b == -1, folded into a warn mask that raises the same status after the loop; a scalar tail loop with identical semantics; and a closing npyv_cleanup(). The hunk continues into vsx4_simd_divmod_by_scalar_contig_s64 past this point.]
npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero); - npyv_b64 rem_zero = npyv_cmpeq_s64(rem, vzero); - npyv_b64 or = npyv_or_s64(ab_eq_cond, rem_zero); - npyv_s64 to_add = npyv_select_s64(or, vzero, vscalar); - rem = npyv_add_s64(rem, to_add); -#endif -#if 2 == 2 /* divmod */ - npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one); - quo = npyv_add_s64(quo, to_sub); - npyv_store_s64(dst1, npyv_and_s64(cvtozero, quo)); - npyv_store_s64(dst2, npyv_and_s64(cvtozero, rem)); -#else /* fmod and remainder */ - npyv_store_s64(dst1, rem); -#endif - } - -#if 2 == 2 /* divmod */ - if (!vec_all_eq(warn, vzero)) { - npy_set_floatstatus_divbyzero(); - } - - for (; len > 0; --len, ++src1, ++dst1, ++dst2) { - const npyv_lanetype_s64 a = *src1; - if (a == NPY_MIN_INT64 && scalar == -1) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - *dst2 = 0; - } - else { - *dst1 = a / scalar; - *dst2 = a % scalar; - if (!((a > 0) == (scalar > 0) || *dst2 == 0)) { - *dst1 -= 1; - *dst2 += scalar; - } - } - } -#else /* fmod and remainder */ - for (; len > 0; --len, ++src1, ++dst1) { - const npyv_lanetype_s64 a = *src1; - *dst1 = a % scalar; -#if 2 == 1 /* remainder */ - if (!((a > 0) == (scalar > 0) || *dst1 == 0)) { - *dst1 += scalar; - } -#endif - } -#endif - npyv_cleanup(); -} - - -#endif // NPY_SIMD && defined(NPY_HAVE_VSX4) - -/***************************************************************************** - ** Defining ufunc inner functions - *****************************************************************************/ - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_BYTE == 8 - #if 0 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_BYTE == 16 - #if 0 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_BYTE == 32 - #if 0 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_BYTE == 64 - #if 0 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) && - (*(npy_ubyte *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ubyte in1 = *(npy_ubyte *)ip1; - const npy_ubyte in2 = *(npy_ubyte *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ubyte *)op1) = 0; - } else{ - *((npy_ubyte *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if 
(IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) && - (*(npy_ubyte *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ubyte in1 = *(npy_ubyte *)ip1; - const npy_ubyte in2 = *(npy_ubyte *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ubyte *)op1) = 0; - } else{ -#if 0 - /* handle mixed case the way Python does */ - const npy_ubyte rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ubyte *)op1) = rem; - } - else { - *((npy_ubyte *)op1) = rem + in2; - } -#else - *((npy_ubyte *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) && - (*(npy_ubyte *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 0 - BINARY_LOOP_TWO_OUT { - const npy_ubyte in1 = *(npy_ubyte *)ip1; - const npy_ubyte in2 = *(npy_ubyte *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_UBYTE && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_ubyte *)op1) = 0; - *((npy_ubyte *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_ubyte quo = in1 / in2; - const npy_ubyte rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ubyte *)op1) = quo; - *((npy_ubyte *)op2) = rem; - } - else { - *((npy_ubyte *)op1) = quo - 1; - *((npy_ubyte *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_ubyte in1 = *(npy_ubyte *)ip1; - const npy_ubyte in2 = *(npy_ubyte *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ubyte *)op1) = 0; - *((npy_ubyte *)op2) = 0; - } - else { - *((npy_ubyte *)op1)= in1/in2; - *((npy_ubyte *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_SHORT == 8 - #if 0 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_SHORT == 16 - #if 0 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_SHORT == 32 - #if 0 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_SHORT == 64 - #if 0 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) && - (*(npy_ushort *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, 
dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ushort in1 = *(npy_ushort *)ip1; - const npy_ushort in2 = *(npy_ushort *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ushort *)op1) = 0; - } else{ - *((npy_ushort *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) && - (*(npy_ushort *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ushort in1 = *(npy_ushort *)ip1; - const npy_ushort in2 = *(npy_ushort *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ushort *)op1) = 0; - } else{ -#if 0 - /* handle mixed case the way Python does */ - const npy_ushort rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ushort *)op1) = rem; - } - else { - *((npy_ushort *)op1) = rem + in2; - } -#else - *((npy_ushort *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) && - (*(npy_ushort *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 0 - BINARY_LOOP_TWO_OUT { - const npy_ushort in1 = *(npy_ushort *)ip1; - const npy_ushort in2 = *(npy_ushort *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_USHORT && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_ushort *)op1) = 0; - *((npy_ushort *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_ushort quo = in1 / in2; - const npy_ushort rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ushort *)op1) = quo; - *((npy_ushort *)op2) = rem; - } - else { - *((npy_ushort *)op1) = quo - 1; - *((npy_ushort *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_ushort in1 = *(npy_ushort *)ip1; - const npy_ushort in2 = *(npy_ushort *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ushort *)op1) = 0; - *((npy_ushort *)op2) = 0; - } - else { - *((npy_ushort *)op1)= in1/in2; - *((npy_ushort *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_INT == 8 - #if 0 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_INT == 16 - #if 0 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_INT == 32 - #if 0 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define 
TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_INT == 64 - #if 0 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) && - (*(npy_uint *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_uint in1 = *(npy_uint *)ip1; - const npy_uint in2 = *(npy_uint *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_uint *)op1) = 0; - } else{ - *((npy_uint *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) && - (*(npy_uint *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_uint in1 = *(npy_uint *)ip1; - const npy_uint in2 = *(npy_uint *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_uint *)op1) = 0; - } else{ -#if 0 - /* handle mixed case the way Python does */ - const npy_uint rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_uint *)op1) = rem; - } - else { - *((npy_uint *)op1) = rem + in2; - } -#else - *((npy_uint *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) && - (*(npy_uint *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 0 - BINARY_LOOP_TWO_OUT { - const npy_uint in1 = *(npy_uint *)ip1; - const npy_uint in2 = *(npy_uint *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_UINT && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_uint *)op1) = 0; - *((npy_uint *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_uint quo = in1 / in2; - const npy_uint rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_uint *)op1) = quo; - *((npy_uint *)op2) = rem; - } - else { - *((npy_uint *)op1) = quo - 1; - *((npy_uint *)op2) = rem + in2; - } - } - } -#else - 
BINARY_LOOP_TWO_OUT { - const npy_uint in1 = *(npy_uint *)ip1; - const npy_uint in2 = *(npy_uint *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_uint *)op1) = 0; - *((npy_uint *)op2) = 0; - } - else { - *((npy_uint *)op1)= in1/in2; - *((npy_uint *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_LONG == 8 - #if 0 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_LONG == 16 - #if 0 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_LONG == 32 - #if 0 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_LONG == 64 - #if 0 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) && - (*(npy_ulong *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ulong in1 = *(npy_ulong *)ip1; - const npy_ulong in2 = *(npy_ulong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ulong *)op1) = 0; - } else{ - *((npy_ulong *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) && - (*(npy_ulong *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ulong in1 = *(npy_ulong *)ip1; - const npy_ulong in2 = *(npy_ulong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ulong *)op1) = 0; - } else{ -#if 0 - /* handle mixed case the way Python does */ - const npy_ulong rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ulong *)op1) = rem; - } - else { - *((npy_ulong *)op1) = rem + in2; - } -#else - *((npy_ulong *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) && - (*(npy_ulong *)args[1]) 
!= 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 0 - BINARY_LOOP_TWO_OUT { - const npy_ulong in1 = *(npy_ulong *)ip1; - const npy_ulong in2 = *(npy_ulong *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_ULONG && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_ulong *)op1) = 0; - *((npy_ulong *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_ulong quo = in1 / in2; - const npy_ulong rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ulong *)op1) = quo; - *((npy_ulong *)op2) = rem; - } - else { - *((npy_ulong *)op1) = quo - 1; - *((npy_ulong *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_ulong in1 = *(npy_ulong *)ip1; - const npy_ulong in2 = *(npy_ulong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ulong *)op1) = 0; - *((npy_ulong *)op2) = 0; - } - else { - *((npy_ulong *)op1)= in1/in2; - *((npy_ulong *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_LONGLONG == 8 - #if 0 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_LONGLONG == 16 - #if 0 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_LONGLONG == 32 - #if 0 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_LONGLONG == 64 - #if 0 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) && - (*(npy_ulonglong *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ulonglong in1 = *(npy_ulonglong *)ip1; - const npy_ulonglong in2 = *(npy_ulonglong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ulonglong *)op1) = 0; - } else{ - *((npy_ulonglong *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) && - (*(npy_ulonglong *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_ulonglong in1 = *(npy_ulonglong *)ip1; - const npy_ulonglong in2 = *(npy_ulonglong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ulonglong *)op1) = 0; - } 
else{ -#if 0 - /* handle mixed case the way Python does */ - const npy_ulonglong rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ulonglong *)op1) = rem; - } - else { - *((npy_ulonglong *)op1) = rem + in2; - } -#else - *((npy_ulonglong *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) && - (*(npy_ulonglong *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 0 - BINARY_LOOP_TWO_OUT { - const npy_ulonglong in1 = *(npy_ulonglong *)ip1; - const npy_ulonglong in2 = *(npy_ulonglong *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_ULONGLONG && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_ulonglong *)op1) = 0; - *((npy_ulonglong *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_ulonglong quo = in1 / in2; - const npy_ulonglong rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_ulonglong *)op1) = quo; - *((npy_ulonglong *)op2) = rem; - } - else { - *((npy_ulonglong *)op1) = quo - 1; - *((npy_ulonglong *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_ulonglong in1 = *(npy_ulonglong *)ip1; - const npy_ulonglong in2 = *(npy_ulonglong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_ulonglong *)op1) = 0; - *((npy_ulonglong *)op2) = 0; - } - else { - *((npy_ulonglong *)op1)= in1/in2; - *((npy_ulonglong *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_BYTE == 8 - #if 1 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_BYTE == 16 - #if 1 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_BYTE == 32 - #if 1 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_BYTE == 64 - #if 1 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) && - (*(npy_byte *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_byte in1 = *(npy_byte *)ip1; - const npy_byte in2 = *(npy_byte *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_byte *)op1) = 0; - } else{ - *((npy_byte *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT 
void NPY_CPU_DISPATCH_CURFX(BYTE_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) && - (*(npy_byte *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_byte in1 = *(npy_byte *)ip1; - const npy_byte in2 = *(npy_byte *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_byte *)op1) = 0; - } else{ -#if 1 - /* handle mixed case the way Python does */ - const npy_byte rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_byte *)op1) = rem; - } - else { - *((npy_byte *)op1) = rem + in2; - } -#else - *((npy_byte *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) && - (*(npy_byte *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 1 - BINARY_LOOP_TWO_OUT { - const npy_byte in1 = *(npy_byte *)ip1; - const npy_byte in2 = *(npy_byte *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_BYTE && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_byte *)op1) = 0; - *((npy_byte *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_byte quo = in1 / in2; - const npy_byte rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_byte *)op1) = quo; - *((npy_byte *)op2) = rem; - } - else { - *((npy_byte *)op1) = quo - 1; - *((npy_byte *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_byte in1 = *(npy_byte *)ip1; - const npy_byte in2 = *(npy_byte *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_byte *)op1) = 0; - *((npy_byte *)op2) = 0; - } - else { - *((npy_byte *)op1)= in1/in2; - *((npy_byte *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_SHORT == 8 - #if 1 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_SHORT == 16 - #if 1 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_SHORT == 32 - #if 1 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_SHORT == 64 - #if 1 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && 
defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) && - (*(npy_short *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_short in1 = *(npy_short *)ip1; - const npy_short in2 = *(npy_short *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_short *)op1) = 0; - } else{ - *((npy_short *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) && - (*(npy_short *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_short in1 = *(npy_short *)ip1; - const npy_short in2 = *(npy_short *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_short *)op1) = 0; - } else{ -#if 1 - /* handle mixed case the way Python does */ - const npy_short rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_short *)op1) = rem; - } - else { - *((npy_short *)op1) = rem + in2; - } -#else - *((npy_short *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) && - (*(npy_short *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 1 - BINARY_LOOP_TWO_OUT { - const npy_short in1 = *(npy_short *)ip1; - const npy_short in2 = *(npy_short *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_SHORT && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_short *)op1) = 0; - *((npy_short *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_short quo = in1 / in2; - const npy_short rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_short *)op1) = quo; - *((npy_short *)op2) = rem; - } - else { - *((npy_short *)op1) = quo - 1; - *((npy_short *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_short in1 = *(npy_short *)ip1; - const npy_short in2 = *(npy_short *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_short *)op1) = 0; - *((npy_short *)op2) = 0; - } - else { - *((npy_short *)op1)= in1/in2; - *((npy_short *)op2) = in1 % in2; - } - } -#endif -} - -#line 
494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_INT == 8 - #if 1 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_INT == 16 - #if 1 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_INT == 32 - #if 1 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_INT == 64 - #if 1 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) && - (*(npy_int *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_int in1 = *(npy_int *)ip1; - const npy_int in2 = *(npy_int *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_int *)op1) = 0; - } else{ - *((npy_int *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) && - (*(npy_int *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_int in1 = *(npy_int *)ip1; - const npy_int in2 = *(npy_int *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_int *)op1) = 0; - } else{ -#if 1 - /* handle mixed case the way Python does */ - const npy_int rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_int *)op1) = rem; - } - else { - *((npy_int *)op1) = rem + in2; - } -#else - *((npy_int *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) && - (*(npy_int *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 1 - BINARY_LOOP_TWO_OUT { - const npy_int in1 = *(npy_int *)ip1; - const npy_int in2 = *(npy_int *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_INT && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_int *)op1) = 0; - 
*((npy_int *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_int quo = in1 / in2; - const npy_int rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_int *)op1) = quo; - *((npy_int *)op2) = rem; - } - else { - *((npy_int *)op1) = quo - 1; - *((npy_int *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_int in1 = *(npy_int *)ip1; - const npy_int in2 = *(npy_int *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_int *)op1) = 0; - *((npy_int *)op2) = 0; - } - else { - *((npy_int *)op1)= in1/in2; - *((npy_int *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_LONG == 8 - #if 1 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_LONG == 16 - #if 1 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_LONG == 32 - #if 1 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_LONG == 64 - #if 1 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) && - (*(npy_long *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_long in1 = *(npy_long *)ip1; - const npy_long in2 = *(npy_long *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_long *)op1) = 0; - } else{ - *((npy_long *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) && - (*(npy_long *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_long in1 = *(npy_long *)ip1; - const npy_long in2 = *(npy_long *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_long *)op1) = 0; - } else{ -#if 1 - /* handle mixed case the way Python does */ - const npy_long rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_long *)op1) = rem; - } - else { - *((npy_long *)op1) = rem + in2; - } -#else - *((npy_long *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same 
size - if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) && - (*(npy_long *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 1 - BINARY_LOOP_TWO_OUT { - const npy_long in1 = *(npy_long *)ip1; - const npy_long in2 = *(npy_long *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_LONG && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_long *)op1) = 0; - *((npy_long *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_long quo = in1 / in2; - const npy_long rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_long *)op1) = quo; - *((npy_long *)op2) = rem; - } - else { - *((npy_long *)op1) = quo - 1; - *((npy_long *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_long in1 = *(npy_long *)ip1; - const npy_long in2 = *(npy_long *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_long *)op1) = 0; - *((npy_long *)op2) = 0; - } - else { - *((npy_long *)op1)= in1/in2; - *((npy_long *)op2) = in1 % in2; - } - } -#endif -} - -#line 494 -#undef TO_SIMD_SFX -#if 0 -#line 499 -#elif NPY_BITSOF_LONGLONG == 8 - #if 1 - #define TO_SIMD_SFX(X) X##_s8 - #else - #define TO_SIMD_SFX(X) X##_u8 - #endif - -#line 499 -#elif NPY_BITSOF_LONGLONG == 16 - #if 1 - #define TO_SIMD_SFX(X) X##_s16 - #else - #define TO_SIMD_SFX(X) X##_u16 - #endif - -#line 499 -#elif NPY_BITSOF_LONGLONG == 32 - #if 1 - #define TO_SIMD_SFX(X) X##_s32 - #else - #define TO_SIMD_SFX(X) X##_u32 - #endif - -#line 499 -#elif NPY_BITSOF_LONGLONG == 64 - #if 1 - #define TO_SIMD_SFX(X) X##_s64 - #else - #define TO_SIMD_SFX(X) X##_u64 - #endif - -#endif - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) && - (*(npy_longlong *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_longlong in1 = *(npy_longlong *)ip1; - const npy_longlong in2 = *(npy_longlong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_longlong *)op1) = 0; - } else{ - *((npy_longlong *)op1)= in1 % in2; - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_remainder) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) && - (*(npy_longlong *)args[1]) != 0) { - 
TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif - BINARY_LOOP { - const npy_longlong in1 = *(npy_longlong *)ip1; - const npy_longlong in2 = *(npy_longlong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_longlong *)op1) = 0; - } else{ -#if 1 - /* handle mixed case the way Python does */ - const npy_longlong rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_longlong *)op1) = rem; - } - else { - *((npy_longlong *)op1) = rem + in2; - } -#else - *((npy_longlong *)op1)= in1 % in2; -#endif - } - } -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_divmod) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ -#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX) - // both arguments are arrays of the same size - if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]); - return; - } - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) && - (*(npy_longlong *)args[1]) != 0) { - TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]); - return ; - } -#endif -#if 1 - BINARY_LOOP_TWO_OUT { - const npy_longlong in1 = *(npy_longlong *)ip1; - const npy_longlong in2 = *(npy_longlong *)ip2; - /* see FIXME note for divide above */ - if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_LONGLONG && in2 == -1))) { - npy_set_floatstatus_divbyzero(); - *((npy_longlong *)op1) = 0; - *((npy_longlong *)op2) = 0; - } - else { - /* handle mixed case the way Python does */ - const npy_longlong quo = in1 / in2; - const npy_longlong rem = in1 % in2; - if ((in1 > 0) == (in2 > 0) || rem == 0) { - *((npy_longlong *)op1) = quo; - *((npy_longlong *)op2) = rem; - } - else { - *((npy_longlong *)op1) = quo - 1; - *((npy_longlong *)op2) = rem + in2; - } - } - } -#else - BINARY_LOOP_TWO_OUT { - const npy_longlong in1 = *(npy_longlong *)ip1; - const npy_longlong in2 = *(npy_longlong *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((npy_longlong *)op1) = 0; - *((npy_longlong *)op2) = 0; - } - else { - *((npy_longlong *)op1)= in1/in2; - *((npy_longlong *)op2) = in1 % in2; - } - } -#endif -} - - From c0b3b2eb47cedd38acee4db86ca1393d2837eac3 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Thu, 2 Jun 2022 20:56:51 -0300 Subject: [PATCH 05/15] REV: removing inserted newlines See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- numpy/core/shape_base.pyi | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi index de5dda3bbfa4..812ae017b9e3 100644 --- a/numpy/core/shape_base.pyi +++ b/numpy/core/shape_base.pyi @@ -9,7 +9,6 @@ _ArrayType = TypeVar("_ArrayType", bound=NDArray[Any]) __all__: list[str] - @overload def atleast_1d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... @overload @@ -17,7 +16,6 @@ def atleast_1d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_1d(*arys: ArrayLike) -> list[NDArray[Any]]: ... - @overload def atleast_2d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... @overload @@ -25,7 +23,6 @@ def atleast_2d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_2d(*arys: ArrayLike) -> list[NDArray[Any]]: ... - @overload def atleast_3d(arys: _ArrayLike[_SCT], /) -> NDArray[_SCT]: ... 
@overload @@ -33,19 +30,16 @@ def atleast_3d(arys: ArrayLike, /) -> NDArray[Any]: ... @overload def atleast_3d(*arys: ArrayLike) -> list[NDArray[Any]]: ... - @overload def vstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... @overload def vstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... - @overload def hstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... @overload def hstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... - @overload def stack( arrays: Sequence[_ArrayLike[_SCT]], @@ -55,8 +49,6 @@ def stack( dtype: None = ..., casting: None | _CastingKind = ... ) -> NDArray[_SCT]: ... - - @overload def stack( arrays: Sequence[ArrayLike], @@ -66,8 +58,6 @@ def stack( dtype: None = ..., casting: None | _CastingKind = ... ) -> NDArray[Any]: ... - - @overload def stack( arrays: Sequence[ArrayLike], @@ -78,7 +68,6 @@ def stack( casting: None | _CastingKind = ... ) -> _ArrayType: ... - @overload def block(arrays: _ArrayLike[_SCT]) -> NDArray[_SCT]: ... @overload From ec714a31a55b7c240a88a4d8d7191e4bd4724205 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Thu, 2 Jun 2022 20:58:28 -0300 Subject: [PATCH 06/15] DOC: inserting versionadded info in dtype and casting parameters. See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- numpy/core/shape_base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py index 6544ce68ba8f..975ca68cf2eb 100644 --- a/numpy/core/shape_base.py +++ b/numpy/core/shape_base.py @@ -345,8 +345,8 @@ def hstack(tup): return _nx.concatenate(arrs, 1) -def _stack_dispatcher(arrays, axis=None, out=None, *, - dtype=None, casting=None): +def _stack_dispatcher(arrays, axis=None, out=None, *, + dtype=None, casting=None): arrays = _arrays_for_stack_dispatcher(arrays, stacklevel=6) if out is not None: # optimize for the typical case where only arrays is provided @@ -383,9 +383,13 @@ def stack(arrays, axis=0, out=None, *, dtype=None, casting="same_kind"): If provided, the destination array will have this dtype. Cannot be provided together with `out`. + .. versionadded:: 1.24 + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional Controls what kind of data casting may occur. Defaults to 'same_kind'. + .. 
versionadded:: 1.24 + Returns ------- @@ -440,7 +444,7 @@ def stack(arrays, axis=0, out=None, *, dtype=None, casting="same_kind"): sl = (slice(None),) * axis + (_nx.newaxis,) expanded_arrays = [arr[sl] for arr in arrays] return _nx.concatenate(expanded_arrays, axis=axis, out=out, - dtype=dtype, casting=casting) + dtype=dtype, casting=casting) # Internal functions to eliminate the overhead of repeated dispatch in one of From 653aa1a016b9f5f0234a0ec344c4b4c1cd0bad85 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Thu, 2 Jun 2022 20:59:27 -0300 Subject: [PATCH 07/15] TST: writing tests to stack method with dtype and casting options See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- numpy/core/tests/test_shape_base.py | 49 ++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index 679e3c036351..969f273ed10a 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -3,13 +3,13 @@ from numpy.core import ( array, arange, atleast_1d, atleast_2d, atleast_3d, block, vstack, hstack, newaxis, concatenate, stack - ) +) from numpy.core.shape_base import (_block_dispatcher, _block_setup, _block_concatenate, _block_slicing) from numpy.testing import ( assert_, assert_raises, assert_array_equal, assert_equal, assert_raises_regex, assert_warns, IS_PYPY - ) +) class TestAtleast1d: @@ -345,7 +345,7 @@ def test_bad_out_shape(self): @pytest.mark.parametrize("axis", [None, 0]) @pytest.mark.parametrize("out_dtype", ["c8", "f4", "f8", ">f8", "i8", "S4"]) @pytest.mark.parametrize("casting", - ['no', 'equiv', 'safe', 'same_kind', 'unsafe']) + ['no', 'equiv', 'safe', 'same_kind', 'unsafe']) def test_out_and_dtype(self, axis, out_dtype, casting): # Compare usage of `out=out` with `dtype=out.dtype` out = np.empty(4, dtype=out_dtype) @@ -372,7 +372,7 @@ def test_out_and_dtype(self, axis, out_dtype, casting): @pytest.mark.parametrize("axis", [None, 0]) @pytest.mark.parametrize("string_dt", ["S", "U", "S0", "U0"]) @pytest.mark.parametrize("arrs", - [([0.],), ([0.], [1]), ([0], ["string"], [1.])]) + [([0.],), ([0.], [1]), ([0], ["string"], [1.])]) def test_dtype_with_promotion(self, arrs, string_dt, axis): # Note that U0 and S0 should be deprecated eventually and changed to # actually give the empty string result (together with `np.array`) @@ -449,6 +449,41 @@ def test_stack(): with assert_warns(FutureWarning): result = stack((x for x in range(3))) assert_array_equal(result, np.array([0, 1, 2])) + #casting and dtype test + a = np.array([1, 2, 3]) + b = np.array([2.5, 3.5, 4.5]) + res = np.stack((a, b), axis=1, casting="unsafe", dtype=np.int64) + expected_res = np.array([[1, 2], [2, 3], [3, 4]]) + assert_array_equal(res, expected_res) + #casting and dtype with TypeError + with assert_raises(TypeError): + stack((a, b), dtype=np.int64, axis=1, casting="safe") + + +@pytest.mark.parametrize("axis", [0]) +@pytest.mark.parametrize("out_dtype", ["c8", "f4", "f8", ">f8", "i8"]) +@pytest.mark.parametrize("casting", + ['no', 'equiv', 'safe', 'same_kind', 'unsafe']) +def test_stack_out_and_dtype(axis, out_dtype, casting): + to_concat = (array([1, 2]), array([3, 4])) + res = array([[1, 2], [3, 4]]) + out = np.zeros_like(res) + + if not np.can_cast(to_concat[0], out_dtype, casting=casting): + with assert_raises(TypeError): + stack(to_concat, dtype=out_dtype, + axis=axis, casting=casting) + else: + res_out = stack(to_concat, out=out, + 
axis=axis, casting=casting) + res_dtype = stack(to_concat, dtype=out_dtype, + axis=axis, casting=casting) + assert res_out is out + assert_array_equal(out, res_dtype) + assert res_dtype.dtype == out_dtype + + with assert_raises(TypeError): + stack(to_concat, out=out, dtype=out_dtype, axis=axis) class TestBlock: @@ -588,9 +623,9 @@ def test_nested(self, block): result = block([ [ block([ - [one], - [three], - [four] + [one], + [three], + [four] ]), two ], From 9b8d4cdec84f0880e3b72ef1e940af9ac2029c4d Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Thu, 2 Jun 2022 21:04:11 -0300 Subject: [PATCH 08/15] DOC: adding upcoming_change file for new options casting and dtype in method stack. See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- doc/release/upcoming_changes/21627.new_feature.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 doc/release/upcoming_changes/21627.new_feature.rst diff --git a/doc/release/upcoming_changes/21627.new_feature.rst b/doc/release/upcoming_changes/21627.new_feature.rst new file mode 100644 index 000000000000..370ff7a0edc4 --- /dev/null +++ b/doc/release/upcoming_changes/21627.new_feature.rst @@ -0,0 +1,4 @@ +``casting and dtype`` options for `numpy.stack` +---------------------------------------------------- +The ``casting and dtype`` options is now available for `numpy.stack`. +To use it, write ``np.stack(..., dtype=None, casting='same_kind')``. From df2c73fbd6416d01402d4a99553278fddc537f2f Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Thu, 2 Jun 2022 21:09:02 -0300 Subject: [PATCH 09/15] REV: reverting lint errors. See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- numpy/core/tests/test_shape_base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index 969f273ed10a..955c80b2d58f 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -3,13 +3,13 @@ from numpy.core import ( array, arange, atleast_1d, atleast_2d, atleast_3d, block, vstack, hstack, newaxis, concatenate, stack -) + ) from numpy.core.shape_base import (_block_dispatcher, _block_setup, _block_concatenate, _block_slicing) from numpy.testing import ( assert_, assert_raises, assert_array_equal, assert_equal, assert_raises_regex, assert_warns, IS_PYPY -) + ) class TestAtleast1d: @@ -345,7 +345,7 @@ def test_bad_out_shape(self): @pytest.mark.parametrize("axis", [None, 0]) @pytest.mark.parametrize("out_dtype", ["c8", "f4", "f8", ">f8", "i8", "S4"]) @pytest.mark.parametrize("casting", - ['no', 'equiv', 'safe', 'same_kind', 'unsafe']) + ['no', 'equiv', 'safe', 'same_kind', 'unsafe']) def test_out_and_dtype(self, axis, out_dtype, casting): # Compare usage of `out=out` with `dtype=out.dtype` out = np.empty(4, dtype=out_dtype) @@ -372,7 +372,7 @@ def test_out_and_dtype(self, axis, out_dtype, casting): @pytest.mark.parametrize("axis", [None, 0]) @pytest.mark.parametrize("string_dt", ["S", "U", "S0", "U0"]) @pytest.mark.parametrize("arrs", - [([0.],), ([0.], [1]), ([0], ["string"], [1.])]) + [([0.],), ([0.], [1]), ([0], ["string"], [1.])]) def test_dtype_with_promotion(self, arrs, string_dt, axis): # Note that U0 and S0 should be deprecated eventually and changed to # actually give the empty string result (together with `np.array`) @@ -623,9 +623,9 @@ def test_nested(self, block): result = block([ [ block([ - [one], - [three], - [four] + [one], + 
[three], + [four] ]), two ], From da1154592e6355f43965efaa22a3a1cfd23a42e8 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Sat, 4 Jun 2022 14:55:53 -0300 Subject: [PATCH 10/15] DOC: inserting hstack and vstack methods in upcoming changes See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- .../upcoming_changes/21627.new_feature.rst | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/release/upcoming_changes/21627.new_feature.rst b/doc/release/upcoming_changes/21627.new_feature.rst index 370ff7a0edc4..6330db398d42 100644 --- a/doc/release/upcoming_changes/21627.new_feature.rst +++ b/doc/release/upcoming_changes/21627.new_feature.rst @@ -1,4 +1,16 @@ -``casting and dtype`` options for `numpy.stack` ----------------------------------------------------- -The ``casting and dtype`` options is now available for `numpy.stack`. +``casting`` and ``dtype`` keyword arguments for `numpy.stack` +------------------------------------------------------------- +The ``casting`` and ``dtype`` keyword arguments is now available for `numpy.stack`. To use it, write ``np.stack(..., dtype=None, casting='same_kind')``. + + +``casting`` and ``dtype`` keyword arguments for `numpy.vstack` +-------------------------------------------------------------- +The ``casting`` and ``dtype`` keyword arguments is now available for `numpy.vstack`. +To use it, write ``np.vstack(..., dtype=None, casting='same_kind')``. + + +``casting`` and ``dtype`` keyword arguments for `numpy.hstack` +-------------------------------------------------------------- +The ``casting`` and ``dtype`` keyword arguments is now available for `numpy.hstack`. +To use it, write ``np.hstack(..., dtype=None, casting='same_kind')``. \ No newline at end of file From 8296c43d533ad2b8bcbfb8c0c481c6978f93bdfc Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Sat, 4 Jun 2022 14:58:10 -0300 Subject: [PATCH 11/15] ENH: adding dtype and casting keyword arguments to numpy.vstack and numpy.hstack. See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- numpy/core/shape_base.py | 35 ++++++++++++++++++++++++----- numpy/core/shape_base.pyi | 46 ++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py index 975ca68cf2eb..c5e0ad475449 100644 --- a/numpy/core/shape_base.py +++ b/numpy/core/shape_base.py @@ -215,12 +215,13 @@ def _arrays_for_stack_dispatcher(arrays, stacklevel=4): return arrays -def _vhstack_dispatcher(tup): +def _vhstack_dispatcher(tup, *, + dtype=None, casting=None): return _arrays_for_stack_dispatcher(tup) @array_function_dispatch(_vhstack_dispatcher) -def vstack(tup): +def vstack(tup, *, dtype=None, casting="same_kind"): """ Stack arrays in sequence vertically (row wise). @@ -239,6 +240,17 @@ def vstack(tup): The arrays must have the same shape along all but the first axis. 1-D arrays must have the same length. + dtype : str or dtype + If provided, the destination array will have this dtype. Cannot be + provided together with `out`. + + .. versionadded:: 1.24 + + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Defaults to 'same_kind'. + + .. 
versionadded:: 1.24 + Returns ------- stacked : ndarray @@ -279,11 +291,11 @@ def vstack(tup): arrs = atleast_2d(*tup) if not isinstance(arrs, list): arrs = [arrs] - return _nx.concatenate(arrs, 0) + return _nx.concatenate(arrs, 0, dtype=dtype, casting=casting) @array_function_dispatch(_vhstack_dispatcher) -def hstack(tup): +def hstack(tup, *, dtype=None, casting="same_kind"): """ Stack arrays in sequence horizontally (column wise). @@ -302,6 +314,17 @@ def hstack(tup): The arrays must have the same shape along all but the second axis, except 1-D arrays which can be any length. + dtype : str or dtype + If provided, the destination array will have this dtype. Cannot be + provided together with `out`. + + .. versionadded:: 1.24 + + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. Defaults to 'same_kind'. + + .. versionadded:: 1.24 + Returns ------- stacked : ndarray @@ -340,9 +363,9 @@ def hstack(tup): arrs = [arrs] # As a special case, dimension 0 of 1-dimensional arrays is "horizontal" if arrs and arrs[0].ndim == 1: - return _nx.concatenate(arrs, 0) + return _nx.concatenate(arrs, 0, dtype=dtype, casting=casting) else: - return _nx.concatenate(arrs, 1) + return _nx.concatenate(arrs, 1, dtype=dtype, casting=casting) def _stack_dispatcher(arrays, axis=None, out=None, *, diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi index 812ae017b9e3..794993f5813b 100644 --- a/numpy/core/shape_base.pyi +++ b/numpy/core/shape_base.pyi @@ -1,8 +1,8 @@ from collections.abc import Sequence from typing import TypeVar, overload, Any, SupportsIndex -from numpy import generic -from numpy._typing import ArrayLike, NDArray, _ArrayLike, _CastingKind +from numpy import generic, _CastingKind +from numpy._typing import ArrayLike, NDArray, _ArrayLike, DTypeLike _SCT = TypeVar("_SCT", bound=generic) _ArrayType = TypeVar("_ArrayType", bound=NDArray[Any]) @@ -31,23 +31,43 @@ def atleast_3d(arys: ArrayLike, /) -> NDArray[Any]: ... def atleast_3d(*arys: ArrayLike) -> list[NDArray[Any]]: ... @overload -def vstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... +def vstack( + tup: Sequence[_ArrayLike[_SCT]], + *, + dtype: DTypeLike = ..., + casting: _CastingKind = ... +) -> NDArray[_SCT]: ... @overload -def vstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... +def vstack( + tup: Sequence[ArrayLike], + *, + dtype: DTypeLike = ..., + casting: _CastingKind = ... +) -> NDArray[Any]: ... @overload -def hstack(tup: Sequence[_ArrayLike[_SCT]]) -> NDArray[_SCT]: ... +def hstack( + tup: Sequence[_ArrayLike[_SCT]], + *, + dtype: DTypeLike = ..., + casting: _CastingKind = ... +) -> NDArray[_SCT]: ... @overload -def hstack(tup: Sequence[ArrayLike]) -> NDArray[Any]: ... +def hstack( + tup: Sequence[ArrayLike], + *, + dtype: DTypeLike = ..., + casting: _CastingKind = ... +) -> NDArray[Any]: ... @overload def stack( arrays: Sequence[_ArrayLike[_SCT]], axis: SupportsIndex = ..., - out: None = ..., + out: DTypeLike = ..., *, - dtype: None = ..., - casting: None | _CastingKind = ... + dtype: DTypeLike = ..., + casting: _CastingKind = ... ) -> NDArray[_SCT]: ... @overload def stack( @@ -55,8 +75,8 @@ def stack( axis: SupportsIndex = ..., out: None = ..., *, - dtype: None = ..., - casting: None | _CastingKind = ... + dtype: DTypeLike = ..., + casting: _CastingKind = ... ) -> NDArray[Any]: ... @overload def stack( @@ -64,8 +84,8 @@ def stack( axis: SupportsIndex = ..., out: _ArrayType = ..., *, - dtype: None = ..., - casting: None | _CastingKind = ... 
+ dtype: DTypeLike = ..., + casting: _CastingKind = ... ) -> _ArrayType: ... @overload From 7bf67816a57cff92227273c4ffffe93d1fc74c59 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Sat, 4 Jun 2022 14:59:28 -0300 Subject: [PATCH 12/15] TST: writing tests to vstack and hstack methods with dtype and casting keyword arguments. See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- numpy/core/tests/test_shape_base.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index 955c80b2d58f..c8dbb144a25f 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -157,6 +157,19 @@ def test_generator(self): with assert_warns(FutureWarning): hstack(map(lambda x: x, np.ones((3, 2)))) + def test_casting_and_dtype(self): + a = np.array([1, 2, 3]) + b = np.array([2.5, 3.5, 4.5]) + res = np.hstack((a, b), casting="unsafe", dtype=np.int64) + expected_res = np.array([1, 2, 3, 2, 3, 4]) + assert_array_equal(res, expected_res) + + def test_casting_and_dtype_type_error(self): + a = np.array([1, 2, 3]) + b = np.array([2.5, 3.5, 4.5]) + with pytest.raises(TypeError): + hstack((a, b), casting="safe", dtype=np.int64) + class TestVstack: def test_non_iterable(self): @@ -197,6 +210,20 @@ def test_generator(self): with assert_warns(FutureWarning): vstack((np.arange(3) for _ in range(2))) + def test_casting_and_dtype(self): + a = np.array([1, 2, 3]) + b = np.array([2.5, 3.5, 4.5]) + res = np.vstack((a, b), casting="unsafe", dtype=np.int64) + expected_res = np.array([[1, 2, 3], [2, 3, 4]]) + assert_array_equal(res, expected_res) + + def test_casting_and_dtype_type_error(self): + a = np.array([1, 2, 3]) + b = np.array([2.5, 3.5, 4.5]) + with pytest.raises(TypeError): + vstack((a, b), casting="safe", dtype=np.int64) + + class TestConcatenate: def test_returns_copy(self): From ce0ae3545ba61c86b4dc385e947018dc2e22f292 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Sat, 4 Jun 2022 16:17:57 -0300 Subject: [PATCH 13/15] REV: reverting the 'out' option type in stack method. See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- numpy/core/shape_base.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi index 794993f5813b..38ded19fbcaa 100644 --- a/numpy/core/shape_base.pyi +++ b/numpy/core/shape_base.pyi @@ -64,7 +64,7 @@ def hstack( def stack( arrays: Sequence[_ArrayLike[_SCT]], axis: SupportsIndex = ..., - out: DTypeLike = ..., + out: None = ..., *, dtype: DTypeLike = ..., casting: _CastingKind = ... @@ -82,7 +82,7 @@ def stack( def stack( arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., - out: _ArrayType = ..., + out: None = ..., *, dtype: DTypeLike = ..., casting: _CastingKind = ... From 1e94f431ed685ead3043231334055f15be291a90 Mon Sep 17 00:00:00 2001 From: JessePires Date: Mon, 6 Jun 2022 17:54:05 -0300 Subject: [PATCH 14/15] REV: Reverting out type changes in overload of shape_base.pyi file. 
See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: jhonatancunha Co-authored-by: patriarka --- numpy/core/shape_base.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi index 38ded19fbcaa..82541b55b067 100644 --- a/numpy/core/shape_base.pyi +++ b/numpy/core/shape_base.pyi @@ -82,7 +82,7 @@ def stack( def stack( arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., - out: None = ..., + out: _ArrayType = ..., *, dtype: DTypeLike = ..., casting: _CastingKind = ... From 43eadef06032478a77f99be210df2eb423f60c23 Mon Sep 17 00:00:00 2001 From: jhonatancunha Date: Wed, 8 Jun 2022 10:48:16 -0300 Subject: [PATCH 15/15] DOC: correcting some english erros in upcoming_changes file. See numpy#20959 Co-authored-by: alescrocaro Co-authored-by: JessePires Co-authored-by: patriarka --- doc/release/upcoming_changes/21627.new_feature.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/release/upcoming_changes/21627.new_feature.rst b/doc/release/upcoming_changes/21627.new_feature.rst index 6330db398d42..f516ac96d0b6 100644 --- a/doc/release/upcoming_changes/21627.new_feature.rst +++ b/doc/release/upcoming_changes/21627.new_feature.rst @@ -1,16 +1,16 @@ ``casting`` and ``dtype`` keyword arguments for `numpy.stack` ------------------------------------------------------------- -The ``casting`` and ``dtype`` keyword arguments is now available for `numpy.stack`. -To use it, write ``np.stack(..., dtype=None, casting='same_kind')``. +The ``casting`` and ``dtype`` keyword arguments are now available for `numpy.stack`. +To use them, write ``np.stack(..., dtype=None, casting='same_kind')``. ``casting`` and ``dtype`` keyword arguments for `numpy.vstack` -------------------------------------------------------------- -The ``casting`` and ``dtype`` keyword arguments is now available for `numpy.vstack`. -To use it, write ``np.vstack(..., dtype=None, casting='same_kind')``. +The ``casting`` and ``dtype`` keyword arguments are now available for `numpy.vstack`. +To use them, write ``np.vstack(..., dtype=None, casting='same_kind')``. ``casting`` and ``dtype`` keyword arguments for `numpy.hstack` -------------------------------------------------------------- -The ``casting`` and ``dtype`` keyword arguments is now available for `numpy.hstack`. -To use it, write ``np.hstack(..., dtype=None, casting='same_kind')``. \ No newline at end of file +The ``casting`` and ``dtype`` keyword arguments are now available for `numpy.hstack`. +To use them, write ``np.hstack(..., dtype=None, casting='same_kind')``. \ No newline at end of file
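Taken together, the patches above give ``numpy.stack``, ``numpy.vstack`` and ``numpy.hstack`` the same ``dtype`` and ``casting`` keywords that ``numpy.concatenate`` already accepts. The snippet below is a minimal usage sketch, not part of any patch in the series: it assumes a NumPy build with this series applied (the keywords carry ``versionadded:: 1.24`` markers), and the expected results are the ones exercised by the new tests in numpy/core/tests/test_shape_base.py.

    import numpy as np

    a = np.array([1, 2, 3])        # integer input
    b = np.array([2.5, 3.5, 4.5])  # float input

    # Without dtype/casting the behaviour is unchanged: the result is
    # promoted to float64, exactly as before this series.
    np.stack((a, b), axis=1)

    # Requesting an integer result needs an explicit lossy cast; the
    # truncated values below match the expectations in the new tests.
    np.stack((a, b), axis=1, dtype=np.int64, casting="unsafe")   # [[1, 2], [2, 3], [3, 4]]
    np.vstack((a, b), dtype=np.int64, casting="unsafe")          # [[1, 2, 3], [2, 3, 4]]
    np.hstack((a, b), dtype=np.int64, casting="unsafe")          # [1, 2, 3, 2, 3, 4]

    # Stricter casting rules reject the float -> int conversion,
    # mirroring numpy.concatenate.
    try:
        np.stack((a, b), axis=1, dtype=np.int64, casting="safe")
    except TypeError as exc:
        print("refused:", exc)

As with ``numpy.concatenate``, ``dtype`` cannot be combined with ``out`` (both would dictate the output type), which is why the new ``test_stack_out_and_dtype`` test asserts a ``TypeError`` for ``stack(to_concat, out=out, dtype=out_dtype, axis=axis)``.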