diff --git a/.circleci/config.yml b/.circleci/config.yml index fdb85be98be6..d582fe4ec48d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -19,6 +19,12 @@ jobs: command: | if [[ -v CI_PULL_REQUEST ]] ; then git pull --ff-only origin "refs/pull/${CI_PULL_REQUEST//*pull\//}/merge" ; fi + - run: + name: update submodules + command: | + git submodule init + git submodule update + - run: name: create virtual environment, install dependencies command: | diff --git a/.gitattributes b/.gitattributes index 8723dd9dc95a..1830531c1662 100644 --- a/.gitattributes +++ b/.gitattributes @@ -11,6 +11,7 @@ numpy/linalg/lapack_lite/f2c.c linguist-vendored numpy/linalg/lapack_lite/f2c.h linguist-vendored tools/npy_tempita/* linguist-vendored numpy/core/include/numpy/libdivide/* linguist-vendored +numpy/core/src/umath/svml/* linguist-vendored # Mark some files as generated numpy/linalg/lapack_lite/f2c_*.c linguist-generated diff --git a/.gitmodules b/.gitmodules index 0d6857868837..1ea274daf3b9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "doc/source/_static/scipy-mathjax"] path = doc/source/_static/scipy-mathjax url = https://github.com/scipy/scipy-mathjax.git +[submodule "numpy/core/src/umath/svml"] + path = numpy/core/src/umath/svml + url = https://github.com/numpy/SVML.git diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 714f62912a23..99ea407be6df 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -23,6 +23,9 @@ stages: pool: vmImage: 'ubuntu-20.04' steps: + - script: | + git submodule update --init + displayName: 'Fetch submodules' - script: | if ! 
`gcc 2>/dev/null`; then sudo apt install gcc @@ -71,6 +74,9 @@ stages: pool: vmImage: 'ubuntu-20.04' steps: + - script: | + git submodule update --init + displayName: 'Fetch submodules' - script: | docker run -v $(pwd):/numpy -e CFLAGS="-msse2 -std=c99 -UNDEBUG" \ -e F77=gfortran-5 -e F90=gfortran-5 quay.io/pypa/manylinux2014_i686 \ @@ -258,6 +264,9 @@ stages: pool: vmImage: 'ubuntu-20.04' steps: + - script: | + git submodule update --init + displayName: 'Fetch submodules' - script: | # create and activate conda environment conda env create -f environment.yml diff --git a/doc/release/upcoming_changes/19478.performance.rst b/doc/release/upcoming_changes/19478.performance.rst new file mode 100644 index 000000000000..6a389c20eb0c --- /dev/null +++ b/doc/release/upcoming_changes/19478.performance.rst @@ -0,0 +1,11 @@ +Vectorize umath module using AVX-512 +------------------------------------- + +By leveraging the Intel Short Vector Math Library (SVML), 18 umath functions +(``exp2``, ``log2``, ``log10``, ``expm1``, ``log1p``, ``cbrt``, ``sin``, +``cos``, ``tan``, ``arcsin``, ``arccos``, ``arctan``, ``sinh``, ``cosh``, +``tanh``, ``arcsinh``, ``arccosh``, ``arctanh``) are vectorized using the +AVX-512 instruction set for both single and double precision implementations. +This change is currently enabled only for Linux users and on processors with +the AVX-512 instruction set. It provides an average speedup of 32x and 14x for +single and double precision functions, respectively. 
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 4891e8f2318a..3a27a34cdd51 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -359,7 +359,7 @@ def english_upper(s): docstrings.get('numpy.core.umath.fmod'), None, TD(ints), - TD(flts, f='fmod', astype={'e':'f'}), + TD(flts, f='fmod', astype={'e': 'f'}), TD(P, f='fmod'), ), 'square': @@ -390,7 +390,7 @@ def english_upper(s): docstrings.get('numpy.core.umath.power'), None, TD(ints), - TD(inexact, f='pow', astype={'e':'f'}), + TD(inexact, f='pow', astype={'e': 'f'}), TD(O, f='npy_ObjectPower'), ), 'float_power': @@ -551,13 +551,13 @@ def english_upper(s): Ufunc(2, 1, MinusInfinity, docstrings.get('numpy.core.umath.logaddexp'), None, - TD(flts, f="logaddexp", astype={'e':'f'}) + TD(flts, f="logaddexp", astype={'e': 'f'}) ), 'logaddexp2': Ufunc(2, 1, MinusInfinity, docstrings.get('numpy.core.umath.logaddexp2'), None, - TD(flts, f="logaddexp2", astype={'e':'f'}) + TD(flts, f="logaddexp2", astype={'e': 'f'}) ), 'bitwise_and': Ufunc(2, 1, AllOnes, @@ -605,80 +605,93 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.heaviside'), None, - TD(flts, f='heaviside', astype={'e':'f'}), + TD(flts, f='heaviside', astype={'e': 'f'}), ), 'degrees': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.degrees'), None, - TD(fltsP, f='degrees', astype={'e':'f'}), + TD(fltsP, f='degrees', astype={'e': 'f'}), ), 'rad2deg': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.rad2deg'), None, - TD(fltsP, f='rad2deg', astype={'e':'f'}), + TD(fltsP, f='rad2deg', astype={'e': 'f'}), ), 'radians': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.radians'), None, - TD(fltsP, f='radians', astype={'e':'f'}), + TD(fltsP, f='radians', astype={'e': 'f'}), ), 'deg2rad': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.deg2rad'), None, - TD(fltsP, f='deg2rad', astype={'e':'f'}), + TD(fltsP, f='deg2rad', 
astype={'e': 'f'}), ), 'arccos': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arccos'), None, - TD(inexact, f='acos', astype={'e':'f'}), + TD('e', f='acos', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='acos', astype={'e': 'f'}), TD(P, f='arccos'), ), 'arccosh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arccosh'), None, - TD(inexact, f='acosh', astype={'e':'f'}), + TD('e', f='acosh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='acosh', astype={'e': 'f'}), TD(P, f='arccosh'), ), 'arcsin': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arcsin'), None, - TD(inexact, f='asin', astype={'e':'f'}), + TD('e', f='asin', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='asin', astype={'e': 'f'}), TD(P, f='arcsin'), ), 'arcsinh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arcsinh'), None, - TD(inexact, f='asinh', astype={'e':'f'}), + TD('e', f='asinh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='asinh', astype={'e': 'f'}), TD(P, f='arcsinh'), ), 'arctan': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arctan'), None, - TD(inexact, f='atan', astype={'e':'f'}), + TD('e', f='atan', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='atan', astype={'e': 'f'}), TD(P, f='arctan'), ), 'arctanh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arctanh'), None, - TD(inexact, f='atanh', astype={'e':'f'}), + TD('e', f='atanh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='atanh', astype={'e': 'f'}), TD(P, f='arctanh'), ), 'cos': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.cos'), None, - TD('e', f='cos', astype={'e':'f'}), + TD('e', f='cos', astype={'e': 'f'}), TD('f', dispatch=[('loops_trigonometric', 'f')]), + TD('d', dispatch=[('loops_umath_fp', 'd')]), TD('fdg' + cmplx, f='cos'), TD(P, f='cos'), ), @@ -686,8 +699,9 
@@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.sin'), None, - TD('e', f='sin', astype={'e':'f'}), + TD('e', f='sin', astype={'e': 'f'}), TD('f', dispatch=[('loops_trigonometric', 'f')]), + TD('d', dispatch=[('loops_umath_fp', 'd')]), TD('fdg' + cmplx, f='sin'), TD(P, f='sin'), ), @@ -695,35 +709,43 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.tan'), None, - TD(inexact, f='tan', astype={'e':'f'}), + TD('e', f='tan', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='tan', astype={'e': 'f'}), TD(P, f='tan'), ), 'cosh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.cosh'), None, - TD(inexact, f='cosh', astype={'e':'f'}), + TD('e', f='cosh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='cosh', astype={'e': 'f'}), TD(P, f='cosh'), ), 'sinh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.sinh'), None, - TD(inexact, f='sinh', astype={'e':'f'}), + TD('e', f='sinh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='sinh', astype={'e': 'f'}), TD(P, f='sinh'), ), 'tanh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.tanh'), None, - TD(inexact, f='tanh', astype={'e':'f'}), + TD('e', f='tanh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='tanh', astype={'e': 'f'}), TD(P, f='tanh'), ), 'exp': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.exp'), None, - TD('e', f='exp', astype={'e':'f'}), + TD('e', f='exp', astype={'e': 'f'}), TD('fd', dispatch=[('loops_exponent_log', 'fd')]), TD('fdg' + cmplx, f='exp'), TD(P, f='exp'), @@ -732,21 +754,25 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.exp2'), None, - TD(inexact, f='exp2', astype={'e':'f'}), + TD('e', f='exp2', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='exp2', astype={'e': 'f'}), TD(P, f='exp2'), ), 'expm1': Ufunc(1, 1, None, 
docstrings.get('numpy.core.umath.expm1'), None, - TD(inexact, f='expm1', astype={'e':'f'}), + TD('e', f='expm1', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='expm1', astype={'e': 'f'}), TD(P, f='expm1'), ), 'log': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log'), None, - TD('e', f='log', astype={'e':'f'}), + TD('e', f='log', astype={'e': 'f'}), TD('fd', dispatch=[('loops_exponent_log', 'fd')]), TD('fdg' + cmplx, f='log'), TD(P, f='log'), @@ -755,28 +781,34 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log2'), None, - TD(inexact, f='log2', astype={'e':'f'}), + TD('e', f='log2', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='log2', astype={'e': 'f'}), TD(P, f='log2'), ), 'log10': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log10'), None, - TD(inexact, f='log10', astype={'e':'f'}), + TD('e', f='log10', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='log10', astype={'e': 'f'}), TD(P, f='log10'), ), 'log1p': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log1p'), None, - TD(inexact, f='log1p', astype={'e':'f'}), + TD('e', f='log1p', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='log1p', astype={'e': 'f'}), TD(P, f='log1p'), ), 'sqrt': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.sqrt'), None, - TD('e', f='sqrt', astype={'e':'f'}), + TD('e', f='sqrt', astype={'e': 'f'}), TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg' + cmplx, f='sqrt'), TD(P, f='sqrt'), @@ -785,14 +817,16 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.cbrt'), None, - TD(flts, f='cbrt', astype={'e':'f'}), + TD('e', f='cbrt', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(flts, f='cbrt', astype={'e': 'f'}), TD(P, f='cbrt'), ), 'ceil': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.ceil'), None, - TD('e', f='ceil', 
astype={'e':'f'}), + TD('e', f='ceil', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg', f='ceil'), TD(O, f='npy_ObjectCeil'), @@ -801,7 +835,7 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.trunc'), None, - TD('e', f='trunc', astype={'e':'f'}), + TD('e', f='trunc', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg', f='trunc'), TD(O, f='npy_ObjectTrunc'), @@ -810,14 +844,14 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.fabs'), None, - TD(flts, f='fabs', astype={'e':'f'}), + TD(flts, f='fabs', astype={'e': 'f'}), TD(P, f='fabs'), ), 'floor': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.floor'), None, - TD('e', f='floor', astype={'e':'f'}), + TD('e', f='floor', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg', f='floor'), TD(O, f='npy_ObjectFloor'), @@ -826,7 +860,7 @@ def english_upper(s): Ufunc(1, 1, None, docstrings.get('numpy.core.umath.rint'), None, - TD('e', f='rint', astype={'e':'f'}), + TD('e', f='rint', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg' + cmplx, f='rint'), TD(P, f='rint'), @@ -835,7 +869,7 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.arctan2'), None, - TD(flts, f='atan2', astype={'e':'f'}), + TD(flts, f='atan2', astype={'e': 'f'}), TD(P, f='arctan2'), ), 'remainder': @@ -858,7 +892,7 @@ def english_upper(s): Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.hypot'), None, - TD(flts, f='hypot', astype={'e':'f'}), + TD(flts, f='hypot', astype={'e': 'f'}), TD(P, f='hypot'), ), 'isnan': diff --git a/numpy/core/setup.py b/numpy/core/setup.py index c20320910d22..38c68dfcca78 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -5,6 +5,7 @@ import warnings import platform import textwrap +import glob from os.path import join from numpy.distutils import log @@ -63,6 +64,20 @@ def check_complex(self, *a, 
**kw): out = copy.deepcopy(pickle.loads(self._check_complex)) return out +def can_link_svml(): + """SVML library is supported only on x86_64 architecture and currently + only on linux + """ + machine = platform.machine() + system = platform.system() + return "x86_64" in machine and system == "Linux" + +def check_svml_submodule(svmlpath): + if not os.path.exists(svmlpath + "/README.md"): + raise RuntimeError("Missing `SVML` submodule! Run `git submodule " + "update --init` to fix this.") + return True + def pythonlib_dir(): """return path where libpython* is.""" if sys.platform == 'win32': @@ -455,6 +470,9 @@ def generate_config_h(ext, build_dir): # Inline check inline = config_cmd.check_inline() + if can_link_svml(): + moredefs.append(('NPY_CAN_LINK_SVML', 1)) + # Use relaxed stride checking if NPY_RELAXED_STRIDES_CHECKING: moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) @@ -727,6 +745,7 @@ def get_mathlib_info(*args): join('src', 'common', 'npy_import.h'), join('src', 'common', 'npy_hashtable.h'), join('src', 'common', 'npy_longdouble.h'), + join('src', 'common', 'npy_svml.h'), join('src', 'common', 'templ_common.h.src'), join('src', 'common', 'ucsnarrow.h'), join('src', 'common', 'ufunc_override.h'), @@ -923,6 +942,7 @@ def generate_umath_c(ext, build_dir): join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), + join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), @@ -951,6 +971,11 @@ def generate_umath_c(ext, build_dir): join(codegen_dir, 'generate_ufunc_api.py'), ] + svml_path = join('numpy', 'core', 'src', 'umath', 'svml') + svml_objs = [] + if can_link_svml() and check_svml_submodule(svml_path): + svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True) + config.add_extension('_multiarray_umath', 
sources=multiarray_src + umath_src + common_src + @@ -965,6 +990,7 @@ def generate_umath_c(ext, build_dir): depends=deps + multiarray_deps + umath_deps + common_deps, libraries=['npymath'], + extra_objects=svml_objs, extra_info=extra_info) ####################################################################### diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h new file mode 100644 index 000000000000..4292f7090333 --- /dev/null +++ b/numpy/core/src/common/npy_svml.h @@ -0,0 +1,41 @@ +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +extern __m512 __svml_exp2f16(__m512 x); +extern __m512 __svml_log2f16(__m512 x); +extern __m512 __svml_log10f16(__m512 x); +extern __m512 __svml_expm1f16(__m512 x); +extern __m512 __svml_log1pf16(__m512 x); +extern __m512 __svml_cbrtf16(__m512 x); +extern __m512 __svml_sinf16(__m512 x); +extern __m512 __svml_cosf16(__m512 x); +extern __m512 __svml_tanf16(__m512 x); +extern __m512 __svml_asinf16(__m512 x); +extern __m512 __svml_acosf16(__m512 x); +extern __m512 __svml_atanf16(__m512 x); +extern __m512 __svml_atan2f16(__m512 x); +extern __m512 __svml_sinhf16(__m512 x); +extern __m512 __svml_coshf16(__m512 x); +extern __m512 __svml_tanhf16(__m512 x); +extern __m512 __svml_asinhf16(__m512 x); +extern __m512 __svml_acoshf16(__m512 x); +extern __m512 __svml_atanhf16(__m512 x); + +extern __m512d __svml_exp28(__m512d x); +extern __m512d __svml_log28(__m512d x); +extern __m512d __svml_log108(__m512d x); +extern __m512d __svml_expm18(__m512d x); +extern __m512d __svml_log1p8(__m512d x); +extern __m512d __svml_cbrt8(__m512d x); +extern __m512d __svml_sin8(__m512d x); +extern __m512d __svml_cos8(__m512d x); +extern __m512d __svml_tan8(__m512d x); +extern __m512d __svml_asin8(__m512d x); +extern __m512d __svml_acos8(__m512d x); +extern __m512d __svml_atan8(__m512d x); +extern __m512d __svml_atan28(__m512d x); +extern __m512d __svml_sinh8(__m512d x); +extern __m512d __svml_cosh8(__m512d x); 
+extern __m512d __svml_tanh8(__m512d x); +extern __m512d __svml_asinh8(__m512d x); +extern __m512d __svml_acosh8(__m512d x); +extern __m512d __svml_atanh8(__m512d x); +#endif diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 02d749a5eb5b..0938cd050f58 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -210,6 +210,32 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + /**begin repeat * #TYPE = FLOAT, DOUBLE# */ diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src new file mode 100644 index 000000000000..8526046556a7 --- /dev/null +++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src @@ -0,0 +1,141 @@ +/*@targets + ** $maxopt baseline avx512_skx + */ +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "npy_svml.h" +#include "fast_loop_macros.h" + +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +/**begin repeat + * #sfx = f32, f64# + * #func_suffix = f16, 8# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh# + * #default_val = 0, 0, 1, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0# + */ +static void +simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, + npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_@sfx@ x; + #if @default_val@ + if (ssrc == 1) { + x = npyv_load_till_@sfx@(src, len, @default_val@); + } else { + x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@); + } + #else + if (ssrc == 1) { + x = npyv_load_tillz_@sfx@(src, len); + } else { + x = npyv_loadn_tillz_@sfx@(src, ssrc, len); + } + #endif + npyv_@sfx@ out = __svml_@func@@func_suffix@(x); + if (sdst == 1) { + npyv_store_till_@sfx@(dst, len, out); + } else { + npyv_storen_till_@sfx@(dst, sdst, len, out); + } + } + npyv_cleanup(); +} +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ +static void +simd_@func@_f64(const double *src, npy_intp ssrc, + double *dst, npy_intp sdst, npy_intp len) +{ + const int vstep = npyv_nlanes_f64; + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_f64 x; + if (ssrc == 1) { + x = npyv_load_tillz_f64(src, len); + } else { + x = npyv_loadn_tillz_f64(src, ssrc, len); + } + npyv_f64 out = __svml_@func@8(x); + if (sdst == 1) { + npyv_store_till_f64(dst, len, out); + } else { + npyv_storen_till_f64(dst, sdst, len, out); + } + } + npyv_cleanup(); +} +/**end repeat**/ +#endif + +/**begin repeat + * #TYPE = DOUBLE, FLOAT# + * #type = npy_double, npy_float# + * #vsub = , f# + * #sfx = f64, f32# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + * #intrin = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#if NPY_SIMD && 
defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) + const @type@ *src = (@type@*)args[0]; + @type@ *dst = (@type@*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + const npy_intp len = dimensions[0]; + assert(steps[0] % lsize == 0 && steps[1] % lsize == 0); + if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && + npyv_loadable_stride_@sfx@(ssrc) && + npyv_storable_stride_@sfx@(sdst)) { + simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len); + return; + } +#endif + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = npy_@intrin@@vsub@(in1); + } +} +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) + const double *src = (double*)args[0]; + double *dst = (double*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + const npy_intp len = dimensions[0]; + assert(steps[0] % lsize == 0 && steps[1] % lsize == 0); + if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && + npyv_loadable_stride_f64(ssrc) && + npyv_storable_stride_f64(sdst)) { + simd_@func@_f64(src, ssrc, dst, sdst, len); + return; + } +#endif + UNARY_LOOP { + const npy_double in1 = *(npy_double *)ip1; + *(npy_double *)op1 = npy_@func@(in1); + } +} +/**end repeat**/ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 654ab81cc370..d47be9a30fd3 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -116,9 +116,8 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in #endif return 0; } - - /**end repeat1**/ + /**end repeat**/ /**begin repeat @@ -1152,6 +1151,7 @@ NPY_FINLINE 
NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d * #is_finite = 0, 1, 0, 0# * #is_signbit = 0, 0, 0, 1# */ + #if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps) diff --git a/numpy/core/src/umath/svml b/numpy/core/src/umath/svml new file mode 160000 index 000000000000..9f8af767ed6c --- /dev/null +++ b/numpy/core/src/umath/svml @@ -0,0 +1 @@ +Subproject commit 9f8af767ed6c75455d9a382af829048f8dd18067 diff --git a/numpy/polynomial/tests/test_classes.py b/numpy/polynomial/tests/test_classes.py index 8e71a19459bc..6322062f29ec 100644 --- a/numpy/polynomial/tests/test_classes.py +++ b/numpy/polynomial/tests/test_classes.py @@ -597,4 +597,4 @@ def powx(x, p): for deg in range(0, 10): for t in range(0, deg + 1): p = Chebyshev.interpolate(powx, deg, domain=[0, 2], args=(t,)) - assert_almost_equal(p(x), powx(x, t), decimal=12) + assert_almost_equal(p(x), powx(x, t), decimal=11)