diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index ac37eb8ad40..2621cd2f141 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -1644,6 +1644,10 @@ _ArrayComplex_co = NDArray[Union[bool_, integer[Any], floating[Any], complexfloating[Any, Any]]]
 _ArrayNumber_co = NDArray[Union[bool_, number[Any]]]
 _ArrayTD64_co = NDArray[Union[bool_, integer[Any], timedelta64]]
 
+# `builtins.PyCapsule` unfortunately lacks annotations at the moment;
+# use `Any` as a stopgap measure
+_PyCapsule = Any
+
 class _SupportsItem(Protocol[_T_co]):
     def item(self, __args: Any) -> _T_co: ...
 
@@ -2809,6 +2813,8 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
     def __ior__(self: NDArray[object_], other: Any) -> NDArray[object_]: ...
     @overload
     def __ior__(self: NDArray[_ScalarType], other: _RecursiveSequence) -> NDArray[_ScalarType]: ...
+    def __dlpack__(self: NDArray[number[Any]], *, stream: None = ...) -> _PyCapsule: ...
+    def __dlpack_device__(self) -> Tuple[L[1], L[0]]: ...
 
     # Keep `dtype` at the bottom to avoid name conflicts with `np.dtype`
     @property
diff --git a/numpy/core/include/numpy/dlpack/dlpack.h b/numpy/core/include/numpy/dlpack/dlpack.h
new file mode 100644
index 00000000000..84afca24829
--- /dev/null
+++ b/numpy/core/include/numpy/dlpack/dlpack.h
@@ -0,0 +1,188 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 050
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type in DLDevice.
+ */
+typedef enum {
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Reserved extension device type,
+   * used to quickly test extension devices.
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*! \brief The device index */
+  int device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options of DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold.
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes=1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+ *   - int8: type_code = 0, bits = 8, lanes=1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The opaque data pointer points to the allocated data. This will be
+   * a CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
+   * aligned to 256 bytes as in CUDA.
+   *
+   * For a given DLTensor, the size of memory required to store the contents
+   * of data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int ndim;
+  /*! \brief The data type of the pointer */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   * can be NULL, indicating tensor is compact and row-major.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manages memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It
+ *  is not meant to transfer the tensor. When the borrowing framework doesn't
+ *  need the tensor, it should call the deleter to notify the host that the
+ *  resource is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework in which
+   *   DLManagedTensor is used. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*! \brief Destructor signature void (*)(void*) - this should be called
+   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+   *   if there is no way for the caller to provide a reasonable destructor.
+   *   The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 251e527a6b9..fa043ac3129 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -30,6 +30,8 @@
 #include "methods.h"
 #include "alloc.h"
 
+#include "numpy/dlpack/dlpack.h"
+
 
 /* NpyArg_ParseKeywords
  *
@@ -2694,6 +2696,152 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
     return c;
 }
 
+#define NPY_DLPACK_CAPSULE_NAME "NumPy DLPack Wrapper"
+
+static void
+array_dlpack_capsule_deleter(PyObject *self)
+{
+    DLManagedTensor *managed =
+        (DLManagedTensor *)PyCapsule_GetPointer(self, NPY_DLPACK_CAPSULE_NAME);
+    if (managed == NULL) {
+        /* Not our capsule; don't let an exception escape the deleter. */
+        PyErr_WriteUnraisable(self);
+        return;
+    }
+    managed->deleter(managed);
+}
+
+static void
+array_dlpack_deleter(DLManagedTensor *self)
+{
+    PyArrayObject *array = (PyArrayObject *)self->manager_ctx;
+    free(self->dl_tensor.shape);
+    free(self->dl_tensor.strides);
+    free(self);
+
+    /* Release the reference the capsule took on the exporting array. */
+    Py_XDECREF(array);
+}
+
+static PyObject *
+array_dlpack(PyArrayObject *self,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    PyObject *stream = Py_None;
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("__dlpack__", args, len_args, kwnames,
+            "$stream", NULL, &stream,
+            NULL, NULL, NULL) < 0) {
+        return NULL;
+    }
+
+    if (stream != Py_None) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "NumPy only supports stream=None.");
+        return NULL;
+    }
+
+    npy_intp itemsize = PyArray_ITEMSIZE(self);
+    int ndim = PyArray_NDIM(self);
+    npy_intp *strides = PyArray_STRIDES(self);
+    npy_intp *shape = PyArray_SHAPE(self);
+
+    for (int i = 0; i < ndim; ++i) {
+        if (strides[i] % itemsize != 0) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "DLPack only supports strides which are a multiple "
+                    "of itemsize.");
+            return NULL;
+        }
+    }
+
+    DLDataType managed_dtype;
+    PyArray_Descr *dtype = PyArray_DESCR(self);
+
+    managed_dtype.bits = 8 * itemsize;
+    managed_dtype.lanes = 1;
+    if (PyDataType_ISSIGNED(dtype)) {
+        managed_dtype.code = kDLInt;
+    }
+    else if (PyDataType_ISUNSIGNED(dtype)) {
+        managed_dtype.code = kDLUInt;
+    }
+    else if (PyDataType_ISFLOAT(dtype)) {
+        managed_dtype.code = kDLFloat;
+    }
+    else if (PyDataType_ISCOMPLEX(dtype)) {
+        managed_dtype.code = kDLComplex;
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+                "DLPack only supports signed/unsigned integers, float "
+                "and complex dtypes.");
+        return NULL;
+    }
+
+    DLManagedTensor *managed = malloc(sizeof(DLManagedTensor));
+    if (managed == NULL) {
+        PyErr_SetString(PyExc_MemoryError,
+                "Could not allocate the DLManagedTensor struct.");
+        return NULL;
+    }
+
+    managed->dl_tensor.data = PyArray_DATA(self);
+    managed->dl_tensor.device.device_type = kDLCPU;
+    managed->dl_tensor.device.device_id = 0;
+    managed->dl_tensor.dtype = managed_dtype;
+
+    int64_t *managed_shape = malloc(sizeof(int64_t) * ndim);
+    if (managed_shape == NULL) {
+        PyErr_SetString(PyExc_MemoryError,
+                "Could not allocate the DLManagedTensor shape.");
+        free(managed);
+        return NULL;
+    }
+
+    int64_t *managed_strides = malloc(sizeof(int64_t) * ndim);
+    if (managed_strides == NULL) {
+        PyErr_SetString(PyExc_MemoryError,
+                "Could not allocate the DLManagedTensor strides.");
+        free(managed);
+        free(managed_shape);
+        return NULL;
+    }
+
+    for (int i = 0; i < ndim; ++i) {
+        managed_shape[i] = shape[i];
+        /* DLPack strides are in elements; NumPy strides are in bytes. */
+        managed_strides[i] = strides[i] / itemsize;
+    }
+
+    managed->dl_tensor.ndim = ndim;
+    managed->dl_tensor.shape = managed_shape;
+    managed->dl_tensor.strides = managed_strides;
+    managed->dl_tensor.byte_offset = 0;
+    managed->manager_ctx = self;
+    managed->deleter = array_dlpack_deleter;
+
+    PyObject *capsule = PyCapsule_New(managed, NPY_DLPACK_CAPSULE_NAME,
+            array_dlpack_capsule_deleter);
+    if (capsule != NULL) {
+        /* The capsule keeps the exporting array alive until it is deleted. */
+        Py_INCREF(self);
+    }
+    else {
+        free(managed);
+        free(managed_shape);
+        free(managed_strides);
+    }
+    return capsule;
+}
+
+static PyObject *
+array_dlpack_device(PyArrayObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    /* NumPy arrays always live on the CPU: (kDLCPU, 0). */
+    return Py_BuildValue("ii", 1, 0);
+}
+
 
 NPY_NO_EXPORT PyMethodDef array_methods[] = {
 
     /* for subtypes */
@@ -2914,5 +3062,15 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
     {"view",
         (PyCFunction)array_view,
         METH_FASTCALL | METH_KEYWORDS, NULL},
+
+    // For data interchange between libraries
+    {"__dlpack__",
+        (PyCFunction)array_dlpack,
+        METH_FASTCALL | METH_KEYWORDS, NULL},
+
+    {"__dlpack_device__",
+        (PyCFunction)array_dlpack_device,
+        METH_NOARGS, NULL},
+
     {NULL, NULL, 0, NULL}           /* sentinel */
 };
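
For context, below is a minimal consumer-side sketch (not part of the patch) of how another extension could unwrap the capsule that __dlpack__ returns. The helper name inspect_dlpack_capsule is hypothetical; only the capsule name must match NPY_DLPACK_CAPSULE_NAME above. Note that in this version the capsule destructor itself invokes managed->deleter, so a consumer just keeps the capsule alive while using the data and must not call the deleter again.

#include <Python.h>
#include <stdio.h>
#include "numpy/dlpack/dlpack.h"

/* Hypothetical consumer-side helper: unwrap and inspect the
 * DLManagedTensor carried by the capsule from ndarray.__dlpack__(). */
static int
inspect_dlpack_capsule(PyObject *capsule)
{
    DLManagedTensor *managed = (DLManagedTensor *)PyCapsule_GetPointer(
            capsule, "NumPy DLPack Wrapper");
    if (managed == NULL) {
        return -1;  /* wrong capsule name; an exception is set */
    }

    DLTensor *t = &managed->dl_tensor;
    printf("ndim=%d code=%u bits=%u lanes=%u\n",
           t->ndim, (unsigned)t->dtype.code,
           (unsigned)t->dtype.bits, (unsigned)t->dtype.lanes);
    for (int i = 0; i < t->ndim; ++i) {
        printf("shape[%d]=%lld\n", i, (long long)t->shape[i]);
    }
    /* Do not call managed->deleter here: the capsule's destructor
     * (array_dlpack_capsule_deleter) does so when the capsule dies. */
    return 0;
}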