Skip to content


ENH: Add the __dlpack__ and __dlpack_device__ methods to ndarray.
Browse files Browse the repository at this point in the history
  • Loading branch information
hameerabbasi committed May 24, 2021
1 parent 3dcd924 commit 830903a
Show file tree
Hide file tree
Showing 3 changed files with 352 additions and 0 deletions.
6 changes: 6 additions & 0 deletions numpy/__init__.pyi
Expand Up @@ -1644,6 +1644,10 @@ _ArrayComplex_co = NDArray[Union[bool_, integer[Any], floating[Any], complexfloa
_ArrayNumber_co = NDArray[Union[bool_, number[Any]]]
_ArrayTD64_co = NDArray[Union[bool_, integer[Any], timedelta64]]

# `builtins.PyCapsule` unfortunately lacks annotations as of the moment;
# use `Any` as a stopgap measure
_PyCapsule = Any

class _SupportsItem(Protocol[_T_co]):
def item(self, __args: Any) -> _T_co: ...

Expand Down Expand Up @@ -2809,6 +2813,8 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
def __ior__(self: NDArray[object_], other: Any) -> NDArray[object_]: ...
def __ior__(self: NDArray[_ScalarType], other: _RecursiveSequence) -> NDArray[_ScalarType]: ...
# Export the array as a DLPack capsule (`_PyCapsule` is an alias of `Any`).
# `self` is restricted to numeric arrays, and `stream` is typed as `None`
# only: the C implementation raises RuntimeError for any other stream.
def __dlpack__(self: NDArray[number[Any]], *, stream: None = ...) -> _PyCapsule: ...
# Report where the array data lives. The C implementation always returns
# the pair `(1, 0)`: 1 is the DLPack `kDLCPU` device type, 0 the device id.
def __dlpack_device__(self) -> Tuple[L[1], L[0]]: ...

# Keep `dtype` at the bottom to avoid name conflicts with `np.dtype`
Expand Down
188 changes: 188 additions & 0 deletions numpy/core/include/numpy/dlpack/dlpack.h
@@ -0,0 +1,188 @@
/*!
 *  Copyright (c) 2017 by Contributors
 * \file dlpack.h
 * \brief The common header of DLPack.
 */

/*!
 * \brief Linkage specifier for declarations shared with C++:
 *  `extern "C"` under a C++ compiler, empty under a C compiler.
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*!
 * \brief The current version of dlpack
 *
 * NOTE: the leading zero makes this an octal literal in C, so the macro
 * evaluates to 40 rather than 50. The spelling is kept unchanged so that
 * comparisons against this macro keep their existing results.
 */
#define DLPACK_VERSION 050

/*!
 * \brief DLPACK_DLL prefix for windows
 *
 * On Windows the symbol is exported while building the library itself
 * (DLPACK_EXPORTS defined) and imported otherwise; elsewhere it is empty.
 */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
/*!
 * \brief The device type in DLDevice.
 *
 * The numeric values are part of the exchange format and must not change.
 */
typedef enum {
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Reserved extension device type,
   * used to quickly test extension devices.
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
} DLDeviceType;

* \brief A Device for Tensor and operator.
typedef struct {
/*! \brief The device type used in the device. */
DLDeviceType device_type;
/*! \brief The device index */
int device_id;
} DLDevice;

/*!
 * \brief The type code options DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
} DLDataTypeCode;

/*!
 * \brief The data type the tensor can hold.
 *
 *  Examples
 *   - float: type_code = 2, bits = 32, lanes=1
 *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
 *   - int8: type_code = 0, bits = 8, lanes=1
 *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of DLDataTypeCode enum values.
   */
  uint8_t code;
  /*!
   * \brief Number of bits, common choices are 8, 16, 32.
   * Being a uint8_t, this field cannot represent widths above 255 bits.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;

* \brief Plain C Tensor object, does not manage memory.
typedef struct {
* \brief The opaque data pointer points to the allocated data. This will be
* CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
* aligned to 256 bytes as in CUDA.
* For given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
* \code{.c}
* static inline size_t GetDataSize(const DLTensor* t) {
* size_t size = 1;
* for (tvm_index_t i = 0; i < t->ndim; ++i) {
* size *= t->shape[i];
* }
* size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
* return size;
* }
* \endcode
void* data;
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
int ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;

* \brief C Tensor object, manage memory of DLTensor. This data structure is
* intended to facilitate the borrowing of DLTensor by another framework. It is
* not meant to transfer the tensor. When the borrowing framework doesn't need
* the tensor, it should call the deleter to notify the host that the resource
* is no longer needed.
typedef struct DLManagedTensor {
/*! \brief DLTensor which is being memory managed */
DLTensor dl_tensor;
/*! \brief the context of the original host framework of DLManagedTensor in
* which DLManagedTensor is used in the framework. It can also be NULL.
void * manager_ctx;
/*! \brief Destructor signature void (*)(void*) - this should be called
* to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
* if there is no way for the caller to provide a reasonable destructor.
* The destructors deletes the argument self as well.
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
#ifdef __cplusplus
#endif // DLPACK_DLPACK_H_
158 changes: 158 additions & 0 deletions numpy/core/src/multiarray/methods.c
Expand Up @@ -30,6 +30,8 @@
#include "methods.h"
#include "alloc.h"

#include "numpy/dlpack/dlpack.h"

/* NpyArg_ParseKeywords
Expand Down Expand Up @@ -2694,6 +2696,152 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
return c;

#define NPY_DLPACK_CAPSULE_NAME "NumPy DLPack Wrapper"

/*
 * Destructor of the capsule created by array_dlpack.
 *
 * Fetches the DLManagedTensor stored in the capsule and hands it to its own
 * `deleter`, which releases the tensor metadata and the array reference.
 */
static void array_dlpack_capsule_deleter(PyObject *self)
{
    DLManagedTensor *managed =
        (DLManagedTensor *)PyCapsule_GetPointer(self, NPY_DLPACK_CAPSULE_NAME);

    if (managed == NULL) {
        /* A destructor cannot propagate an exception; report and clear it. */
        PyErr_WriteUnraisable(self);
        return;
    }
    /* The deleter is documented as optional (may be NULL). */
    if (managed->deleter != NULL) {
        managed->deleter(managed);
    }
}

/*
 * `DLManagedTensor.deleter` callback installed by array_dlpack.
 *
 * Frees the shape and strides buffers and the struct itself, then drops the
 * reference to the exporting array that array_dlpack took for `manager_ctx`.
 *
 * NOTE(review): Py_XDECREF requires the GIL; this assumes the deleter is only
 * invoked while the GIL is held (as it is from the capsule destructor) --
 * confirm for consumers that call it directly.
 */
static void array_dlpack_deleter(DLManagedTensor *self)
{
    PyArrayObject *array = (PyArrayObject *)self->manager_ctx;

    free(self->dl_tensor.shape);
    free(self->dl_tensor.strides);
    free(self);

    Py_XDECREF(array);
}

/*
 * Implementation of `ndarray.__dlpack__(*, stream=None)`.
 *
 * Wraps the array's buffer (no copy) in a DLManagedTensor and returns it
 * inside a PyCapsule named NPY_DLPACK_CAPSULE_NAME. The managed tensor keeps
 * a strong reference to `self` until its deleter runs.
 *
 * Returns a new reference to the capsule, or NULL with an exception set when
 *   - `stream` is not None,
 *   - the dtype is not a signed/unsigned integer, float or complex type,
 *   - the dtype is wider than the 8-bit `bits` field can describe,
 *   - a stride is not a whole multiple of the item size, or
 *   - memory allocation fails.
 *
 * NOTE(review): the exception classes used for the dtype, stride and
 * allocation errors were not visible in the excerpt this was restored from;
 * confirm them against the intended API.
 * NOTE(review): byte-swapped (non-native endian) arrays are not rejected
 * here -- verify whether consumers can interpret them.
 */
static PyObject *
array_dlpack(PyArrayObject *self,
        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
    PyObject *stream = Py_None;
    /* Declares the static cache that npy_parse_arguments relies on. */
    NPY_PREPARE_ARGPARSER;
    if (npy_parse_arguments("__dlpack__", args, len_args, kwnames,
            "$stream", NULL, &stream,
            NULL, NULL, NULL) < 0) {
        return NULL;
    }

    if (stream != Py_None) {
        PyErr_SetString(PyExc_RuntimeError, "NumPy only supports stream=None.");
        return NULL;
    }

    npy_intp itemsize = PyArray_ITEMSIZE(self);
    int ndim = PyArray_NDIM(self);
    npy_intp *strides = PyArray_STRIDES(self);
    npy_intp *shape = PyArray_SHAPE(self);

    /*
     * Classify the dtype first: the supported numeric dtypes all have a
     * non-zero item size, which makes the modulo/division below safe.
     */
    DLDataType managed_dtype;
    PyArray_Descr *dtype = PyArray_DESCR(self);

    if (PyDataType_ISSIGNED(dtype)) {
        managed_dtype.code = kDLInt;
    }
    else if (PyDataType_ISUNSIGNED(dtype)) {
        managed_dtype.code = kDLUInt;
    }
    else if (PyDataType_ISFLOAT(dtype)) {
        managed_dtype.code = kDLFloat;
    }
    else if (PyDataType_ISCOMPLEX(dtype)) {
        managed_dtype.code = kDLComplex;
    }
    else {
        PyErr_SetString(PyExc_TypeError,
                "DLPack only supports signed/unsigned integers, float and complex dtypes.");
        return NULL;
    }

    /* DLDataType.bits is a uint8_t; wider items would silently wrap. */
    if (itemsize <= 0 || itemsize > UINT8_MAX / 8) {
        PyErr_SetString(PyExc_TypeError,
                "DLPack cannot describe a dtype with this item size.");
        return NULL;
    }
    managed_dtype.bits = (uint8_t)(8 * itemsize);
    managed_dtype.lanes = 1;

    for (int i = 0; i < ndim; ++i) {
        if (strides[i] % itemsize != 0) {
            PyErr_SetString(PyExc_RuntimeError,
                    "DLPack only supports strides which are a multiple of itemsize.");
            return NULL;
        }
    }

    DLManagedTensor *managed = malloc(sizeof(DLManagedTensor));
    if (managed == NULL) {
        PyErr_SetString(PyExc_MemoryError,
                "Could not allocate the DLManagedTensor struct.");
        return NULL;
    }

    /* malloc(0) may legitimately return NULL, so reserve one slot for 0-d. */
    size_t n_alloc = ndim > 0 ? (size_t)ndim : 1;

    int64_t *managed_shape = malloc(sizeof(int64_t) * n_alloc);
    if (managed_shape == NULL) {
        free(managed);
        PyErr_SetString(PyExc_MemoryError,
                "Could not allocate the DLManagedTensor struct shape.");
        return NULL;
    }

    int64_t *managed_strides = malloc(sizeof(int64_t) * n_alloc);
    if (managed_strides == NULL) {
        free(managed_shape);
        free(managed);
        PyErr_SetString(PyExc_MemoryError,
                "Could not allocate the DLManagedTensor struct strides.");
        return NULL;
    }

    for (int i = 0; i < ndim; ++i) {
        managed_shape[i] = shape[i];
        /* NumPy strides are in bytes; DLPack strides count elements. */
        managed_strides[i] = strides[i] / itemsize;
    }

    managed->dl_tensor.data = PyArray_DATA(self);
    managed->dl_tensor.device.device_type = kDLCPU;
    managed->dl_tensor.device.device_id = 0;
    managed->dl_tensor.dtype = managed_dtype;
    managed->dl_tensor.ndim = ndim;
    managed->dl_tensor.shape = managed_shape;
    managed->dl_tensor.strides = managed_strides;
    managed->dl_tensor.byte_offset = 0;
    managed->manager_ctx = self;
    managed->deleter = array_dlpack_deleter;

    PyObject *capsule = PyCapsule_New(managed, NPY_DLPACK_CAPSULE_NAME,
            array_dlpack_capsule_deleter);
    if (capsule == NULL) {
        /* No capsule owns the buffers yet, so release them here. */
        free(managed_strides);
        free(managed_shape);
        free(managed);
        return NULL;
    }

    /* Keep the array alive for as long as the managed tensor exists. */
    Py_INCREF(self);
    return capsule;
}

/*
 * Implementation of `ndarray.__dlpack_device__()`.
 *
 * Returns a new `(device_type, device_id)` tuple. NumPy arrays are exported
 * as CPU tensors, so the result is always `(kDLCPU, 0)`, i.e. `(1, 0)`.
 * Returns NULL with an exception set if the tuple cannot be created.
 */
static PyObject *
array_dlpack_device(PyArrayObject *self, PyObject *NPY_UNUSED(args))
{
    return Py_BuildValue("ii", (int)kDLCPU, 0);
}

NPY_NO_EXPORT PyMethodDef array_methods[] = {

/* for subtypes */
Expand Down Expand Up @@ -2914,5 +3062,15 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {

// For data interchange between libraries


{NULL, NULL, 0, NULL} /* sentinel */

0 comments on commit 830903a

Please sign in to comment.