Fix numpy.dtype hashing for numpy >= 1.20 #1136

Merged 21 commits on Dec 11, 2020
9 changes: 9 additions & 0 deletions CHANGES.rst
@@ -4,6 +4,15 @@ Latest changes
In development
--------------

- Make `joblib.hash` and the `joblib.Memory` caching system compatible with
`numpy >= 1.20.0`. Also make it explicit in the documentation that users
should now expect their `joblib.Memory` cache to be invalidated when either
`joblib` or a third party library involved in the definition of the cached
values is upgraded. In particular, users updating `joblib` to a release that
includes this fix will see their previous cache invalidated if it contains
references to `numpy` objects.
https://github.com/joblib/joblib/pull/1136

- Remove deprecated `check_pickle` argument in `delayed`.
https://github.com/joblib/joblib/pull/903

9 changes: 9 additions & 0 deletions doc/memory.rst
@@ -385,6 +385,15 @@ Gotchas
``self.method`` does not depend on ``self`` you can use
``self.method = memory.cache(self.method, ignore=['self'])``.

* **joblib cache entries may be invalidated after environment updates**.
Values returned by ``joblib.hash`` are not guaranteed to stay
constant across ``joblib`` versions. This means that **all** entries of a
``joblib.Memory`` cache can get invalidated when upgrading ``joblib``.
Invalidation can also happen when upgrading a third party library (such as
``numpy``): in such a case, only the cached function calls with parameters
that are constructs (or contain references to constructs) defined in the
upgraded library should potentially be invalidated after the upgrade, as
illustrated in the sketch below.
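
For instance, a quick way to check whether an environment change affects the
cache keys is to record the value of ``joblib.hash`` for a representative
argument before and after the upgrade (a minimal sketch, not a documented
recipe, assuming ``numpy`` is installed)::

    import numpy as np
    from joblib import hash

    # joblib derives Memory cache keys from the hash of the call arguments.
    # If this value differs between two environments, cache entries keyed on
    # such arguments are not shared and the results will be recomputed.
    print(hash(np.arange(3, dtype='<i8')))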


Ignoring some arguments
-----------------------
31 changes: 18 additions & 13 deletions joblib/hashing.py
@@ -220,19 +220,24 @@ def save(self, obj):
# The object will be pickled by the pickler hashed at the end.
obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
elif isinstance(obj, self.np.dtype):
# Atomic dtype objects are interned by their default constructor:
# np.dtype('f8') is np.dtype('f8')
# This interning is not maintained by a
# pickle.loads + pickle.dumps cycle, because __reduce__
# uses copy=True in the dtype constructor. This
# non-deterministic behavior causes the internal memoizer
# of the hasher to generate different hash values
# depending on the history of the dtype object.
# To prevent the hash from being sensitive to this, we use
# .descr which is a full (and never interned) description of
# the array dtype according to the numpy doc.
klass = obj.__class__
obj = (klass, ('HASHED', obj.descr))
# numpy.dtype consistent hashing is tricky to get right. This comes
# from the fact that atomic np.dtype objects are interned:
# ``np.dtype('f4') is np.dtype('f4')``. The situation is complicated
# by the fact that this interning does not survive a simple
# pickle.dumps/pickle.loads roundtrip:
# ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not np.dtype('f4')``.
# Because pickle relies on memoization during pickling, it is easy to
# produce different hashes for seemingly identical objects, such as
# ``[np.dtype('f4'), np.dtype('f4')]`` and
# ``[np.dtype('f4'), pickle.loads(pickle.dumps(np.dtype('f4')))]``.
# To prevent memoization from interfering with hashing, we isolate the
# serialization (and thus the pickle memoization) of each dtype by
# using a fresh ``pickle.dumps`` call, unrelated to the current Hasher
# instance, for each dtype.
self._hash.update("_HASHED_DTYPE".encode('utf-8'))
self._hash.update(pickle.dumps(obj))
return
Hasher.save(self, obj)


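As a side note for reviewers, the interning and pickle-memoization behavior
described in the comment above can be reproduced with a standalone snippet (a
minimal sketch, not part of this PR; it assumes numpy's atomic dtype interning
and CPython's pickle memoization, as the comment does):

import pickle
import numpy as np

# Atomic dtypes are interned: constructing 'f4' twice yields the same object.
dt_a = np.dtype('f4')
dt_b = np.dtype('f4')
assert dt_a is dt_b

# A pickle roundtrip returns an equal but distinct dtype object, because
# dtype.__reduce__ rebuilds the dtype with copy=True.
dt_copy = pickle.loads(pickle.dumps(dt_a))
assert dt_copy == dt_a and dt_copy is not dt_a

# pickle memoizes repeated references to the same object, so two lists that
# compare equal can serialize to different byte streams, and hence produce
# different digests when a hasher feeds the pickled bytes to a hash function.
assert [dt_a, dt_a] == [dt_a, dt_copy]
assert pickle.dumps([dt_a, dt_a]) != pickle.dumps([dt_a, dt_copy])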
160 changes: 109 additions & 51 deletions joblib/test/test_hashing.py
@@ -16,6 +16,7 @@
import itertools
import pickle
import random
from concurrent.futures import ProcessPoolExecutor
from decimal import Decimal
import pytest

@@ -326,14 +327,48 @@ def test_string():


@with_numpy
def test_dtype():
# Test that we obtain the same hash for objects owning several dtypes,
# whatever the past of these dtypes. Cater for cache invalidation with
# complex dtypes.
a = np.dtype([('f1', np.uint), ('f2', np.int32)])
b = a
c = pickle.loads(pickle.dumps(a))
assert hash([a, c]) == hash([a, b])
def test_numpy_dtype_pickling():
# numpy dtype hashing is tricky to get right: see #231, #239, #251, #1080,
# #1082, and the explanatory comments inside
# ``joblib.hashing.NumpyHasher.save``.

# In this test, we make sure that the hashing of numpy dtypes is robust to
# object identity and object copies (such as pickle roundtrips).

dt1 = np.dtype('f4')
dt2 = np.dtype('f4')

# simple dtype objects are interned
assert dt1 is dt2
assert hash(dt1) == hash(dt2)

dt1_roundtripped = pickle.loads(pickle.dumps(dt1))
assert dt1 is not dt1_roundtripped
assert hash(dt1) == hash(dt1_roundtripped)

assert hash([dt1, dt1]) == hash([dt1_roundtripped, dt1_roundtripped])
assert hash([dt1, dt1]) == hash([dt1, dt1_roundtripped])

complex_dt1 = np.dtype(
[('name', np.str_, 16), ('grades', np.float64, (2,))]
)
complex_dt2 = np.dtype(
[('name', np.str_, 16), ('grades', np.float64, (2,))]
)

# complex dtype objects are not interned
assert complex_dt1 is not complex_dt2
assert hash(complex_dt1) == hash(complex_dt2)

complex_dt1_roundtripped = pickle.loads(pickle.dumps(complex_dt1))
assert complex_dt1_roundtripped is not complex_dt1
assert hash(complex_dt1) == hash(complex_dt1_roundtripped)

assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1_roundtripped]
)
assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1]
)


@parametrize('to_hash,expected',
@@ -378,49 +413,72 @@ def test_0d_and_1d_array_hashing_is_different():

@with_numpy
def test_hashes_stay_the_same_with_numpy_objects():
# We want to make sure that hashes don't change with joblib
# version. For end users, that would mean that they have to
# regenerate their cache from scratch, which potentially means
# lengthy recomputations.
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.random.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for https://github.com/joblib/joblib/issues/308.
# Generated with joblib 0.9.4.
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]

# These expected results have been generated with joblib 0.9.0
expected_hashes = [
'10a6afc379ca2708acfbaef0ab676eab',
'988a7114f337f381393025911ebc823b',
'c6809f4b97e35f2fa0ee8d653cbd025c',
'b3ad17348e32728a7eb9cda1e7ede438',
'927b3e6b0b6a037e8e035bda134e0b05',
'108f6ee98e7db19ea2006ffd208f4bf1',
'bd48ccaaff28e16e6badee81041b7180'
]

for to_hash, expected in zip(to_hash_list, expected_hashes):
assert hash(to_hash) == expected
# Note: joblib used to test the hashing of numpy objects by comparing the
# produced hash of an object with a hard-coded target value, to guarantee
# that hashing remains the same across joblib versions. However, since numpy
# 1.20 and joblib 1.0, joblib relies on potentially unstable implementation
# details of numpy to hash np.dtype objects, which makes the stability of
# hash values across different environments hard to guarantee and to test.
# As a result, hashing stability across joblib versions becomes best-effort
# only, and we only test the consistency within a single environment by
# making sure that:
# - the hash of two copies of the same object is the same;
# - hashing an object in two different python processes produces the same
# value. This should be viewed as a proxy for testing hash consistency
# through time between Python sessions (provided no change in the
# environment happened between sessions).

def create_objects_to_hash():
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.random.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for
# https://github.com/joblib/joblib/issues/308
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]
return to_hash_list

# Create two lists containing copies of the same objects. joblib.hash
# should return the same hash for to_hash_list_one[i] and
# to_hash_list_two[i]
to_hash_list_one = create_objects_to_hash()
to_hash_list_two = create_objects_to_hash()

e1 = ProcessPoolExecutor(max_workers=1)
e2 = ProcessPoolExecutor(max_workers=1)

try:
for obj_1, obj_2 in zip(to_hash_list_one, to_hash_list_two):
# testing consistency of hashes across python processes
hash_1 = e1.submit(hash, obj_1).result()
hash_2 = e2.submit(hash, obj_1).result()
assert hash_1 == hash_2

# testing consistency when hashing two copies of the same objects.
hash_3 = e1.submit(hash, obj_2).result()
assert hash_1 == hash_3

finally:
e1.shutdown()
e2.shutdown()


def test_hashing_pickling_error():