Fix numpy.dtype hashing for numpy >= 1.20 #1136

Merged (21 commits) on Dec 11, 2020
Changes from 17 commits
6 changes: 6 additions & 0 deletions CHANGES.rst
@@ -4,6 +4,12 @@ Latest changes
In development
--------------

- Make `joblib.hash` and `joblib.Memory` caching system compatible with `numpy
>= 1.20.0`. In the future, users updating `joblib` to a release that
includes this fix will see their previous cache invalidated if it contains
references to `numpy` objects.
https://github.com/joblib/joblib/pull/1136

- Remove deprecated `check_pickle` argument in `delayed`.
https://github.com/joblib/joblib/pull/903

9 changes: 9 additions & 0 deletions doc/memory.rst
@@ -385,6 +385,15 @@ Gotchas
``self.method`` does not depend on ``self`` you can use
``self.method = memory.cache(self.method, ignore=['self'])``.

* **joblib cache entries may be invalidated after environment updates**.
Values returned by ``joblib.hash`` are not guaranteed to stay
constant across ``joblib`` versions. This means that **all** entries of a
``joblib.Memory`` cache can get invalidated when upgrading ``joblib``.
Invalidation can also happen when upgrading a third party library (such as
``numpy``): in such a case, only the cached function calls whose parameters
are constructs (or contain references to constructs) defined in the
upgraded library may be invalidated after the upgrade.
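
As an illustration only (not part of this patch), here is a minimal sketch of how such invalidation plays out: a ``joblib.Memory`` cache keys its entries on the hash of the call arguments, so if an upgrade changes that hash, the entry is recomputed. The cache directory and function below are hypothetical.

import numpy as np
from joblib import Memory, hash as joblib_hash

# Hypothetical cache location and cached function, for illustration only.
memory = Memory('/tmp/joblib_cache_demo', verbose=0)

@memory.cache
def total(values):
    return values.sum()

# The cache entry for this call is keyed on the joblib hash of the
# arguments; if an upgrade of joblib or numpy changes the value returned
# by joblib_hash for such an argument, the entry is recomputed.
print(joblib_hash(np.arange(10, dtype='f4')))
total(np.arange(10, dtype='f4'))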


Ignoring some arguments
-----------------------
31 changes: 18 additions & 13 deletions joblib/hashing.py
@@ -220,19 +220,24 @@ def save(self, obj):
# The object will be pickled by the pickler hashed at the end.
obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
elif isinstance(obj, self.np.dtype):
# Atomic dtype objects are interned by their default constructor:
# np.dtype('f8') is np.dtype('f8')
# This interning is not maintained by a
# pickle.loads + pickle.dumps cycle, because __reduce__
# uses copy=True in the dtype constructor. This
# non-deterministic behavior causes the internal memoizer
# of the hasher to generate different hash values
# depending on the history of the dtype object.
# To prevent the hash from being sensitive to this, we use
# .descr which is a full (and never interned) description of
# the array dtype according to the numpy doc.
klass = obj.__class__
obj = (klass, ('HASHED', obj.descr))
# numpy.dtype consistent hashing is tricky to get right. This comes
# from the fact that atomic np.dtype objects are interned:
# ``np.dtype('f4') is np.dtype('f4')``. The situation is
# complicated by the fact that this interning does not survive a
# simple pickle.dumps/pickle.loads round-trip:
# ``pickle.loads(pickle.dumps(np.dtype('f4'))) is not
# np.dtype('f4')``. Because pickle relies on memoization during
# pickling, it is easy to
# produce different hashes for seemingly identical objects, such as
# ``[np.dtype('f4'), np.dtype('f4')]``
# and ``[np.dtype('f4'), pickle.loads(pickle.dumps('f4'))]``.
# To prevent memoization from interfering with hashing, we isolate
# the serialization (and thus the pickle memo) of each dtype with a
# fresh ``pickle.dumps`` call that is unrelated to the current
# Hasher/Pickler.
self._hash.update("_HASHED_DTYPE".encode('utf-8'))
self._hash.update(pickle.dumps(obj))
return
Hasher.save(self, obj)


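For context (not part of the diff), a standalone sketch of the interning and memoization behaviour described in the comment above, using only the public ``pickle``, ``numpy`` and ``joblib`` APIs:

import pickle

import numpy as np
from joblib import hash as joblib_hash

dt = np.dtype('f4')
roundtripped = pickle.loads(pickle.dumps(dt))

# Atomic dtypes are interned, but a pickle round-trip returns a copy.
assert dt is np.dtype('f4')
assert roundtripped is not dt

# With a shared pickler, memoization turns the second occurrence of an
# already-seen object into a short back-reference, so the produced bytes
# depend on object identity, not only on value.
assert pickle.dumps([dt, dt]) != pickle.dumps([dt, roundtripped])

# Hashing each dtype with its own, fresh pickle.dumps call (as the fix
# above does) keeps joblib.hash insensitive to that identity history.
assert joblib_hash(dt) == joblib_hash(roundtripped)
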
161 changes: 110 additions & 51 deletions joblib/test/test_hashing.py
@@ -16,6 +16,7 @@
import itertools
import pickle
import random
from concurrent.futures import ProcessPoolExecutor
from decimal import Decimal
import pytest

@@ -326,14 +327,48 @@ def test_string():


@with_numpy
def test_dtype():
# Test that we obtain the same hash for objects owning several dtypes,
# whatever the past of these dtypes. Cater for cache invalidation with
# complex dtype
a = np.dtype([('f1', np.uint), ('f2', np.int32)])
b = a
c = pickle.loads(pickle.dumps(a))
assert hash([a, c]) == hash([a, b])
def test_numpy_dtype_pickling():
# numpy dtype hashing is tricky to get right: see #231, #239, #251, #1080,
# #1082, and explanatory comments inside
# ``joblib.hashing.NumpyHasher.save``.

# In this test, we make sure that the hashing of numpy dtypes is robust to
# object identity and object copies.

dt1 = np.dtype('f4')
dt2 = np.dtype('f4')

# simple dtypes objects are interned
assert dt1 is dt2
assert hash(dt1) == hash(dt2)

dt1_roundtripped = pickle.loads(pickle.dumps(dt1))
assert dt1 is not dt1_roundtripped
assert hash(dt1) == hash(dt1_roundtripped)

assert hash([dt1, dt1]) == hash([dt1_roundtripped, dt1_roundtripped])
assert hash([dt1, dt1]) == hash([dt1, dt1_roundtripped])

complex_dt1 = np.dtype(
[('name', np.str_, 16), ('grades', np.float64, (2,))]
)
complex_dt2 = np.dtype(
    [('name', np.str_, 16), ('grades', np.float64, (2,))]
)

# complex dtype objects are not interned
assert complex_dt1 is not complex_dt2
assert hash(complex_dt1) == hash(complex_dt2)

complex_dt1_roundtripped = pickle.loads(pickle.dumps(complex_dt1))
assert complex_dt1_roundtripped is not complex_dt1
assert hash(complex_dt1) == hash(complex_dt1_roundtripped)

assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1_roundtripped]
)
assert hash([complex_dt1, complex_dt1]) == hash(
[complex_dt1_roundtripped, complex_dt1]
)


@parametrize('to_hash,expected',
@@ -378,49 +413,73 @@ def test_0d_and_1d_array_hashing_is_different():

@with_numpy
def test_hashes_stay_the_same_with_numpy_objects():
# We want to make sure that hashes don't change with joblib
# version. For end users, that would mean that they have to
# regenerate their cache from scratch, which potentially means
# lengthy recomputations.
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.random.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for https://github.com/joblib/joblib/issues/308.
# Generated with joblib 0.9.4.
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]

# These expected results have been generated with joblib 0.9.0
expected_hashes = [
'10a6afc379ca2708acfbaef0ab676eab',
'988a7114f337f381393025911ebc823b',
'c6809f4b97e35f2fa0ee8d653cbd025c',
'b3ad17348e32728a7eb9cda1e7ede438',
'927b3e6b0b6a037e8e035bda134e0b05',
'108f6ee98e7db19ea2006ffd208f4bf1',
'bd48ccaaff28e16e6badee81041b7180'
]

for to_hash, expected in zip(to_hash_list, expected_hashes):
assert hash(to_hash) == expected
# Note: joblib used to test numpy objects hashing by comparing the produced
# hash of an object with some hard-coded target value to guarantee that
# hashing remains the same across joblib versions. However, since numpy
# 1.20 and joblib 1.0, joblib relies on potentially unstable implementation
# details of numpy to hash np.dtype objects, which makes the stability of
# hash values across different environments hard to guarantee and to test.
# As a result, hashing stability across joblib versions becomes best-effort
# only, and we only test the consistency within a single environment by
# making sure:
# - the hash of two copies of the same objects is the same
# - hashing some object in two different python processes produces the same
#   value. This should be viewed as a proxy for testing hash consistency
#   through time between Python sessions (provided the environment does not
#   change between sessions).

def create_objects_to_hash():
rng = np.random.RandomState(42)
# Being explicit about dtypes in order to avoid
# architecture-related differences. Also using 'f4' rather than
# 'f8' for float arrays because 'f8' arrays generated by
# rng.random.randn don't seem to be bit-identical on 32bit and
# 64bit machines.
to_hash_list = [
rng.randint(-1000, high=1000, size=50).astype('<i8'),
tuple(rng.randn(3).astype('<f4') for _ in range(5)),
[rng.randn(3).astype('<f4') for _ in range(5)],
{
-3333: rng.randn(3, 5).astype('<f4'),
0: [
rng.randint(10, size=20).astype('<i8'),
rng.randn(10).astype('<f4')
]
},
# Non regression cases for
# https://github.com/joblib/joblib/issues/308.
# Generated with joblib 0.9.4.
np.arange(100, dtype='<i8').reshape((10, 10)),
# Fortran contiguous array
np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
# Non contiguous array
np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
]
return to_hash_list

# Create two lists containing copies of the same objects. joblib.hash
# should return the same hash for to_hash_list_one[i] and
# to_hash_list_two[i]
to_hash_list_one = create_objects_to_hash()
to_hash_list_two = create_objects_to_hash()

e1 = ProcessPoolExecutor(max_workers=1)
e2 = ProcessPoolExecutor(max_workers=1)

try:
for obj_1, obj_2 in zip(to_hash_list_one, to_hash_list_two):
# testing consistency of hashes across python processes
hash_1 = e1.submit(hash, obj_1).result()
hash_2 = e2.submit(hash, obj_1).result()
assert hash_1 == hash_2

# testing consistency when hashing two copies of the same objects.
hash_3 = e1.submit(hash, obj_2).result()
assert hash_1 == hash_3

finally:
e1.shutdown()
e2.shutdown()


def test_hashing_pickling_error():