From b3c26b8b3aa90c829aec50ba170d14873ca5bde9 Mon Sep 17 00:00:00 2001
From: Aimilios Tsouvelekakis
Date: Thu, 2 Mar 2023 18:50:20 -0600
Subject: [PATCH] [SPARK-42647][PYTHON] Change alias for numpy deprecated and removed types

### Problem description

NumPy has started changing the aliases of some of its data types, so users on a recent NumPy release see either deprecation warnings or errors, depending on the type they use. This affects every user running numpy > 1.20.0. One of the types was already fixed back in September with this [pull request](https://github.com/apache/spark/pull/37817).

[numpy 1.24.0](https://github.com/numpy/numpy/pull/22607): The scalar type aliases ending in a 0 bit size: np.object0, np.str0, np.bytes0, np.void0, np.int0, np.uint0 as well as np.bool8 are now deprecated and will eventually be removed.

[numpy 1.20.0](https://github.com/numpy/numpy/pull/14882): Using the aliases of builtin types like np.int is deprecated.

### What changes were proposed in this pull request?

Since numpy 1.20.0 we receive a deprecation warning on np.object (https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations), and since numpy 1.24.0 we receive an attribute error:

```
attr = 'object'

    def __getattr__(attr):
        # Warn for expired attributes, and return a dummy function
        # that always raises an exception.
        import warnings
        try:
            msg = __expired_functions__[attr]
        except KeyError:
            pass
        else:
            warnings.warn(msg, DeprecationWarning, stacklevel=2)

            def _expired(*args, **kwds):
                raise RuntimeError(msg)

            return _expired

        # Emit warnings for deprecated attributes
        try:
            val, msg = __deprecated_attrs__[attr]
        except KeyError:
            pass
        else:
            warnings.warn(msg, DeprecationWarning, stacklevel=2)
            return val

        if attr in __future_scalars__:
            # And future warnings for those that will change, but also give
            # the AttributeError
            warnings.warn(
                f"In the future `np.{attr}` will be defined as the "
                "corresponding NumPy scalar.", FutureWarning, stacklevel=2)

        if attr in __former_attrs__:
>           raise AttributeError(__former_attrs__[attr])
E           AttributeError: module 'numpy' has no attribute 'object'.
E           `np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe.
E           The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
E           https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
```

Since numpy 1.24.0 we also receive a deprecation warning on np.object0, on every np.datatype0 alias, and on np.bool8:

```
>>> np.object0(123)
<stdin>:1: DeprecationWarning: `np.object0` is a deprecated alias for `np.object_`. `object` can be used instead. (Deprecated NumPy 1.24)
```

### Why are the changes needed?

The changes are needed so pyspark stays compatible with the latest numpy and avoids
- attribute errors on the data types deprecated in version 1.20.0: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
- deprecation warnings on the data types deprecated in version 1.24.0: https://numpy.org/devdocs/release/1.24.0-notes.html#deprecations

### Does this PR introduce _any_ user-facing change?

The change suppresses the deprecation warnings introduced in numpy 1.24.0 and the attribute error that numpy 1.24.0 raises for the aliases deprecated in numpy 1.20.0.
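For reference, here is a minimal sketch (my own illustration, not part of this patch) summarizing the alias replacements applied below. The mapping is assumed from the NumPy release notes linked above; the patch itself mostly substitutes the builtin types or drops the alias entirely. Every right-hand spelling works on numpy 1.20 through 1.24, while the left-hand aliases warn from 1.20 and raise AttributeError from 1.24:

```python
import numpy as np

# Deprecated alias -> a compatible replacement (assumed mapping, per the
# NumPy release notes linked above).
replacements = {
    "np.object": object,
    "np.object0": object,   # np.object_ is the scalar-type spelling
    "np.bool": np.bool_,    # or the builtin bool
    "np.int": int,          # or np.int64 when a fixed width is wanted
    "np.float": float,      # or np.float64
    "np.str": str,          # or np.str_
}

for old, new in replacements.items():
    # All of the replacements resolve regardless of the installed numpy version.
    print(f"{old:12s} -> {new!r}")
```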
### How was this patch tested?

I assume that the existing tests should catch this (see also the Extra questions section below). I found this to be a problem in my work's project, where our unit tests use the toPandas() function and end up converting to np.object. Attaching the run result of our test:

```
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/local/lib/python3.9/dist-packages//unit/spark_test.py:64: in run_testcase
    self.handler.compare_df(result, expected, config=self.compare_config)
/usr/local/lib/python3.9/dist-packages//spark_test_handler.py:38: in compare_df
    actual_pd = actual.toPandas().sort_values(by=sort_columns, ignore_index=True)
/usr/local/lib/python3.9/dist-packages/pyspark/sql/pandas/conversion.py:232: in toPandas
    corrected_dtypes[index] = np.object  # type: ignore[attr-defined]
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

attr = 'object'

    def __getattr__(attr):
        # Warn for expired attributes, and return a dummy function
        # that always raises an exception.
        import warnings
        try:
            msg = __expired_functions__[attr]
        except KeyError:
            pass
        else:
            warnings.warn(msg, DeprecationWarning, stacklevel=2)

            def _expired(*args, **kwds):
                raise RuntimeError(msg)

            return _expired

        # Emit warnings for deprecated attributes
        try:
            val, msg = __deprecated_attrs__[attr]
        except KeyError:
            pass
        else:
            warnings.warn(msg, DeprecationWarning, stacklevel=2)
            return val

        if attr in __future_scalars__:
            # And future warnings for those that will change, but also give
            # the AttributeError
            warnings.warn(
                f"In the future `np.{attr}` will be defined as the "
                "corresponding NumPy scalar.", FutureWarning, stacklevel=2)

        if attr in __former_attrs__:
>           raise AttributeError(__former_attrs__[attr])
E           AttributeError: module 'numpy' has no attribute 'object'.
E           `np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe.
E           The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
E           https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

/usr/local/lib/python3.9/dist-packages/numpy/__init__.py:305: AttributeError
```

Although I cannot provide that code, running the following in Python shows the problem:

```
>>> import numpy as np
>>> np.object0(123)
<stdin>:1: DeprecationWarning: `np.object0` is a deprecated alias for `np.object_`. `object` can be used instead. (Deprecated NumPy 1.24)
123
>>> np.object(123)
<stdin>:1: FutureWarning: In the future `np.object` will be defined as the corresponding NumPy scalar.
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.9/dist-packages/numpy/__init__.py", line 305, in __getattr__
    raise AttributeError(__former_attrs__[attr])
AttributeError: module 'numpy' has no attribute 'object'. `np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
```

I do not have a use case in my tests for np.object0, but I fixed it following the numpy suggestion.

### Supported Versions

I propose this fix to be included in pyspark 3.3 and onwards.

### JIRA

I know a JIRA ticket should be created; I sent an email and I am waiting for the answer so that I can document the case there as well.

### Extra questions

Grepping for np.bool and np.object shows that the tests also use them. Shall we change those as well? Data types ending in an underscore are, I think, not affected.

```
git grep np.object
python/pyspark/ml/functions.py:        return data.dtype == np.object_ and isinstance(data.iloc[0], (np.ndarray, list))
python/pyspark/ml/functions.py:        return any(data.dtypes == np.object_) and any(
python/pyspark/sql/tests/test_dataframe.py:        self.assertEqual(types[1], np.object)
python/pyspark/sql/tests/test_dataframe.py:        self.assertEqual(types[4], np.object)  # datetime.date
python/pyspark/sql/tests/test_dataframe.py:        self.assertEqual(types[1], np.object)
python/pyspark/sql/tests/test_dataframe.py:        self.assertEqual(types[6], np.object)
python/pyspark/sql/tests/test_dataframe.py:        self.assertEqual(types[7], np.object)

git grep np.bool
python/docs/source/user_guide/pandas_on_spark/types.rst:np.bool        BooleanType
python/pyspark/pandas/indexing.py:            isinstance(key, np.bool_) for key in cols_sel
python/pyspark/pandas/tests/test_typedef.py:                np.bool: (np.bool, BooleanType()),
python/pyspark/pandas/tests/test_typedef.py:                bool: (np.bool, BooleanType()),
python/pyspark/pandas/typedef/typehints.py:    elif tpe in (bool, np.bool_, "bool", "?"):
python/pyspark/sql/connect/expressions.py:            assert isinstance(value, (bool, np.bool_))
python/pyspark/sql/connect/expressions.py:        elif isinstance(value, np.bool_):
python/pyspark/sql/tests/test_dataframe.py:        self.assertEqual(types[2], np.bool)
python/pyspark/sql/tests/test_functions.py:            (np.bool_, [("true", "boolean")]),
```

If yes, concerning bool, which was already merged: should we fix it too?
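For context on the test changes in the diff below, here is a small standalone check (my own illustration, not taken from the patch) showing that the builtin object and bool compare equal to the pandas/NumPy dtypes, which is why the assertions can simply drop the removed np.object and np.bool aliases:

```python
import numpy as np
import pandas as pd

pdf = pd.DataFrame({"s": ["a", "b"], "flag": [True, False]})

# An object column's dtype is np.dtype("O"), which compares equal to the
# builtin `object`; a bool column's dtype compares equal to both the
# builtin `bool` and `np.bool_`.
assert pdf.dtypes["s"] == object
assert pdf.dtypes["flag"] == bool
assert pdf.dtypes["flag"] == np.bool_
print(pdf.dtypes)
```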
Closes #40220 from aimtsou/numpy-patch.

Authored-by: Aimilios Tsouvelekakis
Signed-off-by: Sean Owen
---
 .../docs/source/user_guide/pandas_on_spark/types.rst |  4 ----
 python/pyspark/pandas/groupby.py                     | 10 +++++-----
 python/pyspark/pandas/tests/indexes/test_base.py     |  2 --
 python/pyspark/pandas/tests/test_series.py           |  2 --
 python/pyspark/pandas/tests/test_typedef.py          |  6 +-----
 python/pyspark/pandas/typedef/typehints.py           | 12 ++++++------
 python/pyspark/sql/pandas/conversion.py              |  4 ++--
 python/pyspark/sql/tests/test_dataframe.py           | 12 ++++++------
 8 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/python/docs/source/user_guide/pandas_on_spark/types.rst b/python/docs/source/user_guide/pandas_on_spark/types.rst
index 5404448e1566d..a806410681da1 100644
--- a/python/docs/source/user_guide/pandas_on_spark/types.rst
+++ b/python/docs/source/user_guide/pandas_on_spark/types.rst
@@ -168,13 +168,9 @@ np.byte        ByteType
 np.int16       ShortType
 np.int32       IntegerType
 np.int64       LongType
-np.int         LongType
 np.float32     FloatType
-np.float       DoubleType
 np.float64     DoubleType
-np.str         StringType
 np.unicode\_   StringType
-np.bool        BooleanType
 np.datetime64  TimestampType
 np.ndarray     ArrayType(StringType())
 ============= =======================
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 7a81ede420118..92443b935e731 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -1923,7 +1923,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S

         In case of Series, it works as below.

-        >>> def plus_max(x) -> ps.Series[np.int]:
+        >>> def plus_max(x) -> ps.Series[int]:
         ...     return x + x.max()
         >>> df.B.groupby(df.A).apply(plus_max).sort_index()  # doctest: +SKIP
         0    6
@@ -1941,7 +1941,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S

         You can also return a scalar value as an aggregated value of the group:

-        >>> def plus_length(x) -> np.int:
+        >>> def plus_length(x) -> int:
         ...     return len(x)
         >>> df.B.groupby(df.A).apply(plus_length).sort_index()  # doctest: +SKIP
         0    1
@@ -1950,7 +1950,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S

         The extra arguments to the function can be passed as below.

-        >>> def calculation(x, y, z) -> np.int:
+        >>> def calculation(x, y, z) -> int:
         ...     return len(x) + y * z
         >>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index()  # doctest: +SKIP
         0    51
@@ -3077,7 +3077,7 @@ def transform(self, func: Callable[..., pd.Series], *args: Any, **kwargs: Any) -
         1  a string 2  a string 6
         2  a string 3  a string 5

-        >>> def plus_max(x) -> ps.Series[np.int]:
+        >>> def plus_max(x) -> ps.Series[int]:
         ...     return x + x.max()
         >>> g.transform(plus_max)  # doctest: +NORMALIZE_WHITESPACE
            B         C
@@ -3111,7 +3111,7 @@ def transform(self, func: Callable[..., pd.Series], *args: Any, **kwargs: Any) -

         You can also specify extra arguments to pass to the function.

-        >>> def calculation(x, y, z) -> ps.Series[np.int]:
+        >>> def calculation(x, y, z) -> ps.Series[int]:
         ...     return x + x.min() + y + z
         >>> g.transform(calculation, 5, z=20)  # doctest: +NORMALIZE_WHITESPACE
            B         C
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index 0e2c6409796a5..cc99b10a8e12d 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -2340,7 +2340,6 @@ def test_astype(self):
         psidx = ps.Index(pidx)

         self.assert_eq(psidx.astype(int), pidx.astype(int))
-        self.assert_eq(psidx.astype(np.int), pidx.astype(np.int))
         self.assert_eq(psidx.astype(np.int8), pidx.astype(np.int8))
         self.assert_eq(psidx.astype(np.int16), pidx.astype(np.int16))
         self.assert_eq(psidx.astype(np.int32), pidx.astype(np.int32))
@@ -2356,7 +2355,6 @@ def test_astype(self):
         self.assert_eq(psidx.astype("i"), pidx.astype("i"))
         self.assert_eq(psidx.astype("long"), pidx.astype("long"))
         self.assert_eq(psidx.astype("short"), pidx.astype("short"))
-        self.assert_eq(psidx.astype(np.float), pidx.astype(np.float))
         self.assert_eq(psidx.astype(np.float32), pidx.astype(np.float32))
         self.assert_eq(psidx.astype(np.float64), pidx.astype(np.float64))
         self.assert_eq(psidx.astype("float"), pidx.astype("float"))
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 21f3238e413bc..501da9e14d813 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -1576,7 +1576,6 @@ def _test_numeric_astype(self, pser):
         psser = ps.Series(pser)

         self.assert_eq(psser.astype(int), pser.astype(int))
-        self.assert_eq(psser.astype(np.int), pser.astype(np.int))
         self.assert_eq(psser.astype(np.int8), pser.astype(np.int8))
         self.assert_eq(psser.astype(np.int16), pser.astype(np.int16))
         self.assert_eq(psser.astype(np.int32), pser.astype(np.int32))
@@ -1592,7 +1591,6 @@ def _test_numeric_astype(self, pser):
         self.assert_eq(psser.astype("i"), pser.astype("i"))
         self.assert_eq(psser.astype("long"), pser.astype("long"))
         self.assert_eq(psser.astype("short"), pser.astype("short"))
-        self.assert_eq(psser.astype(np.float), pser.astype(np.float))
         self.assert_eq(psser.astype(np.float32), pser.astype(np.float32))
         self.assert_eq(psser.astype(np.float64), pser.astype(np.float64))
         self.assert_eq(psser.astype("float"), pser.astype("float"))
diff --git a/python/pyspark/pandas/tests/test_typedef.py b/python/pyspark/pandas/tests/test_typedef.py
index a5f2b2dc2b43d..27e230f974850 100644
--- a/python/pyspark/pandas/tests/test_typedef.py
+++ b/python/pyspark/pandas/tests/test_typedef.py
@@ -321,20 +321,16 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
             np.int16: (np.int16, ShortType()),
             np.int32: (np.int32, IntegerType()),
             np.int64: (np.int64, LongType()),
-            np.int: (np.int64, LongType()),
             int: (np.int64, LongType()),
             # floating
             np.float32: (np.float32, FloatType()),
-            np.float: (np.float64, DoubleType()),
             np.float64: (np.float64, DoubleType()),
             float: (np.float64, DoubleType()),
             # string
-            np.str: (np.unicode_, StringType()),
             np.unicode_: (np.unicode_, StringType()),
             str: (np.unicode_, StringType()),
             # bool
-            np.bool: (np.bool, BooleanType()),
-            bool: (np.bool, BooleanType()),
+            bool: (np.bool_, BooleanType()),
             # datetime
             np.datetime64: (np.datetime64, TimestampType()),
             datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 6d50caeba0161..dfb1bc6f9ba01 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -391,7 +391,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     LongType()

-    >>> def func() -> ps.DataFrame[np.float, str]:
+    >>> def func() -> ps.DataFrame[float, str]:
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -399,7 +399,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

-    >>> def func() -> ps.DataFrame[np.float]:
+    >>> def func() -> ps.DataFrame[float]:
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -423,7 +423,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     LongType()

-    >>> def func() -> 'ps.DataFrame[np.float, str]':
+    >>> def func() -> 'ps.DataFrame[float, str]':
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -431,7 +431,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

-    >>> def func() -> 'ps.DataFrame[np.float]':
+    >>> def func() -> 'ps.DataFrame[float]':
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -439,7 +439,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('c0', DoubleType(), True)])

-    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
+    >>> def func() -> ps.DataFrame['a': float, 'b': int]:
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -447,7 +447,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

-    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
+    >>> def func() -> "ps.DataFrame['a': float, 'b': int]":
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index 76f5c957fecac..de135e992eef7 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -182,7 +182,7 @@ def toPandas(self) -> "PandasDataFrameLike":
                         field.dataType
                     )
                     corrected_panda_types[tmp_column_names[index]] = (
-                        np.object0 if pandas_type is None else pandas_type
+                        object if pandas_type is None else pandas_type
                     )

                 pdf = pd.DataFrame(columns=tmp_column_names).astype(
@@ -232,7 +232,7 @@ def toPandas(self) -> "PandasDataFrameLike":
                 if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
                     corrected_dtypes[index] = np.float64
                 if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                    corrected_dtypes[index] = np.object  # type: ignore[attr-defined]
+                    corrected_dtypes[index] = object

             df = pd.DataFrame()
             for index, t in enumerate(corrected_dtypes):
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index e686fa9e929fd..bd2f1cb75b7af 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -1119,10 +1119,10 @@ def test_to_pandas(self):
         pdf = self._to_pandas()
         types = pdf.dtypes
         self.assertEqual(types[0], np.int32)
-        self.assertEqual(types[1], np.object)
-        self.assertEqual(types[2], np.bool)
+        self.assertEqual(types[1], object)
+        self.assertEqual(types[2], bool)
         self.assertEqual(types[3], np.float32)
-        self.assertEqual(types[4], np.object)  # datetime.date
+        self.assertEqual(types[4], object)  # datetime.date
         self.assertEqual(types[5], "datetime64[ns]")
         self.assertEqual(types[6], "datetime64[ns]")
         self.assertEqual(types[7], "timedelta64[ns]")
@@ -1181,7 +1181,7 @@ def test_to_pandas_avoid_astype(self):
         df = self.spark.createDataFrame(data, schema)
         types = df.toPandas().dtypes
         self.assertEqual(types[0], np.float64)  # doesn't convert to np.int32 due to NaN value.
-        self.assertEqual(types[1], np.object)
+        self.assertEqual(types[1], object)
         self.assertEqual(types[2], np.float64)

     @unittest.skipIf(not have_pandas, pandas_requirement_message)  # type: ignore
@@ -1242,8 +1242,8 @@ def test_to_pandas_from_null_dataframe(self):
         self.assertEqual(types[3], np.float64)
         self.assertEqual(types[4], np.float32)
         self.assertEqual(types[5], np.float64)
-        self.assertEqual(types[6], np.object)
-        self.assertEqual(types[7], np.object)
+        self.assertEqual(types[6], object)
+        self.assertEqual(types[7], object)
         self.assertTrue(np.can_cast(np.datetime64, types[8]))
         self.assertTrue(np.can_cast(np.datetime64, types[9]))
         self.assertTrue(np.can_cast(np.timedelta64, types[10]))
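To illustrate the conversion.py change above, here is a short usage sketch (my own addition, not part of the patch; it assumes a local PySpark build containing this fix, with pandas installed and the default non-Arrow conversion path, which is the code touched at line 232 above):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# A boolean column containing a null cannot stay bool-typed in pandas, so
# toPandas() falls back to the builtin `object` dtype instead of the removed
# np.object alias.
pdf = spark.createDataFrame([(True,), (None,)], "flag boolean").toPandas()
print(pdf.dtypes["flag"])  # object

spark.stop()
```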