diff --git a/python/docs/source/user_guide/pandas_on_spark/types.rst b/python/docs/source/user_guide/pandas_on_spark/types.rst
index 5404448e1566d..a806410681da1 100644
--- a/python/docs/source/user_guide/pandas_on_spark/types.rst
+++ b/python/docs/source/user_guide/pandas_on_spark/types.rst
@@ -168,13 +168,9 @@ np.byte        ByteType
 np.int16       ShortType
 np.int32       IntegerType
 np.int64       LongType
-np.int         LongType
 np.float32     FloatType
-np.float       DoubleType
 np.float64     DoubleType
-np.str         StringType
 np.unicode\_   StringType
-np.bool        BooleanType
 np.datetime64  TimestampType
 np.ndarray     ArrayType(StringType())
 =============  =======================
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 7a81ede420118..92443b935e731 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -1923,7 +1923,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
 
         In case of Series, it works as below.
 
-        >>> def plus_max(x) -> ps.Series[np.int]:
+        >>> def plus_max(x) -> ps.Series[int]:
         ...     return x + x.max()
         >>> df.B.groupby(df.A).apply(plus_max).sort_index()  # doctest: +SKIP
         0    6
@@ -1941,7 +1941,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
 
         You can also return a scalar value as an aggregated value of the group:
 
-        >>> def plus_length(x) -> np.int:
+        >>> def plus_length(x) -> int:
         ...     return len(x)
         >>> df.B.groupby(df.A).apply(plus_length).sort_index()  # doctest: +SKIP
         0    1
@@ -1950,7 +1950,7 @@ def apply(self, func: Callable, *args: Any, **kwargs: Any) -> Union[DataFrame, S
 
         The extra arguments to the function can be passed as below.
 
-        >>> def calculation(x, y, z) -> np.int:
+        >>> def calculation(x, y, z) -> int:
         ...     return len(x) + y * z
         >>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index()  # doctest: +SKIP
         0    51
@@ -3077,7 +3077,7 @@ def transform(self, func: Callable[..., pd.Series], *args: Any, **kwargs: Any) -
         1  a string 2  a string 6
         2  a string 3  a string 5
 
-        >>> def plus_max(x) -> ps.Series[np.int]:
+        >>> def plus_max(x) -> ps.Series[int]:
         ...     return x + x.max()
         >>> g.transform(plus_max)  # doctest: +NORMALIZE_WHITESPACE
            B  C
@@ -3111,7 +3111,7 @@ def transform(self, func: Callable[..., pd.Series], *args: Any, **kwargs: Any) -
 
         You can also specify extra arguments to pass to the function.
 
-        >>> def calculation(x, y, z) -> ps.Series[np.int]:
+        >>> def calculation(x, y, z) -> ps.Series[int]:
         ...     return x + x.min() + y + z
         >>> g.transform(calculation, 5, z=20)  # doctest: +NORMALIZE_WHITESPACE
            B   C
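Note (editorial context, not part of the patch): `np.int`, `np.float`, `np.str`, `np.bool`, and `np.object` were plain aliases for the Python builtins. They were deprecated in NumPy 1.20 and removed in NumPy 1.24, which is why the builtins are drop-in replacements throughout this diff and the type mappings in the table above are unchanged. A minimal sketch of the equivalence, assuming a 64-bit Linux/macOS platform where the builtin `int` maps to `int64`:

```python
import numpy as np

# The removed aliases were the builtins themselves, so the dtypes are unchanged.
assert np.dtype(float) == np.float64  # float has always meant float64
assert np.dtype(bool) == np.bool_     # builtin bool -> NumPy boolean dtype
assert np.dtype(int) == np.int64      # platform-dependent; int32 on some Windows builds

# On NumPy >= 1.24 the old aliases no longer exist at all:
try:
    np.int
except AttributeError as exc:
    print(exc)  # module 'numpy' has no attribute 'int'
```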
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index 0e2c6409796a5..cc99b10a8e12d 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -2340,7 +2340,6 @@ def test_astype(self):
         psidx = ps.Index(pidx)
 
         self.assert_eq(psidx.astype(int), pidx.astype(int))
-        self.assert_eq(psidx.astype(np.int), pidx.astype(np.int))
         self.assert_eq(psidx.astype(np.int8), pidx.astype(np.int8))
         self.assert_eq(psidx.astype(np.int16), pidx.astype(np.int16))
         self.assert_eq(psidx.astype(np.int32), pidx.astype(np.int32))
@@ -2356,7 +2355,6 @@ def test_astype(self):
         self.assert_eq(psidx.astype("i"), pidx.astype("i"))
         self.assert_eq(psidx.astype("long"), pidx.astype("long"))
         self.assert_eq(psidx.astype("short"), pidx.astype("short"))
-        self.assert_eq(psidx.astype(np.float), pidx.astype(np.float))
         self.assert_eq(psidx.astype(np.float32), pidx.astype(np.float32))
         self.assert_eq(psidx.astype(np.float64), pidx.astype(np.float64))
         self.assert_eq(psidx.astype("float"), pidx.astype("float"))
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 21f3238e413bc..501da9e14d813 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -1576,7 +1576,6 @@ def _test_numeric_astype(self, pser):
         psser = ps.Series(pser)
 
         self.assert_eq(psser.astype(int), pser.astype(int))
-        self.assert_eq(psser.astype(np.int), pser.astype(np.int))
         self.assert_eq(psser.astype(np.int8), pser.astype(np.int8))
         self.assert_eq(psser.astype(np.int16), pser.astype(np.int16))
         self.assert_eq(psser.astype(np.int32), pser.astype(np.int32))
@@ -1592,7 +1591,6 @@ def _test_numeric_astype(self, pser):
         self.assert_eq(psser.astype("i"), pser.astype("i"))
         self.assert_eq(psser.astype("long"), pser.astype("long"))
         self.assert_eq(psser.astype("short"), pser.astype("short"))
-        self.assert_eq(psser.astype(np.float), pser.astype(np.float))
         self.assert_eq(psser.astype(np.float32), pser.astype(np.float32))
         self.assert_eq(psser.astype(np.float64), pser.astype(np.float64))
         self.assert_eq(psser.astype("float"), pser.astype("float"))
diff --git a/python/pyspark/pandas/tests/test_typedef.py b/python/pyspark/pandas/tests/test_typedef.py
index a5f2b2dc2b43d..27e230f974850 100644
--- a/python/pyspark/pandas/tests/test_typedef.py
+++ b/python/pyspark/pandas/tests/test_typedef.py
@@ -321,20 +321,16 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
             np.int16: (np.int16, ShortType()),
             np.int32: (np.int32, IntegerType()),
             np.int64: (np.int64, LongType()),
-            np.int: (np.int64, LongType()),
             int: (np.int64, LongType()),
             # floating
             np.float32: (np.float32, FloatType()),
-            np.float: (np.float64, DoubleType()),
             np.float64: (np.float64, DoubleType()),
             float: (np.float64, DoubleType()),
             # string
-            np.str: (np.unicode_, StringType()),
             np.unicode_: (np.unicode_, StringType()),
             str: (np.unicode_, StringType()),
             # bool
-            np.bool: (np.bool, BooleanType()),
-            bool: (np.bool, BooleanType()),
+            bool: (np.bool_, BooleanType()),
             # datetime
             np.datetime64: (np.datetime64, TimestampType()),
             datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
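One nuance in the typedef test above: the expected NumPy dtype for the builtin `bool` changes from the removed alias `np.bool` to the real scalar type `np.bool_`. The two spellings describe the same dtype, as this small check (illustrative only, not from the patch) shows:

```python
import numpy as np

# np.bool was merely an alias for the builtin bool, while np.bool_ is
# NumPy's actual boolean scalar type. Both resolve to the same dtype.
assert np.dtype(bool) == np.dtype(np.bool_)
assert np.array([True]).dtype == np.bool_
assert isinstance(np.array([True]).item(0), bool)  # .item() returns a Python bool
```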
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 6d50caeba0161..dfb1bc6f9ba01 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -391,7 +391,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     LongType()
 
-    >>> def func() -> ps.DataFrame[np.float, str]:
+    >>> def func() -> ps.DataFrame[float, str]:
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -399,7 +399,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])
 
-    >>> def func() -> ps.DataFrame[np.float]:
+    >>> def func() -> ps.DataFrame[float]:
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -423,7 +423,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     LongType()
 
-    >>> def func() -> 'ps.DataFrame[np.float, str]':
+    >>> def func() -> 'ps.DataFrame[float, str]':
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -431,7 +431,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])
 
-    >>> def func() -> 'ps.DataFrame[np.float]':
+    >>> def func() -> 'ps.DataFrame[float]':
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -439,7 +439,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('c0', DoubleType(), True)])
 
-    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
+    >>> def func() -> ps.DataFrame['a': float, 'b': int]:
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
@@ -447,7 +447,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type
     StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])
 
-    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
+    >>> def func() -> "ps.DataFrame['a': float, 'b': int]":
     ...     pass
     >>> inferred = infer_return_type(func)
     >>> inferred.dtypes
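The updated doctests can be exercised directly. A usage sketch mirroring them (illustrative, not part of the patch), showing that builtin scalars in return-type hints infer the same Spark types the removed aliases did:

```python
import pyspark.pandas as ps
from pyspark.pandas.typedef.typehints import infer_return_type

def func() -> ps.DataFrame[float, str]:
    pass

inferred = infer_return_type(func)
print(inferred.dtypes)      # [dtype('float64'), dtype('<U')]
print(inferred.spark_type)  # StructType([StructField('c0', DoubleType(), True),
                            #             StructField('c1', StringType(), True)])
```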
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index 76f5c957fecac..de135e992eef7 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -182,7 +182,7 @@ def toPandas(self) -> "PandasDataFrameLike":
                         field.dataType
                     )
                     corrected_panda_types[tmp_column_names[index]] = (
-                        np.object0 if pandas_type is None else pandas_type
+                        object if pandas_type is None else pandas_type
                     )
 
                 pdf = pd.DataFrame(columns=tmp_column_names).astype(
@@ -232,7 +232,7 @@ def toPandas(self) -> "PandasDataFrameLike":
                 if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
                     corrected_dtypes[index] = np.float64
                 if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-                    corrected_dtypes[index] = np.object  # type: ignore[attr-defined]
+                    corrected_dtypes[index] = object
 
             df = pd.DataFrame()
             for index, t in enumerate(corrected_dtypes):
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index e686fa9e929fd..bd2f1cb75b7af 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -1119,10 +1119,10 @@ def test_to_pandas(self):
         pdf = self._to_pandas()
         types = pdf.dtypes
         self.assertEqual(types[0], np.int32)
-        self.assertEqual(types[1], np.object)
-        self.assertEqual(types[2], np.bool)
+        self.assertEqual(types[1], object)
+        self.assertEqual(types[2], bool)
         self.assertEqual(types[3], np.float32)
-        self.assertEqual(types[4], np.object)  # datetime.date
+        self.assertEqual(types[4], object)  # datetime.date
         self.assertEqual(types[5], "datetime64[ns]")
         self.assertEqual(types[6], "datetime64[ns]")
         self.assertEqual(types[7], "timedelta64[ns]")
@@ -1181,7 +1181,7 @@ def test_to_pandas_avoid_astype(self):
         df = self.spark.createDataFrame(data, schema)
         types = df.toPandas().dtypes
         self.assertEqual(types[0], np.float64)  # doesn't convert to np.int32 due to NaN value.
-        self.assertEqual(types[1], np.object)
+        self.assertEqual(types[1], object)
         self.assertEqual(types[2], np.float64)
 
     @unittest.skipIf(not have_pandas, pandas_requirement_message)  # type: ignore
@@ -1242,8 +1242,8 @@ def test_to_pandas_from_null_dataframe(self):
         self.assertEqual(types[3], np.float64)
         self.assertEqual(types[4], np.float32)
         self.assertEqual(types[5], np.float64)
-        self.assertEqual(types[6], np.object)
-        self.assertEqual(types[7], np.object)
+        self.assertEqual(types[6], object)
+        self.assertEqual(types[7], object)
         self.assertTrue(np.can_cast(np.datetime64, types[8]))
         self.assertTrue(np.can_cast(np.datetime64, types[9]))
         self.assertTrue(np.can_cast(np.timedelta64, types[10]))
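The same equivalence applies to the conversion path (illustrative, not part of the patch): `np.object` aliased the builtin `object`, and `np.object0` aliased `np.object_`; all of them produce the same 'O' dtype, which pandas still needs so that nullable boolean columns keep their `None` values after `toPandas()`:

```python
import numpy as np
import pandas as pd

# Every spelling of the object dtype is interchangeable.
assert np.dtype(object) == np.dtype("O") == np.dtype(np.object_)

# A boolean column with nulls cannot stay np.bool_, so the conversion
# falls back to object to preserve the missing values.
s = pd.Series([True, None, False], dtype=object)
print(s.dtype)  # object
```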