Upgrade Pandas dependency to 2.1 (#31185)

* Upgrade to Pandas 2.1 * Pandas 2.1: Disable interchange protocol tests. * Exclude attrs tests as attrs is not supported. * Exclude new doctests that exercise unsupported order-sensitive ops. * Iteration over deferred DFs is not supported * Skip 'mul' op when index is used as an axis * Exclude new tests that use index. * Exclude shift test as order-sensitive. * Exclude known failure modes. * Exclude failures that existed on Pandas 1. * Allow bulk-exclusion of an example in all tests. * Exclude examples that use to_timedelta. * Exclude the test that evaluates an inferred .tz value. * Exclude more tz and timedelta tests. * Exclude a test exercising PeriodProperties.end_time * Exclude tests exercising unsupported GroupBy operations. * Expand the list of elementwise string methods. * Exclude known WontImpl ops * Fix test output normalization. * Exclude remaining new tests that didn't work * Remove test that uses values, an unsupported non-deferred op. * lint
apache · May 7, 2024 · 2ca9af8 · 2ca9af8
1 parent bb51380
commit 2ca9af8
Show file tree

Hide file tree

Showing 5 changed files with 190 additions and 15 deletions.
diff --git a/sdks/python/apache_beam/dataframe/doctests.py b/sdks/python/apache_beam/dataframe/doctests.py
@@ -225,6 +225,8 @@ def concat(values):
 
   def fix(self, want, got):
     if 'DeferredBase' in got:
+      # When we have a tuple of DataFrames, pandas prints each one on a new line.
+      got = re.sub(r'DeferredBase\[(\d+)\],', '\\g<0>\n', got)
       try:
         to_compute = {
             m.group(0): self._env._all_frames[int(m.group(1))]
@@ -381,20 +383,23 @@ def to_callable(cond):
     self._skipped_set = set()
 
   def _is_wont_implement_ok(self, example, test):
+    always_wont_implement = self._wont_implement_ok.get('*', [])
     return any(
-        wont_implement(example)
-        for wont_implement in self._wont_implement_ok.get(test.name, []))
+        wont_implement(example) for wont_implement in (
+            self._wont_implement_ok.get(test.name, []) + always_wont_implement))
 
   def _is_not_implemented_ok(self, example, test):
+    always_not_impl = self._not_implemented_ok.get('*', [])
     return any(
-        not_implemented(example)
-        for not_implemented in self._not_implemented_ok.get(test.name, []))
+        not_implemented(example) for not_implemented in (
+            self._not_implemented_ok.get(test.name, []) + always_not_impl))
 
   def run(self, test, **kwargs):
     self._checker.reset()
+    always_skip = self._skip.get('*', [])
     for example in test.examples:
       if any(should_skip(example)
-             for should_skip in self._skip.get(test.name, [])):
+             for should_skip in self._skip.get(test.name, []) + always_skip):
         self._skipped_set.add(example)
         example.source = 'pass'
         example.want = ''
@@ -726,6 +731,7 @@ def wrapper(fn):
         verify the examples, else use PartitioningSession to simulate
         distributed execution.
       skip (Dict[str,str]): A set of examples to skip entirely.
+        If a key is '*', an example will be skipped in all test scenarios.
       wont_implement_ok (Dict[str,str]): A set of examples that are allowed to
         raise WontImplementError.
       not_implemented_ok (Dict[str,str]): A set of examples that are allowed to

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
@@ -1181,8 +1181,11 @@ def _set_index(self, value):
       pd.DataFrame, 'hist', reason="plotting-tools")
 
   attrs = property(
-      frame_base.wont_implement_method(
-          pd.DataFrame, 'attrs', reason='experimental'))
+      fget=frame_base.wont_implement_method(
+          pd.DataFrame, 'attrs', reason='experimental'),
+      fset=frame_base.wont_implement_method(
+          pd.DataFrame, 'attrs', reason='experimental'),
+  )
 
   reorder_levels = frame_base._proxy_method(
       'reorder_levels',
@@ -5124,13 +5127,18 @@ def rsplit(self, **kwargs):
 ELEMENTWISE_STRING_METHODS = [
             'capitalize',
             'casefold',
+            'center',
             'contains',
             'count',
+            'decode',
+            'encode',
             'endswith',
             'extract',
+            'find',
             'findall',
             'fullmatch',
             'get',
+            'index',
             'isalnum',
             'isalpha',
             'isdecimal',
@@ -5142,22 +5150,29 @@ def rsplit(self, **kwargs):
             'isupper',
             'join',
             'len',
+            'lfind',
+            'ljust',
             'lower',
             'lstrip',
             'match',
+            'normalize',
             'pad',
             'partition',
             'removeprefix',
             'removesuffix',
             'replace',
             'rpartition',
+            'rfind',
+            'rindex',
+            'rjust',
             'rstrip',
             'slice',
             'slice_replace',
             'startswith',
             'strip',
             'swapcase',
             'title',
+            'translate',
             'upper',
             'wrap',
             'zfill',

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -29,6 +29,7 @@
 from apache_beam.dataframe import frame_base
 from apache_beam.dataframe import frames
 from apache_beam.dataframe.convert import to_dataframe
+from apache_beam.dataframe.doctests import teststring
 from apache_beam.runners.interactive import interactive_beam as ib
 from apache_beam.runners.interactive import interactive_environment as ie
 from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
@@ -363,6 +364,19 @@ def new_column(df):
     })
     self._run_inplace_test(new_column, df)
 
+  def test_tz_with_utc_zone_set_explicitly(self):
+    test = """
+      >>> s = pd.Series(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+03:00"])
+      >>> s = pd.to_datetime(s, utc=True)
+      >>> s
+      0   2020-01-01 10:00:00+00:00
+      1   2020-02-01 08:00:00+00:00
+      dtype: datetime64[ns, UTC]
+      >>> s.dt.tz
+      datetime.timezone.utc
+    """
+    teststring(test)
+
   def test_tz_localize_ambiguous_series(self):
     # This replicates a tz_localize doctest:
     #   s.tz_localize('CET', ambiguous=np.array([True, True, False]))