
Commit 7389cd2
Authored Nov 7, 2022

feat: map "if_exists" value to LoadJobConfig.WriteDisposition (#583)

This uses LoadJobConfig.WriteDisposition to replace the if_exists='fail'/'replace'/'append' behavior in to_gbq().

### Dependency updates

- Update the minimum version of `db-dtypes` to 1.0.4
- Update the minimum version of `google-api-core` to 2.10.2
- Update the minimum version of `google-auth` to 2.13.0
- Update the minimum version of `google-auth-oauthlib` to 0.7.0
- Update the minimum version of `google-cloud-bigquery` to 3.3.5
- Update the minimum version of `google-cloud-bigquery-storage` to 2.16.2
- Update the minimum version of `pandas` to 1.1.4
- Update the minimum version of `pydata-google-auth` to 1.4.0

1 parent afd6e21
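The change boils down to a fixed mapping from `to_gbq`'s `if_exists` argument onto BigQuery load-job write dispositions (see the `pandas_gbq/gbq.py` hunk below, which builds it with `dict(zip(...))`). A minimal sketch of that mapping:

```python
# Minimal sketch of the mapping introduced in to_gbq().
# "fail" refuses to write into a non-empty table, "replace" truncates it,
# and "append" adds rows -- BigQuery enforces this server-side.
DISPOSITIONS = {
    "fail": "WRITE_EMPTY",
    "replace": "WRITE_TRUNCATE",
    "append": "WRITE_APPEND",
}

write_disposition = DISPOSITIONS["replace"]  # -> "WRITE_TRUNCATE"
```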

File tree: 9 files changed, +112 −121 lines

 

ci/requirements-3.7-0.24.2.conda
+9 −6

@@ -1,14 +1,17 @@
 codecov
 coverage
-db-dtypes==0.3.1
+db-dtypes
 fastavro
 flake8
 freezegun
-numpy==1.16.6
-google-cloud-bigquery==1.27.2
-google-cloud-bigquery-storage==1.1.0
-pyarrow==3.0.0
+numpy
+google-api-core
+google-auth
+google-cloud-bigquery
+google-cloud-bigquery-storage
+pyarrow
 pydata-google-auth
 pytest
 pytest-cov
-tqdm==4.23.0
+requests-oauthlib
+tqdm

pandas_gbq/gbq.py
+41 −58

@@ -20,10 +20,7 @@
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     import pandas

-from pandas_gbq.exceptions import (
-    AccessDenied,
-    GenericGBQException,
-)
+from pandas_gbq.exceptions import AccessDenied, GenericGBQException
 from pandas_gbq.features import FEATURES
 import pandas_gbq.schema
 import pandas_gbq.timestamp
@@ -116,20 +113,12 @@ class InvalidSchema(ValueError):
     table in BigQuery.
     """

-    def __init__(
-        self, message: str, local_schema: Dict[str, Any], remote_schema: Dict[str, Any]
-    ):
-        super().__init__(message)
-        self._local_schema = local_schema
-        self._remote_schema = remote_schema
-
-    @property
-    def local_schema(self) -> Dict[str, Any]:
-        return self._local_schema
+    def __init__(self, message: str):
+        self._message = message

     @property
-    def remote_schema(self) -> Dict[str, Any]:
-        return self._remote_schema
+    def message(self) -> str:
+        return self._message


 class NotFoundException(ValueError):
@@ -155,7 +144,12 @@ class TableCreationError(ValueError):
     Raised when the create table method fails
     """

-    pass
+    def __init__(self, message: str):
+        self._message = message
+
+    @property
+    def message(self) -> str:
+        return self._message


 class Context(object):
@@ -382,8 +376,14 @@ def process_http_error(ex):

         if "cancelled" in ex.message:
             raise QueryTimeout("Reason: {0}".format(ex))
-
-        raise GenericGBQException("Reason: {0}".format(ex))
+        elif "Provided Schema does not match" in ex.message:
+            error_message = ex.errors[0]["message"]
+            raise InvalidSchema(f"Reason: {error_message}")
+        elif "Already Exists: Table" in ex.message:
+            error_message = ex.errors[0]["message"]
+            raise TableCreationError(f"Reason: {error_message}")
+        else:
+            raise GenericGBQException("Reason: {0}".format(ex))

     def download_table(
         self,
@@ -577,6 +577,7 @@ def load_data(
         self,
         dataframe,
         destination_table_ref,
+        write_disposition,
         chunksize=None,
         schema=None,
         progress_bar=True,
@@ -596,6 +597,7 @@
                 schema=schema,
                 location=self.location,
                 api_method=api_method,
+                write_disposition=write_disposition,
                 billing_project=billing_project,
             )
             if progress_bar and tqdm:
@@ -609,11 +611,6 @@
         except self.http_error as ex:
             self.process_http_error(ex)

-    def delete_and_recreate_table(self, project_id, dataset_id, table_id, table_schema):
-        table = _Table(project_id, dataset_id, credentials=self.credentials)
-        table.delete(table_id)
-        table.create(table_id, table_schema)
-

 def _bqschema_to_nullsafe_dtypes(schema_fields):
     """Specify explicit dtypes based on BigQuery schema.
@@ -975,11 +972,9 @@ def to_gbq(
 ):
     """Write a DataFrame to a Google BigQuery table.

-    The main method a user calls to export pandas DataFrame contents to
-    Google BigQuery table.
+    The main method a user calls to export pandas DataFrame contents to Google BigQuery table.

-    This method uses the Google Cloud client library to make requests to
-    Google BigQuery, documented `here
+    This method uses the Google Cloud client library to make requests to Google BigQuery, documented `here
     <https://googleapis.dev/python/bigquery/latest/index.html>`__.

     See the :ref:`How to authenticate with Google BigQuery <authentication>`
@@ -1114,15 +1109,21 @@ def to_gbq(
             stacklevel=2,
         )

-    if if_exists not in ("fail", "replace", "append"):
-        raise ValueError("'{0}' is not valid for if_exists".format(if_exists))
-
     if "." not in destination_table:
         raise NotFoundException(
             "Invalid Table Name. Should be of the form 'datasetId.tableId' or "
             "'projectId.datasetId.tableId'"
         )

+    if if_exists not in ("fail", "replace", "append"):
+        raise ValueError("'{0}' is not valid for if_exists".format(if_exists))
+
+    if_exists_list = ["fail", "replace", "append"]
+    dispositions = ["WRITE_EMPTY", "WRITE_TRUNCATE", "WRITE_APPEND"]
+    dispositions_dict = dict(zip(if_exists_list, dispositions))
+
+    write_disposition = dispositions_dict[if_exists]
+
     connector = GbqConnector(
         project_id,
         reauth=reauth,
@@ -1142,17 +1143,20 @@
     table_id = destination_table_ref.table_id

     default_schema = _generate_bq_schema(dataframe)
+    # If table_schema isn't provided, we'll create one for you
     if not table_schema:
         table_schema = default_schema
+    # If table_schema is provided, we'll update the default_schema to the provided table_schema
     else:
         table_schema = pandas_gbq.schema.update_schema(
             default_schema, dict(fields=table_schema)
         )

-    # If table exists, check if_exists parameter
     try:
+        # Try to get the table
         table = bqclient.get_table(destination_table_ref)
     except google_exceptions.NotFound:
+        # If the table doesn't already exist, create it
         table_connector = _Table(
             project_id_table,
             dataset_id,
@@ -1161,34 +1165,12 @@
         )
         table_connector.create(table_id, table_schema)
     else:
+        # Convert original schema (the schema that already exists) to pandas-gbq API format
         original_schema = pandas_gbq.schema.to_pandas_gbq(table.schema)

-        if if_exists == "fail":
-            raise TableCreationError(
-                "Could not create the table because it "
-                "already exists. "
-                "Change the if_exists parameter to "
-                "'append' or 'replace' data."
-            )
-        elif if_exists == "replace":
-            connector.delete_and_recreate_table(
-                project_id_table, dataset_id, table_id, table_schema
-            )
-        else:
-            if not pandas_gbq.schema.schema_is_subset(original_schema, table_schema):
-                raise InvalidSchema(
-                    "Please verify that the structure and "
-                    "data types in the DataFrame match the "
-                    "schema of the destination table.",
-                    table_schema,
-                    original_schema,
-                )
-
-            # Update the local `table_schema` so mode (NULLABLE/REQUIRED)
-            # matches. See: https://github.com/pydata/pandas-gbq/issues/315
-            table_schema = pandas_gbq.schema.update_schema(
-                table_schema, original_schema
-            )
+        # Update the local `table_schema` so mode (NULLABLE/REQUIRED)
+        # matches. See: https://github.com/pydata/pandas-gbq/issues/315
+        table_schema = pandas_gbq.schema.update_schema(table_schema, original_schema)

     if dataframe.empty:
         # Create the table (if needed), but don't try to run a load job with an
@@ -1198,6 +1180,7 @@ def to_gbq(
     connector.load_data(
         dataframe,
         destination_table_ref,
+        write_disposition=write_disposition,
         chunksize=chunksize,
         schema=table_schema,
         progress_bar=progress_bar,
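Nothing changes for callers of the public API; the same `if_exists` values now drive a single load job instead of explicit `delete_table`/`create_table` calls. A usage sketch (the DataFrame, dataset, and project names are illustrative):

```python
import pandas
import pandas_gbq

df = pandas.DataFrame({"col_a": [0.25, 1.5, -1.0]})

# if_exists="replace" now maps to write_disposition="WRITE_TRUNCATE" on the
# load job, so BigQuery overwrites the table in one request instead of
# pandas-gbq deleting and recreating it first.
pandas_gbq.to_gbq(
    df,
    "my_dataset.my_table",    # illustrative destination
    project_id="my-project",  # illustrative billing project
    if_exists="replace",
)
```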

pandas_gbq/load.py
+12 −6

@@ -113,13 +113,13 @@ def load_parquet(
     client: bigquery.Client,
     dataframe: pandas.DataFrame,
     destination_table_ref: bigquery.TableReference,
+    write_disposition: str,
     location: Optional[str],
     schema: Optional[Dict[str, Any]],
     billing_project: Optional[str] = None,
 ):
     job_config = bigquery.LoadJobConfig()
-    job_config.write_disposition = "WRITE_APPEND"
-    job_config.create_disposition = "CREATE_NEVER"
+    job_config.write_disposition = write_disposition
     job_config.source_format = "PARQUET"

     if schema is not None:
@@ -143,13 +143,13 @@

 def load_csv(
     dataframe: pandas.DataFrame,
+    write_disposition: str,
     chunksize: Optional[int],
     bq_schema: Optional[List[bigquery.SchemaField]],
     load_chunk: Callable,
 ):
     job_config = bigquery.LoadJobConfig()
-    job_config.write_disposition = "WRITE_APPEND"
-    job_config.create_disposition = "CREATE_NEVER"
+    job_config.write_disposition = write_disposition
     job_config.source_format = "CSV"
     job_config.allow_quoted_newlines = True

@@ -167,6 +167,7 @@ def load_csv_from_dataframe(
     client: bigquery.Client,
     dataframe: pandas.DataFrame,
     destination_table_ref: bigquery.TableReference,
+    write_disposition: str,
     location: Optional[str],
     chunksize: Optional[int],
     schema: Optional[Dict[str, Any]],
@@ -187,13 +188,14 @@ def load_chunk(chunk, job_config):
             project=billing_project,
         ).result()

-    return load_csv(dataframe, chunksize, bq_schema, load_chunk)
+    return load_csv(dataframe, write_disposition, chunksize, bq_schema, load_chunk)


 def load_csv_from_file(
     client: bigquery.Client,
     dataframe: pandas.DataFrame,
     destination_table_ref: bigquery.TableReference,
+    write_disposition: str,
     location: Optional[str],
     chunksize: Optional[int],
     schema: Optional[Dict[str, Any]],
@@ -223,7 +225,7 @@ def load_chunk(chunk, job_config):
     finally:
         chunk_buffer.close()

-    return load_csv(dataframe, chunksize, bq_schema, load_chunk)
+    return load_csv(dataframe, write_disposition, chunksize, bq_schema, load_chunk)


 def load_chunks(
@@ -234,13 +236,15 @@ def load_chunks(
     schema=None,
     location=None,
     api_method="load_parquet",
+    write_disposition="WRITE_EMPTY",
    billing_project: Optional[str] = None,
 ):
     if api_method == "load_parquet":
         load_parquet(
             client,
             dataframe,
             destination_table_ref,
+            write_disposition,
             location,
             schema,
             billing_project=billing_project,
@@ -253,6 +257,7 @@
             client,
             dataframe,
             destination_table_ref,
+            write_disposition,
             location,
             chunksize,
             schema,
@@ -263,6 +268,7 @@
             client,
             dataframe,
             destination_table_ref,
+            write_disposition,
             location,
             chunksize,
             schema,
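The load helpers now simply forward `write_disposition` into the job config. Stripped of the chunking and schema handling, the underlying call is roughly this sketch against the `google-cloud-bigquery` client (project and table names are illustrative):

```python
import pandas
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # illustrative project
df = pandas.DataFrame({"col_a": [1, 2, 3]})

job_config = bigquery.LoadJobConfig()
job_config.write_disposition = "WRITE_TRUNCATE"  # e.g. from if_exists="replace"
job_config.source_format = "PARQUET"

# A single load job both applies the disposition and writes the data.
# Note the old create_disposition = "CREATE_NEVER" line is gone, so the
# job may also create the destination table when it does not exist yet.
client.load_table_from_dataframe(
    df, "my-project.my_dataset.my_table", job_config=job_config
).result()
```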

requirements.txt
+1 −1

@@ -2,4 +2,4 @@ pandas
 google-auth
 google-auth-oauthlib
 google-cloud-bigquery
-tqdm
+tqdm

(The tqdm entry's text is unchanged; only the line ending differs, most likely a trailing newline added at end of file.)

setup.py
+8 −8

@@ -23,24 +23,24 @@
 release_status = "Development Status :: 4 - Beta"
 dependencies = [
     "setuptools",
-    "db-dtypes >=0.3.1,<2.0.0",
+    "db-dtypes >=1.0.4,<2.0.0",
     "numpy >=1.16.6",
-    "pandas >=0.24.2",
+    "pandas >=1.1.4",
     "pyarrow >=3.0.0, <10.0dev",
-    "pydata-google-auth",
+    "pydata-google-auth >=1.4.0",
     # Note: google-api-core and google-auth are also included via transitive
     # dependency on google-cloud-bigquery, but this library also uses them
     # directly.
-    "google-api-core >= 1.31.5, <3.0.0dev,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0",
-    "google-auth >=1.25.0",
-    "google-auth-oauthlib >=0.0.1",
+    "google-api-core >= 2.10.2, <3.0.0dev",
+    "google-auth >=2.13.0",
+    "google-auth-oauthlib >=0.7.0",
     # Require 1.27.* because it has a fix for out-of-bounds timestamps. See:
     # https://github.com/googleapis/python-bigquery/pull/209 and
     # https://github.com/googleapis/python-bigquery-pandas/issues/365
     # Exclude 2.4.* because it has a bug where waiting for the query can hang
     # indefinitely. https://github.com/pydata/pandas-gbq/issues/343
-    "google-cloud-bigquery >=1.27.2,<4.0.0dev,!=2.4.*",
-    "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev",
+    "google-cloud-bigquery >=3.3.5,<4.0.0dev,!=2.4.*",
+    "google-cloud-bigquery-storage >=2.16.2,<3.0.0dev",
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",

testing/constraints-3.7.txt
+8 −8

@@ -5,15 +5,15 @@
 #
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
-db-dtypes==0.3.1
-google-api-core==1.31.5
-google-auth==1.25.0
-google-auth-oauthlib==0.0.1
-google-cloud-bigquery==1.27.2
-google-cloud-bigquery-storage==1.1.0
+db-dtypes==1.0.4
+google-api-core==2.10.2
+google-auth==2.13.0
+google-auth-oauthlib==0.7.0
+google-cloud-bigquery==3.3.5
+google-cloud-bigquery-storage==2.16.2
 numpy==1.16.6
-pandas==0.24.2
+pandas==1.1.4
 pyarrow==3.0.0
-pydata-google-auth==0.1.2
+pydata-google-auth==1.4.0
 tqdm==4.23.0
 protobuf==3.19.5

tests/system/test_gbq.py
+10 −2

@@ -673,9 +673,17 @@ def test_upload_data_if_table_exists_fail(self, project_id):
         test_id = "2"
         test_size = 10
         df = make_mixed_dataframe_v2(test_size)
-        self.table.create(TABLE_ID + test_id, gbq._generate_bq_schema(df))

-        # Test the default value of if_exists is 'fail'
+        # Initialize table with sample data
+        gbq.to_gbq(
+            df,
+            self.destination_table + test_id,
+            project_id,
+            chunksize=10000,
+            credentials=self.credentials,
+        )
+
+        # Test the default value of if_exists == 'fail'
         with pytest.raises(gbq.TableCreationError):
             gbq.to_gbq(
                 df,

tests/unit/test_load.py
+14 −5

@@ -108,7 +108,7 @@ def test_load_csv_from_dataframe_allows_client_to_generate_schema(mock_bigquery_client):

     _ = list(
         load.load_csv_from_dataframe(
-            mock_bigquery_client, df, destination, None, None, None
+            mock_bigquery_client, df, destination, None, None, None, None
         )
     )

@@ -151,7 +151,9 @@ def test_load_csv_from_file_generates_schema(mock_bigquery_client):
     )

     _ = list(
-        load.load_csv_from_file(mock_bigquery_client, df, destination, None, None, None)
+        load.load_csv_from_file(
+            mock_bigquery_client, df, destination, None, None, None, None
+        )
     )

     mock_load = mock_bigquery_client.load_table_from_file
@@ -222,7 +224,7 @@ def test_load_chunks_omits_policy_tags(

 def test_load_chunks_with_invalid_api_method():
     with pytest.raises(ValueError, match="Got unexpected api_method:"):
-        load.load_chunks(None, None, None, api_method="not_a_thing")
+        load.load_chunks(None, None, None, None, api_method="not_a_thing")


 def test_load_parquet_allows_client_to_generate_schema(mock_bigquery_client):
@@ -233,7 +235,14 @@ def test_load_parquet_allows_client_to_generate_schema(mock_bigquery_client):
         "my-project.my_dataset.my_table"
     )

-    load.load_parquet(mock_bigquery_client, df, destination, None, None)
+    load.load_parquet(
+        mock_bigquery_client,
+        df,
+        destination,
+        None,
+        None,
+        None,
+    )

     mock_load = mock_bigquery_client.load_table_from_dataframe
     assert mock_load.called
@@ -255,7 +264,7 @@ def test_load_parquet_with_bad_conversion(mock_bigquery_client):
     )

     with pytest.raises(exceptions.ConversionError):
-        load.load_parquet(mock_bigquery_client, df, destination, None, None)
+        load.load_parquet(mock_bigquery_client, df, destination, None, None, None)


 @pytest.mark.parametrize(
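The extra `None` arguments above fill the new positional `write_disposition` parameter. One could also assert that the disposition actually reaches the job config; a hypothetical test in the same style, not part of this commit, assuming the `mock_bigquery_client` fixture used throughout this file:

```python
import google.cloud.bigquery
import pandas

from pandas_gbq import load


def test_load_parquet_forwards_write_disposition(mock_bigquery_client):
    # Hypothetical test sketch, not part of this commit.
    df = pandas.DataFrame({"col_a": [1, 2, 3]})
    destination = google.cloud.bigquery.TableReference.from_string(
        "my-project.my_dataset.my_table"
    )

    load.load_parquet(
        mock_bigquery_client, df, destination, "WRITE_TRUNCATE", None, None
    )

    # The disposition should land on the LoadJobConfig handed to the client.
    _, kwargs = mock_bigquery_client.load_table_from_dataframe.call_args
    assert kwargs["job_config"].write_disposition == "WRITE_TRUNCATE"
```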

tests/unit/test_to_gbq.py
+9 −27

@@ -94,7 +94,11 @@ def test_to_gbq_with_if_exists_append_mismatch(mock_bigquery_client):
         "myproj.my_dataset.my_table",
         schema=(SchemaField("col_a", "INTEGER"), SchemaField("col_b", "STRING")),
     )
-    with pytest.raises(gbq.InvalidSchema) as exception_block:
+    mock_bigquery_client.side_effect = gbq.InvalidSchema(
+        message=r"Provided Schema does not match Table *"
+    )
+
+    with pytest.raises((gbq.InvalidSchema)) as exception_block:
         gbq.to_gbq(
             DataFrame({"col_a": [0.25, 1.5, -1.0]}),
             "my_dataset.my_table",
@@ -103,16 +107,10 @@
     )

     exc = exception_block.value
-    assert exc.remote_schema == {
-        "fields": [
-            {"name": "col_a", "type": "INTEGER", "mode": "NULLABLE"},
-            {"name": "col_b", "type": "STRING", "mode": "NULLABLE"},
-        ]
-    }
-    assert exc.local_schema == {"fields": [{"name": "col_a", "type": "FLOAT"}]}
+    assert exc.message == r"Provided Schema does not match Table *"


-def test_to_gbq_with_if_exists_replace(mock_bigquery_client):
+def test_to_gbq_with_if_exists_replace(mock_bigquery_client, expected_load_method):
     mock_bigquery_client.get_table.side_effect = (
         # Initial check
         google.cloud.bigquery.Table("myproj.my_dataset.my_table"),
@@ -125,10 +123,7 @@
         project_id="myproj",
         if_exists="replace",
     )
-    # TODO: We can avoid these API calls by using write disposition in the load
-    # job. See: https://github.com/googleapis/python-bigquery-pandas/issues/118
-    assert mock_bigquery_client.delete_table.called
-    assert mock_bigquery_client.create_table.called
+    expected_load_method.assert_called_once()


 def test_to_gbq_with_if_exists_replace_cross_project(
@@ -146,20 +141,7 @@
         project_id="billing-project",
         if_exists="replace",
     )
-    # TODO: We can avoid these API calls by using write disposition in the load
-    # job. See: https://github.com/googleapis/python-bigquery-pandas/issues/118
-    assert mock_bigquery_client.delete_table.called
-    args, _ = mock_bigquery_client.delete_table.call_args
-    table_delete: google.cloud.bigquery.TableReference = args[0]
-    assert table_delete.project == "data-project"
-    assert table_delete.dataset_id == "my_dataset"
-    assert table_delete.table_id == "my_table"
-    assert mock_bigquery_client.create_table.called
-    args, _ = mock_bigquery_client.create_table.call_args
-    table_create: google.cloud.bigquery.TableReference = args[0]
-    assert table_create.project == "data-project"
-    assert table_create.dataset_id == "my_dataset"
-    assert table_create.table_id == "my_table"
+    expected_load_method.assert_called_once()

     # Check that billing project and destination table is set correctly.
     expected_load_method.assert_called_once()
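The assertions now target the single `message` string that the reworked exceptions carry, instead of the dropped `local_schema`/`remote_schema` attributes. A minimal sketch of the new exception surface defined in `pandas_gbq/gbq.py` above:

```python
from pandas_gbq import gbq

# Both exceptions now wrap one message string exposed via .message.
exc = gbq.InvalidSchema("Reason: Provided Schema does not match Table my_table")
assert exc.message == "Reason: Provided Schema does not match Table my_table"

exc = gbq.TableCreationError("Reason: Already Exists: Table my_table")
assert exc.message == "Reason: Already Exists: Table my_table"
```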
