Skip to content

Commit b46e955

Browse files
authored
BUG: support INT64 and other standard SQL aliases in to_gbq table_schema (#340)
1 parent f1995f8 commit b46e955

File tree

3 files changed

+54
-18
lines changed

3 files changed

+54
-18
lines changed

docs/source/changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ Bug fixes
1010
~~~~~~~~~
1111

1212
- Encode floating point values with greater precision. (:issue:`326`)
13+
- Support ``INT64`` and other standard SQL aliases in
14+
:func:`~pandas_gbq.to_gbq` ``table_schema`` argument. (:issue:`322`)
1315

1416

1517
.. _changelog-0.14.0:

pandas_gbq/schema.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,16 @@
33
import copy
44

55

6+
# API may return data types as legacy SQL, so maintain a mapping of aliases
7+
# from standard SQL to legacy data types.
8+
_TYPE_ALIASES = {
9+
"BOOL": "BOOLEAN",
10+
"FLOAT64": "FLOAT",
11+
"INT64": "INTEGER",
12+
"STRUCT": "RECORD",
13+
}
14+
15+
616
def to_pandas_gbq(client_schema):
717
"""Given a sequence of :class:`google.cloud.bigquery.schema.SchemaField`,
818
return a schema in pandas-gbq API format.
@@ -24,10 +34,12 @@ def _clean_schema_fields(fields):
2434
are not generated by :func:`pandas_gbq.schema.generate_bq_schema`.
2535
"""
2636
fields_sorted = sorted(fields, key=lambda field: field["name"])
27-
return [
28-
{"name": field["name"], "type": field["type"]}
29-
for field in fields_sorted
30-
]
37+
clean_schema = []
38+
for field in fields_sorted:
39+
field_type = field["type"].upper()
40+
field_type = _TYPE_ALIASES.get(field_type, field_type)
41+
clean_schema.append({"name": field["name"], "type": field_type})
42+
return clean_schema
3143

3244

3345
def schema_is_subset(schema_remote, schema_local):

tests/unit/test_schema.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,44 @@ def module_under_test():
1111
return pandas_gbq.schema
1212

1313

14-
def test_schema_is_subset_passes_if_subset(module_under_test):
14+
@pytest.mark.parametrize(
15+
"original_fields,dataframe_fields",
16+
[
17+
(
18+
[
19+
{"name": "A", "type": "FLOAT"},
20+
{"name": "B", "type": "FLOAT64"},
21+
{"name": "C", "type": "STRING"},
22+
],
23+
[
24+
{"name": "A", "type": "FLOAT64"},
25+
{"name": "B", "type": "FLOAT"},
26+
],
27+
),
28+
# Original schema from API may contain legacy SQL datatype names.
29+
# https://github.com/pydata/pandas-gbq/issues/322
30+
(
31+
[{"name": "A", "type": "INTEGER"}],
32+
[{"name": "A", "type": "INT64"}],
33+
),
34+
(
35+
[{"name": "A", "type": "BOOL"}],
36+
[{"name": "A", "type": "BOOLEAN"}],
37+
),
38+
(
39+
# TODO: include sub-fields when struct uploads are supported.
40+
[{"name": "A", "type": "STRUCT"}],
41+
[{"name": "A", "type": "RECORD"}],
42+
),
43+
],
44+
)
45+
def test_schema_is_subset_passes_if_subset(
46+
module_under_test, original_fields, dataframe_fields
47+
):
1548
# Issue #24 schema_is_subset indicates whether the schema of the
1649
# dataframe is a subset of the schema of the bigquery table
17-
table_schema = {
18-
"fields": [
19-
{"name": "A", "type": "FLOAT"},
20-
{"name": "B", "type": "FLOAT"},
21-
{"name": "C", "type": "STRING"},
22-
]
23-
}
24-
tested_schema = {
25-
"fields": [
26-
{"name": "A", "type": "FLOAT"},
27-
{"name": "B", "type": "FLOAT"},
28-
]
29-
}
50+
table_schema = {"fields": original_fields}
51+
tested_schema = {"fields": dataframe_fields}
3052
assert module_under_test.schema_is_subset(table_schema, tested_schema)
3153

3254

0 commit comments

Comments
 (0)