Allow newlines in data passed to to_gbq() (#230)

cbandy · tswast · commit 7c3dbafe31ce · 2018-10-26T11:12:51.000-07:00
* Allow newlines in data passed to to_gbq()

* Add version header to changelog
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+.. _changelog-0.7.1:
+
+0.7.1 / unreleased
+--------------------
+
+- Allow newlines in data passed to ``to_gbq``. (:issue:`180`)
+
 .. _changelog-0.7.0:
 
 0.7.0 / 2018-10-19
diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py
@@ -61,6 +61,7 @@ def load_chunks(
     job_config = bigquery.LoadJobConfig()
     job_config.write_disposition = "WRITE_APPEND"
     job_config.source_format = "CSV"
+    job_config.allow_quoted_newlines = True
 
     if schema is None:
         schema = pandas_gbq.schema.generate_bq_schema(dataframe)
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
@@ -1167,6 +1167,35 @@ def test_upload_mixed_float_and_int(self, project_id):
 
         assert len(result_df) == test_size
 
+    def test_upload_data_with_newlines(self, project_id):
+        test_id = "data_with_newlines"
+        test_size = 2
+        df = DataFrame({"s": ["abcd", "ef\ngh"]})
+
+        gbq.to_gbq(
+            df,
+            self.destination_table + test_id,
+            project_id=project_id,
+            private_key=self.credentials,
+        )
+
+        result_df = gbq.read_gbq(
+            "SELECT * FROM {0}".format(self.destination_table + test_id),
+            project_id=project_id,
+            private_key=self.credentials,
+            dialect="legacy",
+        )
+
+        assert len(result_df) == test_size
+
+        if sys.version_info.major < 3:
+            pytest.skip(msg="Unicode comparison in Py2 not working")
+
+        result = result_df["s"].sort_values()
+        expected = df["s"].sort_values()
+
+        tm.assert_numpy_array_equal(expected.values, result.values)
+
     def test_upload_data_flexible_column_order(self, project_id):
         test_id = "13"
         test_size = 10
diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py
@@ -37,6 +37,18 @@ def test_encode_chunk_with_floats():
     assert "1.05153" in csv_string
 
 
+def test_encode_chunk_with_newlines():
+    """See: https://github.com/pydata/pandas-gbq/issues/180
+    """
+    df = pandas.DataFrame({"s": ["abcd", "ef\ngh", "ij\r\nkl"]})
+    csv_buffer = load.encode_chunk(df)
+    csv_bytes = csv_buffer.read()
+    csv_string = csv_bytes.decode("utf-8")
+    assert "abcd" in csv_string
+    assert '"ef\ngh"' in csv_string
+    assert '"ij\r\nkl"' in csv_string
+
+
 def test_encode_chunks_splits_dataframe():
     df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
     chunks = list(load.encode_chunks(df, chunksize=2))