Skip to content

Commit d02a91b

Browse files
authored
apacheGH-41608: [C++][Python] Extends the add_key_value to parquet::arrow and PyArrow (apache#41633)
### Rationale for this change The previous PR (apache#34889) added `AddKeyValueMetadata` to FileWriter. Now we should expose it in the Parquet Arrow and Python APIs. ### What changes are included in this PR? 1. Add `AddKeyValueMetadata` in parquet::arrow 2. Add `add_key_value_metadata` in pyarrow 3. Add tests ### Are these changes tested? Yes ### Are there any user-facing changes? New API allowing key-value metadata to be added to a Parquet file * GitHub Issue: apache#41608 Authored-by: mwish <[email protected]> Signed-off-by: mwish <[email protected]>
1 parent 524a463 commit d02a91b

File tree

9 files changed

+157
-2
lines changed

9 files changed

+157
-2
lines changed

cpp/src/parquet/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@ add_parquet_test(writer-test
397397

398398
add_parquet_test(arrow-test
399399
SOURCES
400+
arrow/arrow_metadata_test.cc
400401
arrow/arrow_reader_writer_test.cc
401402
arrow/arrow_schema_test.cc
402403
arrow/arrow_statistics_test.cc)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "gtest/gtest.h"
19+
20+
#include "arrow/table.h"
21+
#include "arrow/testing/gtest_util.h"
22+
#include "arrow/util/key_value_metadata.h"
23+
24+
#include "parquet/api/writer.h"
25+
26+
#include "parquet/arrow/reader.h"
27+
#include "parquet/arrow/schema.h"
28+
#include "parquet/arrow/writer.h"
29+
#include "parquet/file_writer.h"
30+
#include "parquet/test_util.h"
31+
32+
namespace parquet::arrow {
33+
34+
// Checks that key-value metadata added through parquet::arrow::FileWriter
// before Close() is visible both from the writer's own metadata and from a
// reader opened on the written buffer, and that adding metadata after Close()
// is rejected with IOError.
TEST(Metadata, AppendMetadata) {
35+
// A sample table; its type and structure do not matter in this test case.
// NOTE(review): `table` is constructed but not referenced again in this test.
36+
auto schema = ::arrow::schema({::arrow::field("f", ::arrow::utf8())});
37+
auto table = ::arrow::Table::Make(
38+
schema, {::arrow::ArrayFromJSON(::arrow::utf8(), R"(["a", "b", "c"])")});
39+
40+
auto sink = CreateOutputStream();
41+
ArrowWriterProperties::Builder builder;
42+
// store_schema is enabled so that the "ARROW:schema" key checked below is
// expected alongside the user-supplied keys.
builder.store_schema();
43+
ASSERT_OK_AND_ASSIGN(auto writer,
44+
parquet::arrow::FileWriter::Open(
45+
*schema, ::arrow::default_memory_pool(), sink,
46+
parquet::default_writer_properties(), builder.build()));
47+
48+
// First batch of metadata: test_key_1 plus a placeholder for test_key_2.
auto kv_meta = std::make_shared<KeyValueMetadata>();
49+
kv_meta->Append("test_key_1", "test_value_1");
50+
// <test_key_2, test_value_2_temp> is expected to be overwritten by the second batch.
51+
kv_meta->Append("test_key_2", "test_value_2_temp");
52+
ASSERT_OK(writer->AddKeyValueMetadata(kv_meta));
53+
54+
// Second batch: replaces the value of test_key_2 and adds test_key_3.
55+
auto kv_meta_added = std::make_shared<::arrow::KeyValueMetadata>();
56+
kv_meta_added->Append("test_key_2", "test_value_2");
57+
kv_meta_added->Append("test_key_3", "test_value_3");
58+
59+
ASSERT_OK(writer->AddKeyValueMetadata(kv_meta_added));
60+
ASSERT_OK(writer->Close());
61+
62+
// Adding metadata after the writer has been closed must return an IOError.
63+
ASSERT_RAISES(IOError, writer->AddKeyValueMetadata(kv_meta_added));
64+
65+
// Shared check used for both the writer-side and the reader-side metadata.
auto verify_key_value_metadata =
66+
[&](const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) {
67+
ASSERT_TRUE(nullptr != key_value_metadata);
68+
69+
// Verify that keys added before the file writer was closed are present
// with their final values (test_key_N -> test_value_N for N in 1..3).
70+
for (int i = 1; i <= 3; ++i) {
71+
auto index = std::to_string(i);
72+
// NOTE(review): a missing key throws here instead of recording a gtest
// failure; confirm that is the intended failure mode.
PARQUET_ASSIGN_OR_THROW(auto value,
73+
key_value_metadata->Get("test_key_" + index));
74+
EXPECT_EQ("test_value_" + index, value);
75+
}
76+
EXPECT_TRUE(key_value_metadata->Contains("ARROW:schema"));
77+
};
78+
// Verify the metadata exposed by the (already closed) writer.
79+
verify_key_value_metadata(writer->metadata()->key_value_metadata());
80+
81+
// Close() was already called above; this second call is asserted to succeed as well.
ASSERT_OK(writer->Close());
82+
83+
ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
84+
// Verify the metadata as seen by a reader opened on the written bytes.
85+
{
86+
std::unique_ptr<FileReader> reader;
87+
FileReaderBuilder reader_builder;
88+
ASSERT_OK_NO_THROW(
89+
reader_builder.Open(std::make_shared<::arrow::io::BufferReader>(buffer)));
90+
ASSERT_OK(
91+
reader_builder.properties(default_arrow_reader_properties())->Build(&reader));
92+
93+
verify_key_value_metadata(reader->parquet_reader()->metadata()->key_value_metadata());
94+
}
95+
}
96+
97+
} // namespace parquet::arrow

cpp/src/parquet/arrow/writer.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,14 @@ class FileWriterImpl : public FileWriter {
482482
return writer_->metadata();
483483
}
484484

485+
/// \brief Append the key-value metadata to the file metadata.
///
/// Forwards to the wrapped writer_'s AddKeyValueMetadata() and reports the
/// outcome as a Status.
/// \param[in] key_value_metadata the metadata to add.
/// \return Status::OK() when the wrapped call completes; otherwise the error
/// produced by PARQUET_CATCH_NOT_OK (presumably converted from the exception
/// the wrapped writer throws, e.g. after Close() -- TODO confirm).
486+
::arrow::Status AddKeyValueMetadata(
487+
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata)
488+
override {
489+
PARQUET_CATCH_NOT_OK(writer_->AddKeyValueMetadata(key_value_metadata));
490+
return Status::OK();
491+
}
492+
485493
private:
486494
friend class FileWriter;
487495

cpp/src/parquet/arrow/writer.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,16 @@ class PARQUET_EXPORT FileWriter {
143143
virtual ~FileWriter();
144144

145145
virtual MemoryPool* memory_pool() const = 0;
146+
/// \brief Add key-value metadata to the file.
147+
/// \param[in] key_value_metadata the metadata to add.
148+
/// \note This will overwrite any existing metadata with the same key(s).
149+
/// \return An error Status if Close() has already been called.
150+
///
151+
/// WARNING: If `store_schema` is enabled, the `ARROW:schema` key is stored
152+
/// in the key-value metadata. Overwriting this key would make the stored
153+
/// schema unusable when the file is read.
154+
virtual ::arrow::Status AddKeyValueMetadata(
155+
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
146156
/// \brief Return the file metadata, only available after calling Close().
147157
virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
148158
};

cpp/src/parquet/file_writer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ class PARQUET_EXPORT ParquetFileWriter {
202202

203203
/// \brief Add key-value metadata to the file.
204204
/// \param[in] key_value_metadata the metadata to add.
205-
/// \note This will overwrite any existing metadata with the same key.
205+
/// \note This will overwrite any existing metadata with the same key(s).
206206
/// \throw ParquetException if Close() has been called.
207207
void AddKeyValueMetadata(
208208
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);

python/pyarrow/_parquet.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,7 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
554554
CStatus WriteTable(const CTable& table, int64_t chunk_size)
555555
CStatus NewRowGroup(int64_t chunk_size)
556556
CStatus Close()
557+
CStatus AddKeyValueMetadata(const shared_ptr[const CKeyValueMetadata]& key_value_metadata)
557558

558559
const shared_ptr[CFileMetaData] metadata() const
559560

python/pyarrow/_parquet.pyx

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@ from pyarrow.includes.libarrow_python cimport *
2929
from pyarrow.lib cimport (_Weakrefable, Buffer, Schema,
3030
check_status,
3131
MemoryPool, maybe_unbox_memory_pool,
32-
Table, NativeFile,
32+
Table, KeyValueMetadata,
3333
pyarrow_wrap_chunked_array,
3434
pyarrow_wrap_schema,
35+
pyarrow_unwrap_metadata,
3536
pyarrow_unwrap_schema,
3637
pyarrow_wrap_table,
3738
pyarrow_wrap_batch,
@@ -2206,6 +2207,15 @@ cdef class ParquetWriter(_Weakrefable):
22062207
check_status(self.writer.get()
22072208
.WriteTable(deref(ctable), c_row_group_size))
22082209

2210+
def add_key_value_metadata(self, key_value_metadata):
"""
Add key-value metadata to the file being written.

Parameters
----------
key_value_metadata : object accepted by the KeyValueMetadata constructor
The entries to add; the argument is wrapped in KeyValueMetadata
before being handed to the underlying C++ writer.
"""
2211+
cdef:
2212+
shared_ptr[const CKeyValueMetadata] c_metadata
2213+
2214+
# Normalize the input through KeyValueMetadata, then unwrap it to the C++ handle.
c_metadata = pyarrow_unwrap_metadata(KeyValueMetadata(key_value_metadata))
2215+
# The C++ call runs inside a nogil section; its returned status is passed
# to check_status.
with nogil:
2216+
check_status(self.writer.get()
2217+
.AddKeyValueMetadata(c_metadata))
2218+
22092219
@property
22102220
def metadata(self):
22112221
cdef:

python/pyarrow/parquet/core.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,19 @@ def close(self):
11081108
if self.file_handle is not None:
11091109
self.file_handle.close()
11101110

1111+
def add_key_value_metadata(self, key_value_metadata):
1112+
"""
1113+
Add key-value metadata to the file.
1114+
This will overwrite any existing metadata with the same key(s).
The writer must still be open when this is called.
1115+
1116+
Parameters
1117+
----------
1118+
key_value_metadata : dict
1119+
Keys and values must be string-like / coercible to bytes.
1120+
"""
1121+
# NOTE(review): the open-writer precondition is enforced with `assert`,
# which is skipped when Python runs with -O.
assert self.is_open
1122+
# Delegate to the wrapped low-level writer.
self.writer.add_key_value_metadata(key_value_metadata)
1123+
11111124

11121125
def _get_pandas_index_columns(keyvalues):
11131126
return (json.loads(keyvalues[b'pandas'].decode('utf8'))

python/pyarrow/tests/parquet/test_parquet_writer.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,3 +346,18 @@ def test_parquet_writer_store_schema(tempdir):
346346

347347
meta = pq.read_metadata(path2)
348348
assert meta.metadata is None
349+
350+
351+
# Checks that metadata added via ParquetWriter.add_key_value_metadata is
# readable from the written file, and that a later call replaces the value
# of a key supplied by an earlier call.
def test_parquet_writer_append_key_value_metadata(tempdir):
352+
# An empty single-column int32 table is enough; only the file metadata is inspected.
table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
353+
path = tempdir / 'metadata.parquet'
354+
355+
with pq.ParquetWriter(path, table.schema) as writer:
356+
writer.write_table(table)
357+
# 'key2' is given a placeholder value here ...
writer.add_key_value_metadata({'key1': '1', 'key2': 'x'})
358+
# ... and is expected to be replaced by this second call.
writer.add_key_value_metadata({'key2': '2', 'key3': '3'})
359+
# Read the metadata back from the file at `path`.
reader = pq.ParquetFile(path)
360+
metadata = reader.metadata.metadata
361+
# The metadata mapping is indexed with bytes keys and compared against bytes values.
assert metadata[b'key1'] == b'1'
362+
assert metadata[b'key2'] == b'2'
363+
assert metadata[b'key3'] == b'3'

0 commit comments

Comments
 (0)