Skip to content
This repository was archived by the owner on Jul 22, 2024. It is now read-only.

Commit e69e21e

Browse files
author
Ivan Gavryliuk
committed
Compatibility
- Setting BIT_PACKED encoding confuses Spark, so it's now set to RLE
- Custom metadata is supported on read and write
1 parent a7df2f2 commit e69e21e

File tree

6 files changed

+38
-9
lines changed

6 files changed

+38
-9
lines changed

src/Parquet.Test/MapsTest.cs

+6-6
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,21 @@ public class MapsTest
1212
public void Simple_first_level_map_int_to_string()
1313
{
1414
var ds = new DataSet(
15-
new SchemaElement<int>("id"),
16-
new SchemaElement<IDictionary<int, string>>("names"));
15+
new SchemaElement<IDictionary<int, string>>("names"),
16+
new SchemaElement<int>("id"));
1717

18-
ds.Add(1, new Dictionary<int, string>
18+
ds.Add(new Dictionary<int, string>
1919
{
2020
[1] = "one",
2121
[2] = "two",
2222
[3] = "three"
23-
});
23+
}, 1);
2424

25-
//ParquetWriter.WriteFile(ds, "c:\\tmp\\map.parquet");
25+
//ParquetWriter.WriteFile(ds, "c:\\tmp\\pmap2.parquet");
2626

2727
DataSet ds1 = DataSetGenerator.WriteRead(ds);
2828

29-
Assert.Equal("{1;[1=>one;2=>two;3=>three]}", ds1[0].ToString());
29+
Assert.Equal("{[1=>one;2=>two;3=>three];1}", ds1[0].ToString());
3030
}
3131
}
3232
}

src/Parquet.Test/ParquetReaderTest.cs

+3
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,9 @@ public void Read_simple_map()
283283
Assert.Equal(3, ms.Extra[1].MaxDefinitionLevel);
284284

285285
Assert.Equal("{1;[1=>one;2=>two;3=>three]}", ds[0].ToString());
286+
287+
//DataSet ds2 = DataSetGenerator.WriteRead(ds);
288+
//ParquetWriter.WriteFile(ds, "c:\\tmp\\pmaps.parquet", CompressionMethod.None);
286289
}
287290

288291
[Fact]

src/Parquet/Data/DataSetMetadata.cs

+8-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
namespace Parquet.Data
1+
using System.Collections.Generic;
2+
3+
namespace Parquet.Data
24
{
35
/// <summary>
46
/// Public metadata
@@ -9,5 +11,10 @@ public class DataSetMetadata
911
/// Gets the creator tag.
1012
/// </summary>
1113
public string CreatedBy { get; internal set; }
14+
15+
/// <summary>
16+
/// Custom metadata properties
17+
/// </summary>
18+
public Dictionary<string, string> Custom { get; private set; } = new Dictionary<string, string>();
1219
}
1320
}

src/Parquet/File/FileMetadataBuilder.cs

+4-1
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,17 @@ public FileMetadataBuilder()
5353
public void AddSchema(DataSet ds)
5454
{
5555
ds.Metadata.CreatedBy = CreatedBy;
56+
5657
_meta.Schema = new List<TSchemaElement> { new TSchemaElement("schema") { Num_children = ds.Schema.Elements.Count } };
58+
_meta.Key_value_metadata = ds.Metadata.Custom.Select(kv => new Thrift.KeyValue(kv.Key) { Value = kv.Value }).ToList();
5759

5860
foreach(SchemaElement se in ds.Schema.Elements)
5961
{
6062
AddSchema(_meta.Schema, se);
6163
}
6264

6365
_meta.Num_rows = ds.Count;
66+
6467
}
6568

6669
private static void AddSchema(List<TSchemaElement> container, SchemaElement se)
@@ -210,7 +213,7 @@ public Thrift.PageHeader CreateDataPage(int valueCount)
210213
{
211214
Encoding = Thrift.Encoding.PLAIN,
212215
Definition_level_encoding = Thrift.Encoding.RLE,
213-
Repetition_level_encoding = Thrift.Encoding.BIT_PACKED,
216+
Repetition_level_encoding = Thrift.Encoding.RLE,
214217
Num_values = valueCount
215218
};
216219

src/Parquet/File/FileMetadataParser.cs

+13
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,19 @@ void Build(SchemaElement node, ref int i, int count, bool isRoot)
7979
return new Schema(root.Children);
8080
}
8181

82+
public void AddMeta(DataSet ds)
83+
{
84+
ds.Metadata.Custom.Clear();
85+
86+
if (_fileMeta.Key_value_metadata != null && _fileMeta.Key_value_metadata.Count > 0)
87+
{
88+
foreach(Thrift.KeyValue tkv in _fileMeta.Key_value_metadata)
89+
{
90+
ds.Metadata.Custom[tkv.Key] = tkv.Value;
91+
}
92+
}
93+
}
94+
8295
private SchemaElement BuildListSchema(ref Thrift.SchemaElement tse, ref int i, bool isRoot, SchemaElement node, ParquetOptions formatOptions)
8396
{
8497
Thrift.SchemaElement tseTop = tse;

src/Parquet/ParquetReader.cs

+4-1
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,10 @@ public DataSet Read()
148148
pos += rg.Num_rows;
149149
}
150150

151-
return new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by) { Thrift = _meta };
151+
var ds = new DataSet(schema, pathToValues, _meta.Num_rows, _meta.Created_by);
152+
metaParser.AddMeta(ds);
153+
ds.Thrift = _meta;
154+
return ds;
152155
}
153156

154157
/// <summary>

0 commit comments

Comments
 (0)