Skip to content

Commit

Permalink
Bug/parquet empty nested structures (#1117)
Browse files Browse the repository at this point in the history
* Split flattener tests into smaller units

* Covered parquet row flattener with more tests, fixed bugs

* RLEBitPackHybrid is now expecting bitWidth to be provided

* Fixed logic related to assemblying/shreding nested empty structures

* Revereted development leftovers
  • Loading branch information
norberttech authored Jul 15, 2024
1 parent 7bd566b commit ab5af92
Show file tree
Hide file tree
Showing 42 changed files with 1,149 additions and 428 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public function test_file_status_on_partial_path() : void
{
$fs = azure_filesystem($this->blobService('flow-php'));

$fs->writeTo(new Path('azure-blob://some_path_to/file.txt'))->fromResource(\fopen(__DIR__ . '/Fixtures/orders.csv', 'rb'));
$fs->writeTo(new Path('azure-blob://some_path_to/file.txt'))->fromResource(\fopen(__DIR__ . '/Fixtures/orders.csv', 'rb'))->close();

self::assertNull($fs->status(new Path('azure-blob://some_path')));
}
Expand All @@ -62,7 +62,7 @@ public function test_file_status_on_pattern() : void
{
$fs = azure_filesystem($this->blobService('flow-php'));

$fs->writeTo(new Path('azure-blob://some_path_to/file.txt'))->fromResource(\fopen(__DIR__ . '/Fixtures/orders.csv', 'rb'));
$fs->writeTo(new Path('azure-blob://some_path_to/file.txt'))->fromResource(\fopen(__DIR__ . '/Fixtures/orders.csv', 'rb'))->close();

self::assertTrue($fs->status(new Path('azure-blob://some_path_to/*.txt'))->isFile());
self::assertSame('azure-blob://some_path_to/file.txt', $fs->status(new Path('azure-blob://some_path_to/*.txt'))->path->uri());
Expand Down
4 changes: 3 additions & 1 deletion src/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
if (!\function_exists('dd')) {
function dd(...$args) : void
{
\var_dump(...$args);
foreach ($args as $arg) {
\var_dump($arg);
}

exit(1);
}
Expand Down
11 changes: 2 additions & 9 deletions src/lib/dremel/src/Flow/Dremel/Dremel.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,18 @@ public function __construct()
*
* @psalm-suppress UndefinedInterfaceMethod
*/
public function assemble(array $repetitions, array $definitions, array $values) : array
public function assemble(array $repetitions, array $definitions, array $values, int $maxDefinitionLevel, int $maxRepetitionLevel) : array
{
$this->assertInput($repetitions, $definitions);

$maxDefinitionLevel = \count($definitions) ? \max($definitions) : 0;
$maxRepetitionLevel = \count($repetitions) ? \max($repetitions) : 0;

$output = [];
$valueIndex = 0;

if ($maxRepetitionLevel === 0) {
foreach ($definitions as $definition) {
if ($definition === 0) {
$output[] = null;
} elseif ($definition === $maxDefinitionLevel) {
} else {
$output[] = $values[$valueIndex] ?? null;
$valueIndex++;
}
Expand Down Expand Up @@ -80,10 +77,6 @@ public function shred(array $data, int $maxDefinitionLevel) : DataShredded
$repetitions = [];
$this->buildRepetitions($data, 0, 0, $repetitions);

if (!\count($repetitions) || \max($repetitions) === 0) {
$repetitions = [];
}

return new DataShredded(
$repetitions,
$definitions,
Expand Down
10 changes: 8 additions & 2 deletions src/lib/dremel/src/Flow/Dremel/ListNode.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

namespace Flow\Dremel;

use Flow\Dremel\Exception\RuntimeException;

final class ListNode implements Node
{
private int $maxRepetitionLevel;
Expand All @@ -22,11 +24,11 @@ public function __construct(int $maxRepetitionLevel)
public function push(mixed $value, int $level) : self
{
if ($level > $this->maxRepetitionLevel) {
throw new \RuntimeException('Invalid level, max repetition level is ' . $this->maxRepetitionLevel . ' but ' . $level . ' was given');
throw new RuntimeException('Invalid level, max repetition level is ' . $this->maxRepetitionLevel . ' but ' . $level . ' was given');
}

if ($level === 0) {
throw new \RuntimeException('Invalid level, level must be greater than 0');
throw new RuntimeException('Invalid level, level must be greater than 0');
}

$this->pushToLevel($this->value, $value, $level, $level);
Expand All @@ -53,6 +55,10 @@ private function initializeList(int $level) : array
private function initializeListWithValue(int $level, mixed $value) : array
{
if ($level === 1) {
if ($value === null) {
return [];
}

return [$value];
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public function test_deeply_nested_lists() : void

self::assertSame(
$values,
(new Dremel())->assemble($shredded->repetitions, $shredded->definitions, $shredded->values)
(new Dremel())->assemble($shredded->repetitions, $shredded->definitions, $shredded->values, 3, 2)
);
}

Expand All @@ -44,7 +44,7 @@ public function test_dremel_shredding_and_assembling() : void
self::assertSame($repetitions, $shredded->repetitions);
self::assertSame($definitions, $shredded->definitions);

$assembledValues = $dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values);
$assembledValues = $dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values, 2, 1);

self::assertSame($values, $assembledValues);
}
Expand All @@ -63,7 +63,7 @@ public function test_dremel_shredding_and_assembling_list_with_empty_elements()

self::assertSame(
$values,
$dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values)
$dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values, 3, 1)
);
}

Expand All @@ -81,7 +81,7 @@ public function test_dremel_shredding_and_assembling_list_with_nulls_in_list() :

self::assertSame(
$values,
$dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values)
$dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values, 3, 1)
);
}

Expand All @@ -101,7 +101,7 @@ public function test_dremel_shredding_and_assembling_nullable_nested_values() :

self::assertSame(
$values,
$dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values)
$dremel->assemble($shredded->repetitions, $shredded->definitions, $shredded->values, 2, 1)
);
}
}
88 changes: 10 additions & 78 deletions src/lib/dremel/tests/Flow/Dremel/Tests/Unit/DremelAssembleTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public function test_assembly_shred_on_nested_nullable_lists() : void

self::assertSame(
[[[0, 1, 2]], [[null, null]], [[4, 5, 6]]],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 2, 2)
);
}

Expand All @@ -30,7 +30,7 @@ public function test_decode_column_of_integer_list() : void

self::assertSame(
[[3, 7, 5], [4, 4, 7], [10, 6, 4], [10, 3, 2], [10, 4, 4], [5, 3, 2], [1, 4, 3], [4, 3, 9], [10, 3, 4], [5, 7, 4]],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 3, 1)
);
}

Expand All @@ -42,7 +42,7 @@ public function test_decode_flat_column_of_integers_where_every_second_one_is_nu

self::assertSame(
[0, null, 2, null, 4, null, 6, null, 8, null],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 1, 0)
);
}

Expand All @@ -54,7 +54,7 @@ public function test_decode_flat_column_of_integers_without_nulls() : void

self::assertSame(
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 1, 0)
);
}

Expand All @@ -66,7 +66,7 @@ public function test_decode_list_with_nulls_and_different_size_of_each_list() :

self::assertSame(
[[10], [7, null, null], [9], [8, null], [9], [4, null, null, null, null], [6], [3, null, null], [10, null, null], [8, null]],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 3, 1)
);
}

Expand Down Expand Up @@ -178,7 +178,7 @@ public function test_decode_nested_list_v2() : void
[[[8, 4, 4], [10]], [[3, 1, 7], [7], [1, 1]], [[2], [1]]],
[[[6, 5], [7, 3], [9]], [[5, 6, 1]], [[4, 8], [7]]],
],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 7, 3)
);
}

Expand All @@ -190,7 +190,7 @@ public function test_decode_nullable_column_of_integer_list() : void

self::assertSame(
[[5, 9, 2], null, [3, 2, 9], null, [5, 2, 3], null, [7, 2, 3], null, [2, 6, 6], null],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 3, 1)
);
}

Expand All @@ -199,75 +199,7 @@ public function test_decode_when_repetitions_definitions_and_values_does_not_hav
$this->expectException(\InvalidArgumentException::class);
$this->expectExceptionMessage('repetitions, definitions and values count must be exactly the same');

\iterator_to_array((new Dremel())->assemble([1, 2], [1], [1, 2]));
}

public function test_decoding_nested_list_with_nulls_and_different_size_of_each_list() : void
{
$repetitions = [0, 2, 1, 2, 0, 2, 2, 1, 2, 2, 0, 2, 2, 1, 2, 2, 1, 2, 2, 0, 2, 2, 1, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 1, 2, 1, 2, 1, 2, 0, 2, 1, 2, 0, 2, 1, 2, 1, 2, 1, 2, 0, 2, 1, 2, 0, 2, 2, 1, 2, 2, 1, 2, 2];
$definitions = [5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 4, 5, 5, 4, 5, 4, 4, 5, 5, 5, 5, 4, 4, 5, 4, 5, 5, 5, 5, 4, 4, 4, 5, 5, 5, 4, 4, 5, 5, 4, 5, 4, 4, 4, 5, 5, 4, 4, 5];
$values = [9, 5, 9, 6, 6, 7, 7, 9, 3, 10, 4, 4, 5, 4, 6, 9, 10, 9, 6, 3, 3, 8, 7, 10, 4, 3, 8, 2, 8, 3, 6, 10, 4, 5, 4, 5, 2, 10, 5, 1, 2, 2, 3, 9, 9, 9, 9, 9];

/**
* stack: []
* current_list: null.
*
* iteration 0: stack[], current_list[9] - rep 0, def 5 (take next val)
* iteration 1: stack[], current_list[9, 5] - rep 2, def 5 (take next val)
* iteration 2: stack[[[9, 5]]], current_list[9] - rep 1 def 5 (take next val)
* iteration 3: stack[[[9, 5]]], current_list[9, null] - rep 2 def 4 (take null)
* iteration 4: stack[[[9, 5], [9, null]]], current_list[6] - rep 0 def 5 (take next val)
* iteration 5: stack[[[9, 5], [9, null]]], current_list[6,6] - rep 2 def 5 (take next val)
* iteration 6: stack[[[9, 5], [9, null]]], current_list[6,6,7] - rep 2 def 5 (take next val)
* iteration 7: stack[[[9, 5], [9, null]], [[6,6,7]], current_list[7] - rep 1 def 5 (take next val)
*/
self::assertSame(
[
[[9, 5], [9, null]],
[[6, 6, 7], [7, 9, 3]],
[[10, 4, 4], [5, 4, null], [6, 9, 10]],
[[9, null, null], [6, 3, 3]],
[[8, 7, null, 10], [4, null, 3, null]],
[[null, 8], [2, 8], [3, null], [null, 6]],
[[null, 10], [4, 5]],
[[4, null], [null, null], [5, 2], [10, null]],
[[null, 5], [1, null]],
[[2, null, null], [null, 2, 3], [null, null, 9]],
],
(new Dremel())->assemble($repetitions, $definitions, $values)
);
}

public function test_decoding_nested_list_with_nulls_and_different_size_of_each_list_and_deep_nesting() : void
{
$repetitions = [0, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 2, 3, 0, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 3, 1, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 3, 0, 3, 3, 3, 2, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 3, 0, 3, 3, 2, 3, 3, 2, 3, 3, 1, 3, 3, 2, 3, 3, 2, 3, 3, 0, 3, 2, 3, 1, 3, 2, 3, 1, 3, 2, 3, 0, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 0, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 3, 1, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 3, 0, 3, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 0, 3, 3, 2, 3, 3, 2, 3, 3, 1, 3, 3, 2, 3, 3, 2, 3, 3, 1, 3, 3, 2, 3, 3, 2, 3, 3, 0, 3, 3, 3, 2, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 3];
$definitions = [7, 7, 7, 6, 7, 7, 7, 6, 6, 7, 7, 7, 6, 7, 7, 7, 6, 7, 6, 7, 6, 6, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 6, 7, 6, 7, 6, 6, 7, 7, 7, 6, 7, 7, 6, 7, 7, 6, 6, 6, 7, 6, 6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 6, 7, 7, 7, 7, 6, 6, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 6, 7, 6, 7, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 7, 6, 6, 7, 6, 7, 6, 6, 7, 6, 7, 6, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 6, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 6, 6, 7, 7, 7, 6, 6, 7, 7, 6, 6, 7, 6, 6, 6, 6, 7, 6, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 7, 7, 7, 6, 6, 7, 7, 7, 7, 6, 7, 7, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 6, 7, 6, 7, 7, 6];
$values = [10, 5, 9, 1, 10, 7, 6, 3, 1, 6, 9, 9, 4, 6, 3, 8, 5, 8, 1, 10, 9, 4, 6, 2, 8, 4, 6, 8, 9, 4, 10, 4, 8, 4, 3, 9, 2, 10, 8, 5, 2, 7, 8, 7, 10, 3, 8, 1, 5, 10, 6, 5, 6, 1, 6, 5, 6, 7, 1, 6, 10, 7, 8, 2, 8, 2, 7, 7, 8, 7, 3, 9, 2, 3, 10, 1, 8, 3, 8, 7, 9, 5, 9, 8, 3, 6, 3, 10, 10, 10, 6, 8, 7, 2, 1, 6, 2, 4, 4, 1, 7, 2, 1, 7, 8, 9, 3, 9, 4, 8, 4, 10, 9, 5, 9, 2, 1, 6, 9, 3, 6, 10, 4, 5, 9, 10, 8, 3, 3, 9, 7, 10, 9, 4, 10, 5, 1, 2, 7, 6, 10, 2, 5, 7, 5, 3, 9, 10, 1, 5, 7, 6, 7, 4, 4, 6, 7, 7, 6, 3, 6, 1, 8, 2, 8, 3, 7, 9, 3, 5, 8, 5, 2, 2, 5];

/**
* stack: []
* current_list: null.
*
* iteration 0: stack[], current_list[10] - rep 0, def 7 (take next val)
* iteration 1: stack[], current_list[10, 5] - rep 3, def 7 (take next val)
* iteration 2: stack[], current_list[[10, 5], [9]] - rep 2 def 7 (take next val)
* iteration 3: stack[], current_list[[10, 5], [9, null]] - rep 3 def 6 (take null)
*/
self::assertSame(
[
[[[10, 5], [9, null], [1, 10], [7, null]], [[null, 6], [3, 1], [null, 6], [9, 9]], [[null, 4], [null, 6], [null, null], [3, 8]], [[5, 8], [1, 10], [null, null], [9, 4]]],
[[[null, 6, null], [2, null, null], [8, 4, 6], [null, 8, 9]], [[null, 4, 10], [null, null, null], [4, null, null], [8, 4, 3]]],
[[[9, 2, 10, null], [8, 5, 2, 7]], [[null, 8, 7, 10], [3, null, null, 8]], [[1, 5, 10, 6], [null, 5, 6, 1]], [[6, 5, 6, 7], [1, 6, null, 10]]],
[[[7, 8, 2], [8, 2, null], [7, null, 7]], [[8, 7, 3], [9, 2, null], [3, null, null]]],
[[[null, 10], [1, 8]], [[3, 8], [7, 9]], [[5, null], [9, null]]],
[[[null, 8, null, 3], [null, null, 6, null], [3, null, 10, 10], [null, 10, 6, 8]], [[7, 2, 1, 6], [2, 4, null, null], [4, 1, null, 7], [2, 1, 7, 8]]],
[[[9, null, 3], [9, 4, 8], [null, 4, 10], [9, null, 5]], [[9, 2, 1], [6, 9, null], [3, 6, 10], [4, null, null]]],
[[[5, 9], [10, null], [null, 8]], [[3, null], [null, 3], [null, null]], [[null, null], [9, null], [7, 10]], [[9, null], [4, 10], [5, 1]]],
[[[2, 7, 6], [10, 2, null], [null, null, null]], [[5, 7, 5], [null, null, 3], [9, 10, 1]], [[null, 5, 7], [null, null, 6], [7, 4, 4]]],
[[[6, 7, 7, 6], [3, 6, 1, 8]], [[2, 8, 3, null], [7, 9, 3, 5]], [[8, 5, null, 2], [null, 2, 5, null]]],
],
(new Dremel())->assemble($repetitions, $definitions, $values)
);
\iterator_to_array((new Dremel())->assemble([1, 2], [1], [1, 2], 1, 1));
}

public function test_reconstructing_map() : void
Expand Down Expand Up @@ -347,7 +279,7 @@ public function test_reconstructing_map() : void
],
],
],
(new Dremel())->assemble($repetitions, $definitions, $values)
(new Dremel())->assemble($repetitions, $definitions, $values, 11, 4)
);
}

Expand All @@ -360,6 +292,6 @@ public function test_starting_repetitions_with_1() : void
$definitions = [4, 4, 4, 4, 4, 4];
$values = ['value', 'value', 'value', 'value', 'value', 'value'];

(new Dremel())->assemble($repetitions, $definitions, $values);
(new Dremel())->assemble($repetitions, $definitions, $values, 1, 1);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public function test_dremel_shred_on_a_flat_non_nullable_columns() : void
$shredded = (new Dremel())->shred($data, 10);
self::assertEquals(
[
'repetitions' => [],
'repetitions' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'definitions' => [10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
'values' => $data,
],
Expand Down

This file was deleted.

3 changes: 1 addition & 2 deletions src/lib/parquet/src/Flow/Parquet/Data/DataConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

namespace Flow\Parquet\Data;

use Flow\Parquet\Data\Converter\{BytesStringConverter,
use Flow\Parquet\Data\Converter\{
Int32DateConverter,
Int32DateTimeConverter,
Int64DateTimeConverter,
Expand Down Expand Up @@ -40,7 +40,6 @@ public static function initialize(Options $options) : self
new Int32DateTimeConverter(),
new Int64DateTimeConverter(),
new Int96DateTimeConverter(),
new BytesStringConverter(),
new UuidConverter(),
new JsonConverter(),
],
Expand Down
Loading

0 comments on commit ab5af92

Please sign in to comment.