Skip to content

Commit 2253b91

Browse files
authored
Added DataFrame::schema method to just read schema from dataset (#925)
1 parent e5d4aeb commit 2253b91

File tree

7 files changed

+200
-0
lines changed

7 files changed

+200
-0
lines changed

src/core/etl/src/Flow/ETL/DSL/functions.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -984,6 +984,14 @@ function list_schema(string $name, ListType $type, bool $nullable = false, ?Sche
984984
return Definition::list($name, $type, $nullable, $constraint, $metadata);
985985
}
986986

987+
/**
 * Create a schema definition for an entry whose value may be stored as any of the given entry classes.
 *
 * Thin DSL shortcut that forwards all arguments to Definition::union().
 *
 * @param string $name name of the entry the definition describes
 * @param array<class-string<Row\Entry>> $entry_classes entry classes allowed for this entry
 * @param null|Schema\Constraint $constraint optional constraint passed through to the definition
 * @param null|Schema\Metadata $metadata optional metadata passed through to the definition
 */
function union_schema(string $name, array $entry_classes, ?Schema\Constraint $constraint = null, ?Schema\Metadata $metadata = null) : Definition
{
    $definition = Definition::union($name, $entry_classes, $constraint, $metadata);

    return $definition;
}
994+
987995
/**
988996
* @param class-string<\UnitEnum> $type
989997
*/

src/core/etl/src/Flow/ETL/DataFrame.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,20 @@ public function saveMode(SaveMode $mode) : self
769769
return $this->mode($mode);
770770
}
771771

772+
/**
 * Process the pipeline and return one schema built by merging the schema of
 * every item the pipeline yields.
 *
 * Starts from an empty Schema, so a pipeline that yields nothing returns that empty Schema.
 *
 * @trigger
 */
public function schema() : Schema
{
    $merged = new Schema();
    $batches = $this->pipeline->process($this->context);

    // Fold batch by batch so the whole dataset is never held at once.
    foreach ($batches as $batch) {
        $merged = $merged->merge($batch->schema());
    }

    return $merged;
}
785+
772786
/**
773787
* @lazy
774788
* Keep only given entries.

src/core/etl/src/Flow/ETL/Row/Schema.php

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,17 @@ public function merge(self $schema) : self
104104
return new self(...\array_values($newDefinitions));
105105
}
106106

107+
/**
 * Return a new schema in which every definition has been replaced by the
 * result of its own narrow() call; the current schema is left untouched.
 */
public function narrow() : self
{
    // array_values() guarantees a plain list is spread into the constructor,
    // regardless of how the definitions are keyed internally.
    $narrowed = \array_values(\array_map(
        static fn ($definition) => $definition->narrow(),
        $this->definitions
    ));

    return new self(...$narrowed);
}
117+
107118
public function nullable() : self
108119
{
109120
$definitions = [];

src/core/etl/src/Flow/ETL/Row/Schema/Definition.php

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,15 @@ public function metadata() : Metadata
283283
return $this->metadata;
284284
}
285285

286+
/**
 * Collapse a union definition into a string definition.
 *
 * A union definition is replaced by a string definition carrying the same
 * ref, nullability, constraint and metadata. Any other definition is
 * returned as-is (the very same instance).
 */
public function narrow() : self
{
    return $this->isUnion()
        ? self::string($this->ref, $this->isNullable(), $this->constraint, $this->metadata)
        : $this;
}
294+
286295
public function nullable() : self
287296
{
288297
if (!\in_array(NullEntry::class, $this->classes, true)) {
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Tests\Integration\DataFrame;
6+
7+
use function Flow\ETL\DSL\array_to_rows;
8+
use function Flow\ETL\DSL\bool_schema;
9+
use function Flow\ETL\DSL\df;
10+
use function Flow\ETL\DSL\from_rows;
11+
use function Flow\ETL\DSL\int_schema;
12+
use function Flow\ETL\DSL\schema;
13+
use function Flow\ETL\DSL\str_schema;
14+
use function Flow\ETL\DSL\union_schema;
15+
use Flow\ETL\Row\Entry\IntegerEntry;
16+
use Flow\ETL\Row\Entry\StringEntry;
17+
use Flow\ETL\Tests\Integration\IntegrationTestCase;
18+
19+
/**
 * Integration tests for DataFrame::schema(): the schema is read by running
 * the data frame over in-memory rows and comparing it with a hand-built one.
 */
final class SchemaTest extends IntegrationTestCase
{
    public function test_getting_schema() : void
    {
        // 100 rows with an integer id, a string name and a boolean flag.
        $rows = array_to_rows(\array_map(
            static fn (int $i) : array => [
                'id' => $i,
                'name' => 'name_' . $i,
                'active' => $i % 2 === 0,
            ],
            \range(1, 100)
        ));

        $expected = schema(
            int_schema('id'),
            str_schema('name'),
            bool_schema('active')
        );

        $actual = df()
            ->read(from_rows($rows))
            ->autoCast()
            ->schema();

        $this->assertEquals($expected, $actual);
    }

    public function test_getting_schema_from_limited_rows() : void
    {
        // 'union' holds an integer for ids 1..50 and a string for ids 51..100.
        $rows = array_to_rows(\array_map(
            static fn (int $i) : array => [
                'id' => $i,
                'name' => 'name_' . $i,
                'active' => $i % 2 === 0,
                'union' => $i > 50 ? 'string' : 1,
            ],
            \range(1, 100)
        ));

        // With limit(50) the test expects 'union' to be reported as a plain integer.
        $expected = schema(
            int_schema('id'),
            str_schema('name'),
            bool_schema('active'),
            int_schema('union')
        );

        $actual = df()
            ->read(from_rows($rows))
            ->autoCast()
            ->limit(50)
            ->schema();

        $this->assertEquals($expected, $actual);
    }

    public function test_getting_schema_with_union_type() : void
    {
        // 'union' holds an integer for ids 1..50 and a string for ids 51..100.
        $rows = array_to_rows(\array_map(
            static fn (int $i) : array => [
                'id' => $i,
                'name' => 'name_' . $i,
                'active' => $i % 2 === 0,
                'union' => $i > 50 ? 'string' : 1,
            ],
            \range(1, 100)
        ));

        // Without a limit both value kinds are seen, so a union definition is expected.
        $expected = schema(
            int_schema('id'),
            str_schema('name'),
            bool_schema('active'),
            union_schema('union', [IntegerEntry::class, StringEntry::class])
        );

        $actual = df()
            ->read(from_rows($rows))
            ->autoCast()
            ->schema();

        $this->assertEquals($expected, $actual);
    }
}

src/core/etl/tests/Flow/ETL/Tests/Unit/Row/Schema/DefinitionTest.php

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66

77
use function Flow\ETL\DSL\bool_entry;
88
use function Flow\ETL\DSL\int_entry;
9+
use function Flow\ETL\DSL\int_schema;
910
use function Flow\ETL\DSL\null_entry;
1011
use function Flow\ETL\DSL\str_entry;
12+
use function Flow\ETL\DSL\str_schema;
1113
use function Flow\ETL\DSL\struct_element;
1214
use function Flow\ETL\DSL\struct_entry;
1315
use function Flow\ETL\DSL\struct_type;
@@ -17,6 +19,7 @@
1719
use Flow\ETL\PHP\Type\Logical\List\ListElement;
1820
use Flow\ETL\PHP\Type\Logical\ListType;
1921
use Flow\ETL\PHP\Type\Logical\StructureType;
22+
use Flow\ETL\Row\Entry\DateTimeEntry;
2023
use Flow\ETL\Row\Entry\IntegerEntry;
2124
use Flow\ETL\Row\Entry\ListEntry;
2225
use Flow\ETL\Row\Entry\NullEntry;
@@ -175,6 +178,36 @@ public function test_multi_types_is_not_union() : void
175178
$this->assertTrue(Definition::union('id', [IntegerEntry::class, StringEntry::class, NullEntry::class])->isUnion());
176179
}
177180

181+
public function test_narrow_non_union_type() : void
{
    $definition = int_schema('int');
    $narrowed = $definition->narrow();

    // A non-union definition must come back as the very same instance.
    $this->assertSame($definition, $narrowed);
}
190+
191+
public function test_narrow_nullable_union_type() : void
{
    $union = Definition::union(
        'test',
        [IntegerEntry::class, StringEntry::class, DateTimeEntry::class, NullEntry::class]
    );

    // NullEntry is part of the union, so the narrowed string definition is expected to be nullable.
    $expected = str_schema('test', true);

    $this->assertEquals($expected, $union->narrow());
}
200+
201+
public function test_narrow_union_type() : void
{
    $union = Definition::union(
        'test',
        [IntegerEntry::class, StringEntry::class, DateTimeEntry::class]
    );

    // A union without NullEntry is expected to narrow to a plain string definition.
    $expected = str_schema('test');

    $this->assertEquals($expected, $union->narrow());
}
210+
178211
public function test_not_matches_when_constraint_not_satisfied() : void
179212
{
180213
$constraint = $this->createMock(Constraint::class);

src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/SchemaTest.php

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,13 @@
22

33
namespace Flow\Parquet\Tests\Unit\ParquetFile;
44

5+
use function Flow\ETL\DSL\int_schema;
6+
use function Flow\ETL\DSL\str_schema;
7+
use function Flow\ETL\DSL\union_schema;
58
use Flow\ETL\Adapter\Elasticsearch\Tests\Integration\TestCase;
9+
use Flow\ETL\Row\Entry\IntegerEntry;
10+
use Flow\ETL\Row\Entry\NullEntry;
11+
use Flow\ETL\Row\Entry\StringEntry;
612
use Flow\Parquet\ParquetFile\Schema;
713
use Flow\Parquet\ParquetFile\Schema\FlatColumn;
814
use Flow\Parquet\ParquetFile\Schema\ListElement;
@@ -105,4 +111,20 @@ public function test_flattening_schema_to_receive_simple_array_of_flat_columns()
105111
$this->assertInstanceOf(FlatColumn::class, $column);
106112
}
107113
}
114+
115+
public function test_narrowing_schema_with_union_types() : void
{
    // Narrow a schema that mixes a plain definition with a union that includes NullEntry.
    $narrowed = \Flow\ETL\DSL\schema(
        int_schema('id'),
        union_schema('tracking_number', [StringEntry::class, IntegerEntry::class, NullEntry::class]),
    )->narrow();

    // 'id' stays an integer; the union is expected to become a nullable string.
    $expected = \Flow\ETL\DSL\schema(
        int_schema('id'),
        str_schema('tracking_number', true),
    );

    $this->assertEquals($expected, $narrowed);
}
108130
}

0 commit comments

Comments
 (0)