Skip to content

Commit be1b369

Browse files
committed
DataFrame::run method can now return execution report with Schema and Statistics
1 parent 9bc119e commit be1b369

File tree

5 files changed

+2657
-1
lines changed

5 files changed

+2657
-1
lines changed

src/core/etl/src/Flow/ETL/DataFrame.php

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
use function Flow\ETL\DSL\to_output;
88
use Flow\ETL\DataFrame\GroupedDataFrame;
99
use Flow\ETL\DataFrame\PartitionedDataFrame;
10+
use Flow\ETL\Dataset\Report;
11+
use Flow\ETL\Dataset\Statistics;
1012
use Flow\ETL\Exception\InvalidArgumentException;
1113
use Flow\ETL\Exception\InvalidFileFormatException;
1214
use Flow\ETL\Exception\RuntimeException;
@@ -748,17 +750,32 @@ public function rows(Transformer|Transformation $transformer) : self
748750
* @trigger
749751
*
750752
* @param null|callable(Rows $rows): void $callback
753+
* @param bool $analyze - when true, run returns a Report holding the merged Schema and row Statistics; otherwise it returns null
751754
*/
752755
#[DSLMethod(exclude: true)]
753-
public function run(?callable $callback = null) : void
756+
public function run(?callable $callback = null, bool $analyze = false) : null|Report
{
    // Execute against a clone of this data frame.
    $frame = clone $this;

    // Accumulators are only fed (and only reported) when $analyze is true.
    $mergedSchema = new Schema();
    $rowsCount = 0;

    foreach ($frame->pipeline->process($frame->context) as $batch) {
        // The optional callback sees every batch before it is analyzed.
        if ($callback !== null) {
            $callback($batch);
        }

        if (!$analyze) {
            continue;
        }

        $mergedSchema = $mergedSchema->merge($batch->schema());
        $rowsCount += $batch->count();
    }

    // Without analysis there is nothing to report.
    return $analyze
        ? new Report($mergedSchema, new Statistics($rowsCount))
        : null;
}
763780

764781
/**
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Dataset;
6+
7+
use Flow\ETL\Row\Schema;
8+
9+
/**
 * Immutable result of an analyzed run: the Schema paired with Statistics.
 */
final class Report
{
    private readonly Schema $schema;

    private readonly Statistics $statistics;

    /**
     * @param Schema $schema schema to expose through schema()
     * @param Statistics $statistics statistics to expose through statistics()
     */
    public function __construct(Schema $schema, Statistics $statistics)
    {
        $this->schema = $schema;
        $this->statistics = $statistics;
    }

    /**
     * Schema passed to the constructor.
     */
    public function schema() : Schema
    {
        return $this->schema;
    }

    /**
     * Statistics passed to the constructor.
     */
    public function statistics() : Statistics
    {
        return $this->statistics;
    }
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Dataset;
6+
7+
/**
 * Immutable holder for the total number of rows.
 */
final class Statistics
{
    private readonly int $totalRows;

    /**
     * @param int $totalRows total number of rows to report
     */
    public function __construct(int $totalRows)
    {
        $this->totalRows = $totalRows;
    }

    /**
     * Total number of rows passed to the constructor.
     */
    public function totalRows() : int
    {
        return $this->totalRows;
    }
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Tests\Integration\DataFrame;
6+
7+
use function Flow\ETL\Adapter\CSV\from_csv;
8+
use function Flow\ETL\Adapter\Text\from_text;
9+
use function Flow\ETL\DSL\datetime_schema;
10+
use function Flow\ETL\DSL\df;
11+
use function Flow\ETL\DSL\float_schema;
12+
use function Flow\ETL\DSL\int_schema;
13+
use function Flow\ETL\DSL\schema;
14+
use function Flow\ETL\DSL\str_schema;
15+
use Flow\ETL\Tests\Integration\IntegrationTestCase;
16+
17+
/**
 * Integration tests for DataFrame::run() called with analyze: true.
 *
 * run() is declared as null|Report, so every test first asserts that a report
 * was actually returned before reading statistics or schema from it.
 */
final class AnalyzeTest extends IntegrationTestCase
{
    /**
     * With autoCast() the reported schema carries the cast column types.
     */
    public function test_analyzing_csv_file_with_auto_cast() : void
    {
        $report = df()
            ->read(from_csv(__DIR__ . '/Fixtures/Analyze/goldstock.csv'))
            ->autoCast()
            ->run(analyze: true);

        // Fail with a clear assertion instead of a null dereference error.
        $this->assertNotNull($report);
        $this->assertSame(2511, $report->statistics()->totalRows());
        $this->assertEquals(
            schema(
                int_schema('Index'),
                datetime_schema('Date'),
                float_schema('Close'),
                float_schema('Volume'),
                float_schema('Open'),
                float_schema('High'),
                float_schema('Low'),
            ),
            $report->schema()
        );
        $this->assertSame(7, $report->schema()->count());
    }

    /**
     * limit(100) caps the reported row total; without casting all columns are strings.
     */
    public function test_analyzing_csv_file_with_limit() : void
    {
        $report = df()
            ->read(from_csv(__DIR__ . '/Fixtures/Analyze/goldstock.csv'))
            ->limit(100)
            ->run(analyze: true);

        // Fail with a clear assertion instead of a null dereference error.
        $this->assertNotNull($report);
        $this->assertSame(100, $report->statistics()->totalRows());
        $this->assertEquals(
            schema(
                str_schema('Index'),
                str_schema('Date'),
                str_schema('Close'),
                str_schema('Volume'),
                str_schema('Open'),
                str_schema('High'),
                str_schema('Low'),
            ),
            $report->schema()
        );
        $this->assertSame(7, $report->schema()->count());
    }

    /**
     * Reading a partitioned path reports the partition columns next to the text column.
     */
    public function test_analyzing_partitioned_datasets() : void
    {
        $report = df()
            ->read(from_text(__DIR__ . '/Fixtures/Partitioning/multi_partition_pruning_test/year=*/month=*/day=*/*.txt'))
            ->run(analyze: true);

        // Fail with a clear assertion instead of a null dereference error.
        $this->assertNotNull($report);
        $this->assertSame(7, $report->statistics()->totalRows());
        $this->assertEquals(
            schema(
                str_schema('year'),
                str_schema('month'),
                str_schema('day'),
                str_schema('text'),
            ),
            $report->schema()
        );
    }
}

0 commit comments

Comments
 (0)