Skip to content

Commit 00ff0ee

Browse files
authored
Added file analyze command to CLI (#1438)
* Added file analyze command to CLI * Number format output rows
1 parent 78a9be7 commit 00ff0ee

File tree

6 files changed

+300
-3
lines changed

6 files changed

+300
-3
lines changed

documentation/components/cli/docs.md

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,46 @@ $ flow schema orders.csv --table --auto-cast
173173
7 rows
174174
```
175175
176+
177+
### `file:analyze` alias `analyze`
178+
179+
```shell
180+
file:analyze --help
181+
Description:
182+
Analyze a file.
183+
184+
Usage:
185+
file:analyze [options] [--] <input-file>
186+
analyze
187+
188+
Arguments:
189+
input-file Path to a file that should be analyzed.
190+
191+
Options:
192+
--input-file-format=INPUT-FILE-FORMAT File format. When not set file format is guessed from source file path extension
193+
--input-file-batch-size=INPUT-FILE-BATCH-SIZE Number of rows that are going to be read and analyzed in one batch, must be greater than 0 [default: 1000]
194+
--input-file-limit=INPUT-FILE-LIMIT Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed
195+
--config=CONFIG Path to a local php file that MUST return instance of: Flow\ETL\Config
196+
--input-json-pointer=INPUT-JSON-POINTER JSON Pointer to a subtree from which schema should be extracted
197+
--input-json-pointer-entry-name When set, JSON Pointer will be used as an entry name in the schema
198+
--input-csv-header[=INPUT-CSV-HEADER] When set, CSV header will be used as a schema
199+
--input-csv-empty-to-null[=INPUT-CSV-EMPTY-TO-NULL] When set, empty CSV values will be treated as NULL values
200+
--input-csv-separator=INPUT-CSV-SEPARATOR CSV separator character
201+
--input-csv-enclosure=INPUT-CSV-ENCLOSURE CSV enclosure character
202+
--input-csv-escape=INPUT-CSV-ESCAPE CSV escape character
203+
--input-xml-node-path=INPUT-XML-NODE-PATH XML node path to a subtree from which schema should be extracted, for example /root/element This is not xpath, just a node names separated by slash
204+
--input-xml-buffer-size=INPUT-XML-BUFFER-SIZE XML buffer size in bytes
205+
--input-parquet-columns=INPUT-PARQUET-COLUMNS Columns to read from parquet file (multiple values allowed)
206+
--input-parquet-offset=INPUT-PARQUET-OFFSET Offset to start reading from
207+
-h, --help Display help for the given command. When no command is given display help for the list command
208+
--silent Do not output any message
209+
-q, --quiet Only errors are displayed. All other output is suppressed
210+
-V, --version Display this application version
211+
--ansi|--no-ansi Force (or disable --no-ansi) ANSI output
212+
-n, --no-interaction Do not ask any interactive question
213+
-v|vv|vvv, --verbose Increase the verbosity of messages: 1 for normal output, 2 for more verbose output and 3 for debug
214+
```
215+
176216
### `file:read` alias `read`
177217
178218
```shell
@@ -315,4 +355,4 @@ Help:
315355
]))
316356
->collect()
317357
->write(to_output());
318-
```
358+
```

src/cli/flow

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env php
22
<?php declare(strict_types=1);
33

4+
use Flow\CLI\Command\FileAnalyzeCommand;
45
use Flow\CLI\Command\FileConvertCommand;
56
use Flow\CLI\Command\FileReadCommand;
67
use Flow\CLI\Command\FileRowsCountCommand;
@@ -52,5 +53,6 @@ $application->add((new FileReadCommand())->setName('file:read')->setAliases(['re
5253
$application->add((new FileSchemaCommand())->setName('file:schema')->setAliases(['schema']));
5354
$application->add((new FileRowsCountCommand())->setName('file:rows:count')->setAliases(['count']));
5455
$application->add((new FileConvertCommand())->setName('file:convert')->setAliases(['convert']));
56+
$application->add((new FileAnalyzeCommand())->setName('file:analyze')->setAliases(['analyze']));
5557

5658
$application->run();
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\CLI\Command;
6+
7+
use function Flow\CLI\{option_int, option_int_nullable};
8+
use function Flow\ETL\DSL\{df};
9+
use Flow\CLI\Arguments\{FilePathArgument};
10+
use Flow\CLI\Command\Traits\{
11+
CSVOptions,
12+
ConfigOptions,
13+
JSONOptions,
14+
ParquetOptions,
15+
XMLOptions
16+
};
17+
use Flow\CLI\Factory\ExtractorFactory;
18+
use Flow\CLI\Options\{ConfigOption, FileFormat, FileFormatOption};
19+
use Flow\CLI\Style\FlowStyle;
20+
use Flow\ETL\{Config, Rows};
21+
use Flow\Filesystem\Path;
22+
use Symfony\Component\Console\Command\Command;
23+
use Symfony\Component\Console\Input\{InputArgument, InputInterface, InputOption};
24+
use Symfony\Component\Console\Output\OutputInterface;
25+
26+
/**
 * `file:analyze` CLI command.
 *
 * Reads the given input file through a Flow data frame with analysis enabled,
 * then prints the resulting schema as a table followed by the number of
 * analyzed rows and the execution time.
 */
final class FileAnalyzeCommand extends Command
{
    use ConfigOptions;
    use CSVOptions;
    use JSONOptions;
    use ParquetOptions;
    use XMLOptions;

    private const DEFAULT_BATCH_SIZE = 1_000;

    private ?FileFormat $fileFormat = null;

    private ?Config $flowConfig = null;

    private ?Path $sourcePath = null;

    /**
     * Declares the command name, its single required argument, the generic
     * input-file options and the format specific (JSON/CSV/XML/Parquet) options.
     */
    public function configure() : void
    {
        $this
            ->setName('file:analyze')
            ->setDescription('Analyze a file.')
            ->addArgument('input-file', InputArgument::REQUIRED, 'Path to a file that should be analyzed.')
            // Options are configured with InputOption::VALUE_* modes, not InputArgument::* ones.
            // VALUE_REQUIRED keeps the `--input-file-format=VALUE` form of the option.
            ->addOption('input-file-format', null, InputOption::VALUE_REQUIRED, 'File format. When not set file format is guessed from source file path extension', null)
            // The description must agree with execute(), which rejects every batch size <= 0.
            ->addOption('input-file-batch-size', null, InputOption::VALUE_REQUIRED, 'Number of rows that are going to be read and analyzed in one batch, must be greater than 0', self::DEFAULT_BATCH_SIZE)
            ->addOption('input-file-limit', null, InputOption::VALUE_REQUIRED, 'Limit number of rows that are going to be used to infer file schema, when not set whole file is analyzed', null);

        $this->addConfigOptions($this);
        $this->addJSONInputOptions($this);
        $this->addCSVInputOptions($this);
        $this->addXMLInputOptions($this);
        $this->addParquetInputOptions($this);
    }

    /**
     * Runs the analysis and renders the report.
     *
     * @return int Command::FAILURE when the batch size is not positive or when no report
     *             was produced, Command::SUCCESS otherwise
     *
     * @throws \JsonException when schema definition metadata cannot be encoded to JSON
     */
    protected function execute(InputInterface $input, OutputInterface $output) : int
    {
        $style = new FlowStyle($input, $output);

        $style->title('Analyzing File');
        $style->info('File path: ' . $this->sourcePath->basename());

        $df = df($this->flowConfig)->read((new ExtractorFactory($this->sourcePath, $this->fileFormat))->get($input));

        $batchSize = option_int('input-file-batch-size', $input, self::DEFAULT_BATCH_SIZE);

        if ($batchSize <= 0) {
            $style->error('Batch size must be greater than 0.');

            return Command::FAILURE;
        }

        $df->batchSize($batchSize)
            ->autoCast();

        $limit = option_int_nullable('input-file-limit', $input);

        // A missing or non-positive limit means "no limit": the whole file is analyzed.
        if ($limit !== null && $limit > 0) {
            $df->limit($limit);
        }

        $progress = $style->createProgressBar();
        $progress->setFormat('Analyzed Rows: %current% %bar%');

        $report = $df->run(
            static function (Rows $rows) use ($progress) : void {
                $progress->advance($rows->count());
            },
            analyze: true
        );

        if ($report === null) {
            $style->error("Couldn't analyze given file.");

            return Command::FAILURE;
        }

        $progress->finish();

        $style->newLine(2);

        $style->clear();

        $style->section('Schema');

        $normalizedSchema = [];

        foreach ($report->schema()->definitions() as $definition) {
            $normalizedSchema[] = [
                'name' => $definition->entry()->name(),
                'type' => $definition->type()->toString(),
                'nullable' => $definition->isNullable() ? 'true' : 'false',
                // Throw on encoding problems instead of silently rendering `false` in the table cell.
                'metadata' => $definition->metadata() !== null
                    ? json_encode($definition->metadata(), JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR)
                    : null,
            ];
        }

        $style->createTable()
            ->setHeaders(['Name', 'Type', 'Nullable', 'Metadata'])
            ->setRows($normalizedSchema)
            ->setStyle('box')
            ->render();

        $formatter = $this->getHelper('formatter');

        $style->section('Statistics');

        $output->writeln(
            $formatter->formatBlock(
                [
                    'Analyzed Rows: ' . \number_format($report->statistics()->totalRows()),
                ],
                'blue-block',
                true
            )
        );

        $output->writeln(
            $formatter->formatBlock(
                [
                    'Execution Time: ' . $report->statistics()->executionTime->highResolutionTime->toString(),
                ],
                'blue-block',
                true
            )
        );

        return Command::SUCCESS;
    }

    /**
     * Resolves the Flow config, the existing source path and the file format from the input.
     *
     * Symfony Console invokes this hook before execute(), so the three properties
     * are populated by the time the analysis starts.
     */
    protected function initialize(InputInterface $input, OutputInterface $output) : void
    {
        $this->flowConfig = (new ConfigOption('config'))->get($input);
        $this->sourcePath = (new FilePathArgument('input-file'))->getExisting($input, $this->flowConfig);
        $this->fileFormat = (new FileFormatOption($this->sourcePath, 'input-file-format'))->get($input);
    }
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\CLI\Style;
6+
7+
use Symfony\Component\Console\Cursor;
8+
use Symfony\Component\Console\Formatter\{OutputFormatterStyle};
9+
use Symfony\Component\Console\Input\InputInterface;
10+
use Symfony\Component\Console\Output\OutputInterface;
11+
use Symfony\Component\Console\Style\{SymfonyStyle};
12+
13+
/**
 * SymfonyStyle flavour used by Flow CLI commands.
 *
 * On construction it registers the Flow specific formatter styles on the
 * output formatter; it also exposes a helper for clearing the output.
 */
final class FlowStyle extends SymfonyStyle
{
    public function __construct(InputInterface $input, private readonly OutputInterface $output)
    {
        parent::__construct($input, $output);

        $formatter = $output->getFormatter();
        $emphasis = ['bold', 'blink'];

        // Style name => definition, registered in declaration order.
        $styles = [
            'blue-block' => new OutputFormatterStyle('white', 'blue'),
            'flow-orange-01' => new OutputFormatterStyle('#FF5547', null, $emphasis),
            'flow-blue-01' => new OutputFormatterStyle('#806DFE', null, $emphasis),
            'flow-blue-02' => new OutputFormatterStyle('#5945D8', null, $emphasis),
            'flow-blue-03' => new OutputFormatterStyle('#4026AC', null, $emphasis),
        ];

        foreach ($styles as $styleName => $definition) {
            $formatter->setStyle($styleName, $definition);
        }
    }

    /**
     * Clears the output through a Cursor bound to this style's output.
     */
    public function clear() : void
    {
        $cursor = new Cursor($this->output);
        $cursor->clearOutput();
    }
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\CLI\Tests\Integration;
6+
7+
use Flow\CLI\Command\{FileAnalyzeCommand};
8+
use PHPUnit\Framework\TestCase;
9+
use Symfony\Component\Console\Application;
10+
use Symfony\Component\Console\Tester\CommandTester;
11+
12+
/**
 * Integration test for the `file:analyze` CLI command.
 */
final class FileAnalyzeCommandTest extends TestCase
{
    /**
     * Analyzes the first 5 rows of the CSV fixture and checks the title/info block,
     * the rendered schema table and both statistics lines.
     *
     * The expected table keeps the cell padding of the "box" table style: every
     * cell is padded to the width of its column, as given by the border lines.
     */
    public function test_analyze_csv() : void
    {
        $application = new Application();
        $application->add(new FileAnalyzeCommand());
        $tester = new CommandTester($application->get('file:analyze'));

        $tester->execute(['input-file' => __DIR__ . '/Fixtures/orders.csv', '--input-file-limit' => 5]);

        $tester->assertCommandIsSuccessful();

        // Read the captured output once and reuse it for every assertion.
        $display = $tester->getDisplay();

        self::assertStringContainsString(
            <<<'OUTPUT'
            Analyzing File
            ==============

             [INFO] File path: orders.csv
            OUTPUT,
            $display
        );

        self::assertStringContainsString(
            <<<'OUTPUT'
            ┌────────────┬───────────────────────────────────────────────────────────────┬──────────┬──────────┐
            │ Name       │ Type                                                          │ Nullable │ Metadata │
            ├────────────┼───────────────────────────────────────────────────────────────┼──────────┼──────────┤
            │ order_id   │ uuid                                                          │ false    │ {}       │
            │ created_at │ datetime                                                      │ false    │ {}       │
            │ updated_at │ datetime                                                      │ false    │ {}       │
            │ discount   │ ?float                                                        │ true     │ {}       │
            │ address    │ map<string, string>                                           │ false    │ {}       │
            │ notes      │ list<string>                                                  │ false    │ {}       │
            │ items      │ list<structure{sku: string, quantity: integer, price: float}> │ false    │ {}       │
            └────────────┴───────────────────────────────────────────────────────────────┴──────────┴──────────┘
            OUTPUT,
            $display
        );

        self::assertStringContainsString(
            <<<'OUTPUT'
            Analyzed Rows: 5
            OUTPUT,
            $display
        );

        self::assertStringContainsString(
            <<<'OUTPUT'
            Execution Time:
            OUTPUT,
            $display
        );
    }
}

src/core/etl/src/Flow/ETL/Dataset/Statistics/HighResolutionTime.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,9 @@ public function toSeconds() : float
5353
return $this->seconds + $this->nanoseconds / 1_000_000_000;
5454
}
5555

56-
public function toString() : string
56+
public function toString(int $precision = 9) : string
5757
{
58-
$formatted = number_format($this->toSeconds(), 9, '.', '');
58+
$formatted = number_format($this->toSeconds(), $precision, '.', '');
5959
$formatted = rtrim($formatted, '0');
6060
$formatted = rtrim($formatted, '.');
6161

0 commit comments

Comments
 (0)