Skip to content

Commit 2da8585

Browse files
authored
Added DataFrame::autoCast() (#923)
* Added DataFrame::autoCast() * Delay using json_validate to avoid turning scalars into jsons
1 parent afddb71 commit 2da8585

File tree

7 files changed

+416
-68
lines changed

7 files changed

+416
-68
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
use function Flow\ETL\Adapter\CSV\from_csv;
6+
use function Flow\ETL\DSL\to_output;
7+
use Aeon\Calendar\Stopwatch;
8+
use Flow\ETL\Flow;
9+
use Flow\ETL\Loader\StreamLoader\Output;
10+
11+
// Example: read a CSV dataset, auto-cast its string columns and print rows together with the schema.
require __DIR__ . '/../../../bootstrap.php';

// Generate the sample dataset when it is not present yet.
if (false === \file_exists(__FLOW_OUTPUT__ . '/dataset.csv')) {
    include __DIR__ . '/../../../setup/php_to_csv.php';
}

// Build the pipeline step by step; nothing is executed until run() is called.
$flow = (new Flow())->read(from_csv(__FLOW_OUTPUT__ . '/dataset.csv'));
$flow = $flow->limit(1000)->autoCast()->collect();
$flow = $flow->write(to_output(false, Output::rows_and_schema));

// When FLOW_PHAR_APP is set, hand the prepared pipeline back to the includer instead of running it here.
if ($_ENV['FLOW_PHAR_APP'] ?? false) {
    return $flow;
}

// Report the dataset size in megabytes, rounded to a whole number.
$csvFileSize = \round(\filesize(__FLOW_OUTPUT__ . '/dataset.csv') / 1024 / 1024);
print "Reading CSV {$csvFileSize}Mb file...\n";

// Measure only the execution of the pipeline.
$stopwatch = new Stopwatch();
$stopwatch->start();

$flow->run();

$stopwatch->stop();

print "Total elapsed time: {$stopwatch->totalElapsedTime()->inSecondsPrecise()}s\n";

src/core/etl/src/Flow/ETL/DataFrame.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
use Flow\ETL\Row\Reference;
3232
use Flow\ETL\Row\References;
3333
use Flow\ETL\Row\Schema;
34+
use Flow\ETL\Transformer\AutoCastTransformer;
3435
use Flow\ETL\Transformer\CallbackRowTransformer;
3536
use Flow\ETL\Transformer\CrossJoinRowsTransformer;
3637
use Flow\ETL\Transformer\DropDuplicatesTransformer;
@@ -148,6 +149,13 @@ public function appendSafe(bool $appendSafe = true) : self
148149
return $this;
149150
}
150151

152+
/**
 * Registers an AutoCastTransformer in the pipeline.
 *
 * @return self this DataFrame, to allow fluent chaining
 */
public function autoCast() : self
{
    $transformer = new AutoCastTransformer();
    $this->pipeline->add($transformer);

    return $this;
}
158+
151159
/**
152160
* Merge/Split Rows yielded by Extractor into batches of given size.
153161
* For example, when Extractor is yielding one row at time, this method will merge them into batches of given size

src/core/etl/src/Flow/ETL/Row/Factory/NativeEntryFactory.php

Lines changed: 9 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -58,20 +58,18 @@ public function create(string $entryName, mixed $value, ?Schema $schema = null)
5858

5959
if ($valueType instanceof ScalarType) {
6060
if ($valueType->isString()) {
61-
$trimmedValue = \trim($value);
61+
$stringChecker = new StringTypeChecker($value);
6262

63-
if ('' !== $trimmedValue) {
64-
if ($this->isJson($trimmedValue)) {
65-
return json_entry($entryName, $value);
66-
}
63+
if ($stringChecker->isJson()) {
64+
return json_entry($entryName, $value);
65+
}
6766

68-
if ($this->isUuid($trimmedValue)) {
69-
return uuid_entry($entryName, Entry\Type\Uuid::fromString($value));
70-
}
67+
if ($stringChecker->isUuid()) {
68+
return uuid_entry($entryName, Entry\Type\Uuid::fromString($value));
69+
}
7170

72-
if ($this->isXML($trimmedValue)) {
73-
return xml_entry($entryName, $value);
74-
}
71+
if ($stringChecker->isXML()) {
72+
return xml_entry($entryName, $value);
7573
}
7674

7775
return str_entry($entryName, $value);
@@ -246,61 +244,4 @@ private function fromDefinition(Schema\Definition $definition, mixed $value) : E
246244

247245
throw new InvalidArgumentException("Can't convert value into entry \"{$definition->entry()}\"");
248246
}
249-
250-
private function isJson(string $string) : bool
251-
{
252-
if ('{' !== $string[0] && '[' !== $string[0]) {
253-
return false;
254-
}
255-
256-
if (
257-
(!\str_starts_with($string, '{') || !\str_ends_with($string, '}'))
258-
&& (!\str_starts_with($string, '[') || !\str_ends_with($string, ']'))
259-
) {
260-
return false;
261-
}
262-
263-
try {
264-
return \is_array(\json_decode($string, true, flags: \JSON_THROW_ON_ERROR));
265-
} catch (\Exception) {
266-
return false;
267-
}
268-
}
269-
270-
private function isUuid(string $string) : bool
271-
{
272-
if (\strlen($string) !== 36) {
273-
return false;
274-
}
275-
276-
return 0 !== \preg_match(Entry\Type\Uuid::UUID_REGEXP, $string);
277-
}
278-
279-
private function isXML(string $string) : bool
280-
{
281-
if ('<' !== $string[0]) {
282-
return false;
283-
}
284-
285-
if (\preg_match('/<(.+?)>(.+?)<\/(.+?)>/', $string) === 1) {
286-
try {
287-
\libxml_use_internal_errors(true);
288-
289-
$doc = new \DOMDocument();
290-
$result = $doc->loadXML($string);
291-
\libxml_clear_errors(); // Clear any errors if needed
292-
\libxml_use_internal_errors(false); // Restore standard error handling
293-
294-
/** @psalm-suppress RedundantCastGivenDocblockType */
295-
return (bool) $result;
296-
} catch (\Exception) {
297-
\libxml_clear_errors(); // Clear any errors if needed
298-
\libxml_use_internal_errors(false); // Restore standard error handling
299-
300-
return false;
301-
}
302-
}
303-
304-
return false;
305-
}
306247
}
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Row\Factory;
6+
7+
use Flow\ETL\Row\Entry\Type\Uuid;
8+
9+
final class StringTypeChecker
10+
{
11+
private readonly string $string;
12+
13+
public function __construct(string $string)
14+
{
15+
$this->string = \trim($string);
16+
}
17+
18+
public function isBoolean() : bool
19+
{
20+
if ($this->string === '') {
21+
return false;
22+
}
23+
24+
return \in_array(\strtolower($this->string), ['true', 'false'], true);
25+
}
26+
27+
public function isDateTime() : bool
28+
{
29+
if ($this->string === '') {
30+
return false;
31+
}
32+
33+
$dateParts = \date_parse($this->string);
34+
35+
if ($dateParts['error_count'] > 0) {
36+
return false;
37+
}
38+
39+
if ($dateParts['year'] === false) {
40+
return false;
41+
}
42+
43+
if ($dateParts['month'] === false) {
44+
return false;
45+
}
46+
47+
if ($dateParts['day'] === false) {
48+
return false;
49+
}
50+
51+
return true;
52+
}
53+
54+
public function isFloat() : bool
55+
{
56+
if ($this->string === '') {
57+
return false;
58+
}
59+
60+
return \is_numeric($this->string) && \str_contains($this->string, '.');
61+
}
62+
63+
public function isInteger() : bool
64+
{
65+
if ($this->string === '') {
66+
return false;
67+
}
68+
69+
if (\is_numeric($this->string)) {
70+
return (string) ((int) $this->string) === $this->string;
71+
}
72+
73+
return false;
74+
}
75+
76+
public function isJson() : bool
77+
{
78+
if ($this->string === '') {
79+
return false;
80+
}
81+
82+
if ('{' !== $this->string[0] && '[' !== $this->string[0]) {
83+
return false;
84+
}
85+
86+
if (\function_exists('json_validate')) {
87+
return \json_validate($this->string);
88+
}
89+
90+
if (
91+
(!\str_starts_with($this->string, '{') || !\str_ends_with($this->string, '}'))
92+
&& (!\str_starts_with($this->string, '[') || !\str_ends_with($this->string, ']'))
93+
) {
94+
return false;
95+
}
96+
97+
try {
98+
return \is_array(\json_decode($this->string, true, flags: \JSON_THROW_ON_ERROR));
99+
} catch (\Exception) {
100+
return false;
101+
}
102+
}
103+
104+
public function isNull() : bool
105+
{
106+
return \in_array(\mb_strtolower($this->string), ['null', 'nil'], true);
107+
}
108+
109+
public function isUuid() : bool
110+
{
111+
if ($this->string === '') {
112+
return false;
113+
}
114+
115+
if (\strlen($this->string) !== 36) {
116+
return false;
117+
}
118+
119+
return 0 !== \preg_match(Uuid::UUID_REGEXP, $this->string);
120+
}
121+
122+
public function isXML() : bool
123+
{
124+
if ($this->string === '') {
125+
return false;
126+
}
127+
128+
if ('<' !== $this->string[0]) {
129+
return false;
130+
}
131+
132+
if (\preg_match('/<(.+?)>(.+?)<\/(.+?)>/', $this->string) === 1) {
133+
try {
134+
\libxml_use_internal_errors(true);
135+
136+
$doc = new \DOMDocument();
137+
$result = $doc->loadXML($this->string);
138+
\libxml_clear_errors(); // Clear any errors if needed
139+
\libxml_use_internal_errors(false); // Restore standard error handling
140+
141+
/** @psalm-suppress RedundantCastGivenDocblockType */
142+
return (bool) $result;
143+
} catch (\Exception) {
144+
\libxml_clear_errors(); // Clear any errors if needed
145+
\libxml_use_internal_errors(false); // Restore standard error handling
146+
147+
return false;
148+
}
149+
}
150+
151+
return false;
152+
}
153+
154+
public function value() : string
155+
{
156+
return $this->string;
157+
}
158+
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Flow\ETL\Transformer;
6+
7+
use function Flow\ETL\DSL\bool_entry;
8+
use function Flow\ETL\DSL\datetime_entry;
9+
use function Flow\ETL\DSL\float_entry;
10+
use function Flow\ETL\DSL\int_entry;
11+
use function Flow\ETL\DSL\json_entry;
12+
use function Flow\ETL\DSL\null_entry;
13+
use function Flow\ETL\DSL\uuid_entry;
14+
use Flow\ETL\FlowContext;
15+
use Flow\ETL\Row;
16+
use Flow\ETL\Row\Entry;
17+
use Flow\ETL\Row\Entry\StringEntry;
18+
use Flow\ETL\Rows;
19+
use Flow\ETL\Transformer;
20+
21+
/**
 * Replaces string entries with more specific entry types whenever their content allows it.
 */
final class AutoCastTransformer implements Transformer
{
    /**
     * Tries to turn a string entry into a null, integer, float, boolean, JSON, UUID or
     * datetime entry, checked in that order. The first matching type wins.
     *
     * @param Entry $entry entry to inspect; anything other than a StringEntry is returned untouched
     *
     * @return Entry the cast entry, or the original entry when no specific type matches
     */
    public function autoCast(Entry $entry) : Entry
    {
        if (!$entry instanceof StringEntry) {
            return $entry;
        }

        $typeChecker = new Row\Factory\StringTypeChecker($entry->value());
        $name = $entry->name();
        // The checker validates the trimmed text, so the very same text is what gets cast below.
        $value = $typeChecker->value();

        if ($typeChecker->isNull()) {
            return null_entry($name);
        }

        if ($typeChecker->isInteger()) {
            return int_entry($name, (int) $value);
        }

        if ($typeChecker->isFloat()) {
            return float_entry($name, (float) $value);
        }

        if ($typeChecker->isBoolean()) {
            // isBoolean() accepts only "true"/"false" in any letter case. A plain (bool) cast
            // would turn the non-empty string "false" into true, so compare the text instead.
            return bool_entry($name, 'true' === \strtolower($value));
        }

        if ($typeChecker->isJson()) {
            return json_entry($name, $value);
        }

        if ($typeChecker->isUuid()) {
            return uuid_entry($name, $value);
        }

        if ($typeChecker->isDateTime()) {
            return datetime_entry($name, $value);
        }

        return $entry;
    }

    /**
     * Applies autoCast() to every entry of every row.
     */
    public function transform(Rows $rows, FlowContext $context) : Rows
    {
        return $rows->map(
            fn (Row $row) => $row->map(
                fn (Entry $entry) => $this->autoCast($entry)
            )
        );
    }
}

0 commit comments

Comments
 (0)