Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added schema inferring example #966

Merged
merged 1 commit into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/topics/phar/data_frame/code.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
use function Flow\ETL\DSL\to_output;

// flow.phar run examples/topics/phar/data_frame/code.php

// When executing a data processing pipeline through the phar, make sure not to call any trigger such as ->run();
// the phar handles this internally.
return df()
->read(from_rows(rows(
row(int_entry('id', 1), array_entry('array', ['a' => 1, 'b' => 2, 'c' => 3])),
Expand Down
31 changes: 31 additions & 0 deletions examples/topics/schema/inferring/code.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php

declare(strict_types=1);

use function Flow\ETL\Adapter\CSV\from_csv;
use function Flow\ETL\DSL\df;
use function Flow\ETL\DSL\schema_from_json;
use function Flow\ETL\DSL\schema_to_json;
use function Flow\ETL\DSL\to_output;
use Flow\ETL\Loader\StreamLoader\Output;

require __DIR__ . '/../../../autoload.php';

// Example: infer a schema from a CSV sample once, cache it as JSON, and reuse
// the cached schema on subsequent runs when reading the same dataset.
$datasetPath = __DIR__ . '/input/dataset.csv';
$schemaPath = __DIR__ . '/output/schema.json';

if (!\file_exists($schemaPath)) {
    // No cached schema yet: build one from the dataset itself.
    $schema = df()
        ->read(from_csv($datasetPath))
        ->limit(100) // Limiting the number of rows to read will speed up the process but might bring less accurate results
        ->autoCast()
        ->schema();

    // Caching is best-effort: if the write fails PHP emits a warning and the
    // freshly inferred schema is still used below.
    \file_put_contents($schemaPath, schema_to_json($schema));
} else {
    $schemaJson = \file_get_contents($schemaPath);

    // file_get_contents() returns false on failure; fail with a clear message
    // instead of handing a non-string to schema_from_json().
    if (false === $schemaJson) {
        throw new \RuntimeException('Unable to read cached schema from: ' . $schemaPath);
    }

    $schema = schema_from_json($schemaJson);
}

// Reading schemaless data formats with a predefined schema can significantly improve performance
df()
    ->read(from_csv($datasetPath, schema: $schema))
    ->collect()
    ->write(to_output(truncate: false, output: Output::rows_and_schema))
    ->run();
21 changes: 21 additions & 0 deletions examples/topics/schema/inferring/input/dataset.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
1,8cC6B5992C0309c,Acevedo LLC,https://www.donovan.com/,Holy See (Vatican City State),Multi-channeled bottom-line core,2019,Graphic Design / Web Design,7070
2,ec094061FeaF7Bc,Walls-Mcdonald,http://arias-willis.net/,Lithuania,Compatible encompassing groupware,2005,Utilities,8156
3,DAcC5dbc58946A7,Gregory PLC,http://www.lynch-hoover.net/,Tokelau,Multi-channeled intangible help-desk,2019,Leisure / Travel,6121
4,8Dd7beDa37FbeD0,"Byrd, Patterson and Knox",https://www.james-velez.net/,Netherlands,Pre-emptive national function,1982,Furniture,3494
5,a3b5c54AEC163e4,Mcdowell-Hopkins,http://fuentes.com/,Mayotte,Cloned bifurcated solution,2016,Online Publishing,36
6,fDfEBeFDaEb59Af,Hayden and Sons,https://www.shaw-mooney.info/,Belize,Persistent mobile task-force,1978,Insurance,7010
7,752ef90Eae1f7f5,Castro LLC,http://wilkinson.com/,Jamaica,Advanced value-added definition,2008,Outsourcing / Offshoring,2526
8,B1D4c5CA34f9992,"Barajas, Baird and Shaw",http://www.jordan-harvey.com/,United States of America,Stand-alone bandwidth-monitored algorithm,2000,Wholesale,4478
9,Cfa1a44106faD4B,"Lucas, Galloway and Benjamin",http://silva.info/,Western Sahara,Persevering leadingedge ability,1990,Retail Industry,8223
10,C08fcf292AB17DF,"Barker, Hubbard and Bennett",http://www.allen.biz/,Mauritania,Decentralized fault-tolerant functionalities,2014,Museums / Institutions,7716
11,94B9bEedc626820,Underwood-Mitchell,https://www.leonard.com/,Italy,Compatible dynamic support,1992,Fine Art,4564
12,FE42dEd40f5DfD8,"Lester, Ochoa and Franco",http://www.munoz.com/,Timor-Leste,Vision-oriented dynamic conglomeration,2014,Motion Pictures / Film,8075
13,1F861fAbeDdCFea,"Arias, Jackson and Hester",https://hardin-thompson.com/,Algeria,Switchable maximized synergy,1980,Utilities,1319
14,456de7dE1ab18ca,Riggs and Sons,http://klein-benton.info/,Czech Republic,Object-based discrete orchestration,2012,Law Enforcement,4946
15,457bcfFF18A7DD2,Stanley LLC,https://bowman.com/,Eritrea,Self-enabling 24/7 groupware,1984,Executive Office,4980
16,5B5ea5aea34dc5F,Page-Ware,http://lam-soto.com/,Togo,Realigned mobile groupware,1991,Entertainment / Movie Production,1307
17,A66F35C298Dfd82,"Garner, Melton and Burgess",https://mathews-knox.com/,Guinea-Bissau,Automated 5thgeneration complexity,2003,E - Learning,9038
18,EdAC2EF13734E0B,Andersen-Fuentes,http://www.mann.com/,Oman,Ameliorated coherent database,1991,Textiles,6436
19,dD1612190b24B12,Ford-Rice,https://peterson-irwin.com/,Turks and Caicos Islands,Sharable intangible leverage,1971,Computer / Network Security,3038
20,992CAdffccEebEa,Collins-Figueroa,http://www.holt-bartlett.info/,Mongolia,Realigned multi-state installation,1985,Aviation / Aerospace,9420
2 changes: 2 additions & 0 deletions examples/topics/schema/inferring/output/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
14 changes: 7 additions & 7 deletions examples/topics/schema/validate/code.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@

require __DIR__ . '/../../../autoload.php';

$schema = schema(
int_schema('id', $nullable = false),
str_schema('name', $nullable = true),
bool_schema('active', $nullable = false, Metadata::empty()->add('key', 'value')),
);

df()
->read(from_array([
['id' => 1, 'name' => 'Product 1', 'active' => true],
['id' => 2, 'name' => 'Product 2', 'active' => false],
['id' => 3, 'name' => 'Product 3', 'active' => true],
]))
->validate(
schema(
int_schema('id', $nullable = false),
str_schema('name', $nullable = true),
bool_schema('active', $nullable = false, Metadata::empty()->add('key', 'value')),
)
)
->validate($schema)
->write(to_output(false, Output::rows_and_schema))
->run();
9 changes: 2 additions & 7 deletions examples/topics/window_functions/dens_rank/code.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,5 @@
)
->withEntry('rank', dense_rank()->over(window()->partitionBy(ref('department'))->orderBy(ref('salary')->desc())))
->sortBy(ref('department'), ref('rank'))
->write(to_output(false));

if ($_ENV['FLOW_PHAR_APP'] ?? false) {
return $df;
}

$df->run();
->write(to_output(false))
->run();
Loading