Skip to content

Commit e5fa814

Browse files
authored
feat: implement type generation (#23)
* implemented type writer * added docs for types generation * updated readme usage * added test for types invoke * removed pretty option * added test for console
1 parent 7a7aefe commit e5fa814

File tree

6 files changed

+357
-5
lines changed

6 files changed

+357
-5
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,4 @@ samples/**/xidmap/*
158158

159159
settings.json
160160
schema.txt
161+
types.txt

README.md

+75-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ A Library (with accompanying cli tool) to transform [Pandas](https://pandas.pyda
1717
- [Edges](#edges)
1818
- [Configuration](#configuration)
1919
- [Additional Configuration](#additional-configuration)
20-
- [Schema](#schema)
20+
- [Schema and Types](#schema-and-types)
21+
- [Generating a Schema](#generating-a-schema)
22+
- [Generating Types](#generating-types)
2123
- [Samples](#samples)
2224
- [Working with Larger Files](#working-with-larger-files)
2325
- [Command Line](#command-line-1)
@@ -34,8 +36,9 @@ python -m pip install dgraphpandas
3436

3537
```sh
3638
❯ dgraphpandas --help
37-
usage: dgraphpandas [-h] -f FILE -c CONFIG -ck CONFIG_FILE_KEY [-o OUTPUT_DIR]
38-
[--console] [--export_csv] [--encoding ENCODING]
39+
usage: dgraphpandas [-h] [-x {upserts,schema,types}] [-f FILE] -c CONFIG
40+
[-ck CONFIG_FILE_KEY] [-o OUTPUT_DIR] [--console]
41+
[--export_csv] [--encoding ENCODING]
3942
[--chunk_size CHUNK_SIZE]
4043
[--gz_compression_level GZ_COMPRESSION_LEVEL]
4144
[--key_separator KEY_SEPARATOR]
@@ -387,7 +390,9 @@ These options can be placed on the root of the config or passed as `kwargs` dire
387390
- Schema option to define an edge as a list. This will ensure the type is `[uid]` rather than just `uid`
388391

389392

390-
## Schema
393+
## Schema and Types
394+
395+
### Generating a Schema
391396

392397
DGraph allows you to define a [schema](https://dgraph.io/docs/query-language/schema/#sidebar). This can be generated using the same configuration used above but there are also additional options you can add such as `options` and `list_edges` which are exclusively used for schema generation.
393398

@@ -440,6 +445,72 @@ class: uid @reverse .
440445
dgraph live -s schema.txt
441446
```
442447

448+
### Generating Types
449+
450+
DGraph also allows you to define [types](https://dgraph.io/docs/query-language/type-system/#sidebar) that can be used to categorize nodes. These can also be generated from the same configuration used for data loading.
451+
452+
453+
```sh
454+
# Model the data, define types, edges and any options
455+
echo '
456+
{
457+
"transform": "horizontal",
458+
"files": {
459+
"animal": {
460+
"subject_fields": ["species_id"],
461+
"type_overrides": {
462+
"name": "string",
463+
"legs": "int",
464+
"weight": "float",
465+
"height": "float",
466+
"discovered": "datetime64",
467+
"aquatic": "bool"
468+
},
469+
"edge_fields": ["class_id", "found_in"],
470+
"class": ["@reverse"],
471+
"found_in": ["@reverse", "@count"],
472+
"list_edges": ["found_in"]
473+
},
474+
"habitat": {
475+
"subject_fields": ["id"],
476+
"type_overrides": {
477+
"name": "string"
478+
}
479+
}
480+
}
481+
}' > dgraphpandas.json
482+
483+
# Apply the config to the types generation logic
484+
❯ dgraphpandas -c dgraphpandas.json -x types -v DEBUG
485+
486+
# Inspect Types
487+
❯ cat types.txt
488+
type animal {
489+
found_in
490+
aquatic
491+
discovered
492+
height
493+
weight
494+
legs
495+
species
496+
name
497+
class
498+
}
499+
500+
type habitat {
501+
id
502+
name
503+
}
504+
505+
506+
# Apply to DGraph
507+
# NOTE: you should always apply the schema
508+
# before applying types else dgraph
509+
# won't know what the predicates are
510+
dgraph live -s types.txt
511+
512+
```
513+
443514
## Samples
444515

445516
Samples can be found [here](https://github.com/kiran94/dgraphpandas/tree/main/samples). They follow a convention where the download script can be found within the `input` directory and the config, generate_upsert, and publish scripts can be found at the root of each respective sample.

dgraphpandas/__main__.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88
from dgraphpandas import __version__, __description__, to_rdf
99
from dgraphpandas.strategies.schema import create_schema
1010
from dgraphpandas.writers.schema import generate_schema
11+
from dgraphpandas.writers.types import generate_types
1112

1213
pd.set_option('mode.chained_assignment', None)
1314

1415

1516
def main():
1617
parser = argparse.ArgumentParser(description=__description__)
17-
parser.add_argument('-x', '--method', choices=['upserts', 'schema'], default='upserts')
18+
parser.add_argument('-x', '--method', choices=['upserts', 'schema', 'types'], default='upserts')
1819
parser.add_argument('-f', '--file', required=False, help='The Data File (CSV) to convert into RDF.')
1920
parser.add_argument('-c', '--config', required=True, help='The DgraphPandas Configuration. See Documentation for options/examples.')
2021
parser.add_argument('-ck', '--config_file_key', required=False, help='The Entry in the Configuration to use for this passed file.')
@@ -72,6 +73,10 @@ def main():
7273
schema_frame = create_schema(args.config, ensure_xid_predicate=True, **(options))
7374
generate_schema(schema_frame, export_schema=True, **(options))
7475

76+
elif args.method == 'types':
77+
schema_frame = create_schema(args.config, ensure_xid_predicate=True, **(options))
78+
generate_types(schema_frame, export_schema=True, **(options))
79+
7580

7681
if __name__ == '__main__':
7782
main() # pragma: no cover

dgraphpandas/writers/types.py

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import os
2+
import logging
3+
from typing import List
4+
5+
import pandas as pd
6+
7+
logger = logging.getLogger(__name__)


def generate_types(frame: pd.DataFrame, **kwargs) -> List[str]:
    '''
    Given the pre-processed DataFrame from the schema
    strategy, generate types.

    One type definition is built per distinct value of the `table`
    column, listing the unique values of `column` for that table.
    Columns whose `options` contain `@reverse` are written as `<~column>`.

    Parameters (all optional, passed via kwargs):
        output_dir: directory the export file is written to (default '.').
        export_schema: when truthy, write the types to disk (default False).
        export_file: name of the exported file (default 'types.txt').
        console: when truthy, print each type as it is built (default False).
        encoding: encoding used for the export file (default 'utf-8').
        line_delimeter: separator placed between the lines of
            a type definition (default newline).

    Returns the type definitions as strings. Types without reverse edges
    come first, followed by the types that contain at least one.

    Raises ValueError when frame is None or is missing any of the
    `column`, `type`, `table` or `options` columns.
    '''
    if frame is None:
        raise ValueError('frame')
    for required in ('column', 'type', 'table', 'options'):
        if required not in frame:
            raise ValueError(required)

    output_dir = kwargs.get('output_dir', '.')
    export_schema = kwargs.get('export_schema', False)
    export_file = kwargs.get('export_file', 'types.txt')
    console = kwargs.get('console', False)
    encoding = kwargs.get('encoding', 'utf-8')
    # The key used to be looked up with a trailing space, which meant a
    # caller supplied `line_delimeter` was never honoured. The old key is
    # still accepted as a fallback.
    line_delimeter = kwargs.get('line_delimeter', kwargs.get('line_delimeter ', '\n'))

    all_types: List[str] = []
    all_types_reverse: List[str] = []

    # Group by the column label rather than a one element list so the
    # group key is the plain table name and not a 1-tuple.
    for name, current_frame in frame.groupby(by='table'):
        logger.debug(f'Creating types for {name}')

        # na=False treats missing options as "no reverse edge"; the object
        # cast is intended to keep the .str accessor usable when the
        # column only holds missing values.
        options = current_frame['options'].astype(object)
        reverse_edge_mask = options.str.contains('@reverse', na=False).astype(bool)

        # Build the renamed predicates on a new Series instead of
        # assigning into the group, so the grouped data is never mutated.
        columns = current_frame['column'].where(
            ~reverse_edge_mask,
            '<~' + current_frame['column'] + '>')

        body = line_delimeter.join(columns.unique().tolist())
        type_builder = (
            f'type {name} {{ ' + line_delimeter
            + body + line_delimeter
            + ' }' + line_delimeter
        )

        # Split up types with reverse edges so we can guarantee they are applied after other types
        # This is required because if dgraph live encounters a reverse edge for a type defined later in the file
        # then dgraph live will fail.
        # NOTE: There might be a better solution here
        # and we could build a dependency tree based on the references
        # topological sort?
        # also this won't detect circular dependencies
        if reverse_edge_mask.any():
            all_types_reverse.append(type_builder)
        else:
            all_types.append(type_builder)

        if console:
            print(type_builder)
            print(line_delimeter)

    ordered_types = all_types + all_types_reverse

    if export_schema:
        export_path = os.path.join(output_dir, export_file)
        logger.debug(f'Writing to {export_path} ({encoding})')
        with open(export_path, 'w', encoding=encoding) as f:
            for current_type in ordered_types:
                f.write(current_type)
                f.write('\n')

    return ordered_types

tests/test_main.py

+54
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,57 @@ def test_version(
248248
main()
249249

250250
assert __version__ in capsys.readouterr().out
251+
252+
253+
@patch('dgraphpandas.__main__.logging')
@patch('dgraphpandas.__main__.generate_types')
@patch('dgraphpandas.__main__.create_schema')
@patch('dgraphpandas.__main__.sys')
def test_types(
        argv_mock: Mock,
        create_schema_mock: Mock,
        generate_types_mock: Mock,
        logger_mock: Mock,
        capsys):
    '''
    Invoking the entry point with `-x types` should build the schema
    frame from the config and hand it to the types writer.
    '''
    # argv_mock replaces the `sys` module seen by __main__, so this is
    # the command line main() will parse.
    argv_mock.argv = ['script', '-x', 'types', '-c', 'config.json']
    create_schema_mock.return_value = 'fake_schema'

    main()

    assert create_schema_mock.called
    assert generate_types_mock.called

    # Options expected to be forwarded to both collaborators.
    shared_options = {
        'add_dgraph_type_records': True,
        'drop_na_intrinsic_objects': True,
        'drop_na_edge_objects': True,
        'illegal_characters': ['%', '\\.', '\\s', '"', '\\n', '\\r\\n'],
        'illegal_characters_intrinsic_object': ['"', '\\n', '\\r\\n'],
        'chunk_size': 10000000,
    }

    schema_args, schema_kwargs = create_schema_mock.call_args_list[0]
    assert schema_args == ('config.json',)
    assert schema_kwargs == {**shared_options, 'ensure_xid_predicate': True}

    types_args, types_kwargs = generate_types_mock.call_args_list[0]
    assert types_args == ('fake_schema',)
    assert types_kwargs == {**shared_options, 'export_schema': True}

0 commit comments

Comments
 (0)