diff --git a/nuclear/nuclear.md b/nuclear/nuclear.md
index d377ff1..4347a22 100644
--- a/nuclear/nuclear.md
+++ b/nuclear/nuclear.md
@@ -381,7 +381,7 @@ def import_nuclear(client):
            'country' : { '@ref' : f"Country/{country_long}" },
            'location' : { '@type' : "Point",
                           'type' : "Point",
-                          'coordinates': [latitude, longitude] },
+                          'coordinates': [longitude, latitude] },
            'capacity' : { '@type' : 'Quantity',
                           'unit' : 'Unit/MWe',
                           'quantity' : capacity_mw },
diff --git a/supply-chain/README.md b/supply-chain/README.md
new file mode 100644
index 0000000..8e6ecb4
--- /dev/null
+++ b/supply-chain/README.md
@@ -0,0 +1,225 @@
+# Supply Chain Data Ingestion
+
+The following example shows how you can perform an ingestion into
+TerminusDB from an Excel spreadsheet using Python.
+
+## Preparation
+
+First, create a virtual environment:
+
+```shell
+$ python3 -m venv ~/.virtualenvs/terminusdb
+$ source ~/.virtualenvs/terminusdb/bin/activate
+```
+
+Now we can install the TerminusDB Python client:
+
+```shell
+$ python3 -m pip install terminusdb-client
+```
+
+Finally, we need to install the supporting library for reading Excel
+files:
+
+```shell
+$ python3 -m pip install openpyxl
+```
+
+## Schema Inference
+
+The first step is to build a schema. However, since we already have a
+well-structured Excel file which contains data type information and
+column names, it makes sense to try to produce one automatically.
+
+We do this by invoking the file `infer_schema.py`:
+
+```shell
+$ python infer_schema.py
+```
+
+This program opens the workbook and iterates over each sheet, taking
+the column names and inferring a data type for each column. The heart
+of this inference is the function `infer_schema`:
+
+```python
+def infer_schema(wb_obj):
+    schema = []
+    for sheet_name in wb_obj.sheetnames:
+        schema_obj = { '@type' : 'Class', '@id' : sheet_name }
+        sheet_obj = wb_obj[sheet_name]
+        # Collect the header, stopping at the first empty column
+        for i in itertools.count(start=1):
+            cell = sheet_obj.cell(row = 1, column = i)
+            property_name = cell.value
+            if property_name is None:
+                break
+            else:
+                schema_obj[property_name] = infer_type(sheet_obj, i)
+
+        schema.append(schema_obj)
+    return schema
+```
+
+The type inference takes the sheet and the current column number and
+makes a best guess at the correct type:
+
+```python
+def infer_type(sheet_obj, j):
+    ty = None
+    for i in range(2, 2+MAX_ROW_SEARCH):
+        val = sheet_obj.cell(row = i, column = j).value
+        if val is None:
+            break
+        else:
+            ty = type_map(sheet_obj.cell(row = i, column = j))
+    return ty
+```
+
+In the actual implementation, `MAX_ROW_SEARCH` is set to 1, but in
+general it might be useful to look at more cells to see whether there
+are empty cells (suggesting an optional field) or only a small,
+limited number of distinct values (a potential enum).
+
+The type map translates Excel data types into XSD data types using the
+following function:
+
+```python
+def type_map(cell):
+    value = cell.value
+    ty = cell.data_type
+    if ty == 'n':
+        return "xsd:decimal"
+    elif ty == 's':
+        matches = re.match(OID_REGEXP, value)
+        if matches:
+            matchdict = matches.groupdict()
+            # The matching group name is the class of the identifier
+            for key in matchdict:
+                if matchdict[key]:
+                    return key
+            return None
+        else:
+            return "xsd:string"
+    elif ty == 'd':
+        return "xsd:dateTime"
+```
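+
+To see the inference end to end, here is a minimal sketch that builds
+a tiny workbook in memory and runs `infer_schema` over it. The sheet
+name and cell values are made up for illustration, and we assume the
+functions above are importable from `infer_schema.py` (the `Port`
+result comes from the identifier regexp described below):
+
+```python
+import openpyxl
+from infer_schema import infer_schema
+
+wb = openpyxl.Workbook()
+ws = wb.active
+ws.title = "Orders"                      # hypothetical sheet name
+ws.append(["Order ID", "Origin Port"])   # header row
+ws.append([1447296447, "PORT09"])        # one data row suffices (MAX_ROW_SEARCH = 1)
+
+print(infer_schema(wb))
+# [{'@type': 'Class', '@id': 'Orders',
+#   'Order ID': 'xsd:decimal', 'Origin Port': 'Port'}]
+```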
+
+There is one additional wrinkle here. The objects in the tables are
+represented using identifiers prefixed with a code. We use a regexp to
+match these as IDs, with each named group corresponding to the class
+we will use to hold the identifiers:
+
+```regexp
+^(?P<Port>PORT\d*)|(?P<Plant>(PLANT|CND)\d*)|(?P<Customer>V(\d|_)*)$
+```
+
+And that's basically it! Once you've run this program, you'll get a
+JSON rendering of the schema in `supply_chain.json`.
+
+## Ingesting the data
+
+Once you've done the schema inference, you can run the ingest as
+follows:
+
+```shell
+$ python ingest.py
+```
+
+If you are using TerminusCMS, you should replace the client connection
+information with the snippet obtained from your profile.
+
+Here again, the heavy lifting of the ingest is done by a single
+function, which iterates over the workbook and inserts one object type
+per table:
+
+```python
+def import_data(client, schema):
+    objects = []
+    stubs = {}
+    wb_obj = openpyxl.load_workbook(path)
+    for sheet_name in wb_obj.sheetnames:
+        print(f"Processing objects {sheet_name}")
+        object_type = sheet_name
+        sheet_obj = wb_obj[sheet_name]
+        cls = schema[object_type]
+        property_types = []
+        # Collect the header, stopping at the first empty column
+        for i in itertools.count(start=1):
+            property_type = sheet_obj.cell(row = 1, column = i).value
+            if property_type is None:
+                break
+            else:
+                property_types.append(property_type)
+        # Walk the data rows until we hit an empty cell
+        for j in itertools.count(start=2):
+            obj = { '@type' : object_type }
+            value = None
+            for i, property_type in enumerate(property_types, 1):
+                value_cell = sheet_obj.cell(row = j, column = i)
+                value = value_cell.value
+                if value is None:
+                    break
+                rng = cls[property_type]
+                if base_type(rng):
+                    if 'xsd:dateTime' == rng:
+                        # TerminusDB expects ISO 8601 date-times
+                        obj[property_type] = value.isoformat()
+                    else:
+                        obj[property_type] = value
+                else:
+                    # The value is an object identifier: record a stub
+                    # for it and point at it with '@ref'
+                    matches = re.match(infer_schema.OID_REGEXP, value)
+                    if matches:
+                        matchdict = matches.groupdict()
+                        for key in matchdict:
+                            if matchdict[key]:
+                                stubs[value] = key
+                    if value not in stubs:
+                        print(f"Unrecognised object identifier: {value}")
+                        exit(1)
+                    obj[property_type] = { '@ref' : value }
+            if value is None:
+                break
+            objects.append(obj)
+    # Emit a stub object for every identifier we saw, capturing the
+    # identifier so that the '@ref's above resolve
+    for oid in stubs:
+        object_type = stubs[oid]
+        objects.append({'@type' : object_type, '@capture' : oid, 'id' : oid })
+    with open('objects.json', 'w') as f:
+        f.write(json.dumps(objects))
+    client.insert_document(objects)
+```
+
+Most of this marshalling is straightforward; however, there are a few
+things of interest to note.
+
+Our date values need to be converted to ISO 8601 format in order to be
+understood by TerminusDB. This is a simple matter of calling the
+`isoformat` method.
+
+Those columns that point to object identifiers add a `@ref`, which
+allows the foreign object identifier to be used as if it were an
+internal TerminusDB ID reference. TerminusDB will assign its own ID,
+but we are allowed to use this one for the purposes of our
+transaction. This is extremely convenient when importing from foreign
+sources.
+
+Finally, at the end of the ingestion, we need to insert object stubs
+for each kind of object referred to in our tables. These stubs contain
+nothing but an object identifier, and the tables represent
+relationships between such objects, which is a fairly standard
+approach in relational database management.
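+
+To make the `@ref`/`@capture` mechanism concrete, here is a minimal
+sketch of the document shapes involved, using the connected client
+from `ingest.py`. The identifiers are illustrative:
+
+```python
+docs = [
+    # Stubs captured under their foreign identifiers
+    { '@type' : 'Plant', '@capture' : 'PLANT03', 'id' : 'PLANT03' },
+    { '@type' : 'Port',  '@capture' : 'PORT09',  'id' : 'PORT09' },
+    # A relationship document referring to the stubs by '@ref'
+    { '@type' : 'PlantPorts',
+      'Plant Code' : { '@ref' : 'PLANT03' },
+      'Port'       : { '@ref' : 'PORT09' } },
+]
+client.insert_document(docs)
+```
+
+TerminusDB resolves the captures within the single transaction, which
+is why `import_data` can safely append all the stubs after the objects
+that refer to them.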
+
+## Conclusion
+
+Schema inference is often less trouble than writing a schema from
+scratch, and it can provide a baseline model to be altered later as
+more information becomes available from other sources. In this case
+the schema inference was only about 70 lines of code.
+
+Hopefully this tutorial gives some ideas about how to tackle ingestion
+problems from traditional database, CSV, and Excel sources.
diff --git a/supply-chain/data/LICENSE b/supply-chain/data/LICENSE
new file mode 100644
index 0000000..eb8061c
--- /dev/null
+++ b/supply-chain/data/LICENSE
@@ -0,0 +1,3 @@
+Kalganova, Tatiana; Dzalbs, Ivars (2019): Supply Chain Logistics Problem Dataset. Brunel University London. Dataset. https://doi.org/10.17633/rd.brunel.7558679.v2
+
+https://creativecommons.org/licenses/by/4.0/
\ No newline at end of file
diff --git a/supply-chain/data/SupplyChainLogisticsProblem.xlsx b/supply-chain/data/SupplyChainLogisticsProblem.xlsx
new file mode 100644
index 0000000..c06b733
Binary files /dev/null and b/supply-chain/data/SupplyChainLogisticsProblem.xlsx differ
diff --git a/supply-chain/infer_schema.py b/supply-chain/infer_schema.py
new file mode 100644
index 0000000..8acd7df
--- /dev/null
+++ b/supply-chain/infer_schema.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+import json
+import openpyxl
+import itertools
+import re
+
+# Location of the Excel file
+path = "data/SupplyChainLogisticsProblem.xlsx"
+# How many data rows to examine when inferring a column's type
+MAX_ROW_SEARCH = 1
+# Object identifiers: each named group is the class used to hold them
+OID_REGEXP = r"^(?P<Port>PORT\d*)|(?P<Plant>(PLANT|CND)\d*)|(?P<Customer>V(\d|_)*)$"
+
+def type_map(cell):
+    value = cell.value
+    ty = cell.data_type
+    if ty == 'n':
+        return "xsd:decimal"
+    elif ty == 's':
+        matches = re.match(OID_REGEXP, value)
+        if matches:
+            matchdict = matches.groupdict()
+            # The matching group name is the class of the identifier
+            for key in matchdict:
+                if matchdict[key]:
+                    return key
+            return None
+        else:
+            return "xsd:string"
+    elif ty == 'd':
+        return "xsd:dateTime"
+
+def infer_type(sheet_obj, j):
+    ty = None
+    for i in range(2, 2+MAX_ROW_SEARCH):
+        val = sheet_obj.cell(row = i, column = j).value
+        if val is None:
+            break
+        else:
+            ty = type_map(sheet_obj.cell(row = i, column = j))
+    return ty
+
+def infer_schema(wb_obj):
+    schema = []
+    for sheet_name in wb_obj.sheetnames:
+        schema_obj = { '@type' : 'Class', '@id' : sheet_name }
+        sheet_obj = wb_obj[sheet_name]
+        # Collect the header, stopping at the first empty column
+        for i in itertools.count(start=1):
+            cell = sheet_obj.cell(row = 1, column = i)
+            property_name = cell.value
+            if property_name is None:
+                break
+            else:
+                schema_obj[property_name] = infer_type(sheet_obj, i)
+
+        schema.append(schema_obj)
+    return schema
+
+if __name__ == "__main__":
+    # Open the workbook and infer one class per sheet
+    wb_obj = openpyxl.load_workbook(path)
+    prefixes = [{'@type' : '@context',
+                 '@base' : 'http://lib.terminusdb.com/SupplyChain/',
+                 '@schema' : 'http://lib.terminusdb.com/SupplyChain#'}]
+    schema = infer_schema(wb_obj)
+    # Stub classes for the object identifiers matched by OID_REGEXP
+    auxiliary = [ { "@type" : "Class", "@id" : "Port", "id" : "xsd:string" },
+                  { "@type" : "Class", "@id" : "Plant", "id" : "xsd:string" },
+                  { "@type" : "Class", "@id" : "Customer", "id" : "xsd:string" } ]
+    with open('supply_chain.json', 'w') as f:
+        f.write(json.dumps(prefixes + schema + auxiliary))
diff --git a/supply-chain/ingest.py b/supply-chain/ingest.py
new file mode 100644
index 0000000..bf53b0a
--- /dev/null
+++ b/supply-chain/ingest.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+from terminusdb_client import Client
+import re
+import json
+import urllib.parse
+import openpyxl
+import itertools
+import infer_schema
+
+team = "admin"  # os.environ['TERMINUSDB_TEAM']
+team_quoted = urllib.parse.quote(team)
+
+#client = Client(f"https://cloud.terminusdb.com/{team_quoted}/")
+client = Client("http://127.0.0.1:6363")
+# For TerminusCMS, make sure you have put the token in an environment
+# variable: https://terminusdb.com/docs/terminuscms/get-started
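+#
+# A TerminusCMS connection would then look something like the
+# following sketch (illustrative; check the client docs for the exact
+# token handling):
+#
+#   client = Client(f"https://cloud.terminusdb.com/{team_quoted}/")
+#   client.connect(team=team, use_token=True)
+#
+# with the token exported as TERMINUSDB_ACCESS_TOKEN.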
+client.connect(team=team)
+
+dbid = "supply_chain"
+label = "Supply Chain"
+description = "A knowledge graph of supply chain information."
+
+# Location of the Excel file
+path = "data/SupplyChainLogisticsProblem.xlsx"
+
+def base_type(ty):
+    # True for XSD base types such as 'xsd:decimal', false for class
+    # references such as 'Port'
+    return ty[0:3] == 'xsd'
+
+def import_schema(client):
+    with open('supply_chain.json', 'r') as f:
+        schema = json.load(f)
+    client.insert_document(schema, graph_type='schema', compress='never', full_replace=True)
+    return schema
+
+def generate_schema_dict(schema):
+    # Index the schema objects by class name for quick lookup
+    schema_dict = {}
+    for sobj in schema:
+        if sobj['@type'] == '@context':
+            key = '@context'
+        else:
+            key = sobj['@id']
+        schema_dict[key] = sobj
+
+    return schema_dict
+
+def import_data(client, schema):
+    objects = []
+    stubs = {}
+    wb_obj = openpyxl.load_workbook(path)
+    for sheet_name in wb_obj.sheetnames:
+        print(f"Processing objects {sheet_name}")
+        object_type = sheet_name
+        sheet_obj = wb_obj[sheet_name]
+        cls = schema[object_type]
+        property_types = []
+        # Collect the header, stopping at the first empty column
+        for i in itertools.count(start=1):
+            property_type = sheet_obj.cell(row = 1, column = i).value
+            if property_type is None:
+                break
+            else:
+                property_types.append(property_type)
+        # Walk the data rows until we hit an empty cell
+        for j in itertools.count(start=2):
+            obj = { '@type' : object_type }
+            value = None
+            for i, property_type in enumerate(property_types, 1):
+                value_cell = sheet_obj.cell(row = j, column = i)
+                value = value_cell.value
+                if value is None:
+                    break
+                rng = cls[property_type]
+                if base_type(rng):
+                    if 'xsd:dateTime' == rng:
+                        # TerminusDB expects ISO 8601 date-times
+                        obj[property_type] = value.isoformat()
+                    else:
+                        obj[property_type] = value
+                else:
+                    # The value is an object identifier: record a stub
+                    # for it and point at it with '@ref'
+                    matches = re.match(infer_schema.OID_REGEXP, value)
+                    if matches:
+                        matchdict = matches.groupdict()
+                        for key in matchdict:
+                            if matchdict[key]:
+                                stubs[value] = key
+                    if value not in stubs:
+                        print(f"Unrecognised object identifier: {value}")
+                        exit(1)
+                    obj[property_type] = { '@ref' : value }
+            if value is None:
+                break
+            objects.append(obj)
+    # Emit a stub object for every identifier we saw, capturing the
+    # identifier so that the '@ref's above resolve
+    for oid in stubs:
+        object_type = stubs[oid]
+        objects.append({'@type' : object_type, '@capture' : oid, 'id' : oid })
+    with open('objects.json', 'w') as f:
+        f.write(json.dumps(objects))
+    client.insert_document(objects)
+
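+# Note that each run is destructive: the database is deleted if it
+# already exists and recreated, the schema is inserted with
+# full_replace=True, and all documents are rebuilt from the workbook.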
"dest_port_cd": "Port", "minm_wgh_qty": "xsd:decimal", "max_wgh_qty": "xsd:decimal", "svc_cd": "xsd:string", "minimum cost": "xsd:decimal", "rate": "xsd:decimal", "mode_dsc": "xsd:string", "tpt_day_cnt": "xsd:decimal", "Carrier type": "Customer"}, {"@type": "Class", "@id": "WhCosts", "WH": "Plant", "Cost/unit": "xsd:decimal"}, {"@type": "Class", "@id": "WhCapacities", "Plant ID": "Plant", "Daily Capacity ": "xsd:decimal"}, {"@type": "Class", "@id": "ProductsPerPlant", "Plant Code": "Plant", "Product ID": "xsd:decimal"}, {"@type": "Class", "@id": "VmiCustomers", "Plant Code": "Plant", "Customers": "Customer"}, {"@type": "Class", "@id": "PlantPorts", "Plant Code": "Plant", "Port": "Port"}, {"@type": "Class", "@id": "Port", "id": "xsd:string"}, {"@type": "Class", "@id": "Plant", "id": "xsd:string"}, {"@type": "Class", "@id": "Customer", "id": "xsd:string"}] \ No newline at end of file