Skip to content

Commit 3d1c53e

Browse files
authored
SHACL Validation (#767)
* Adds SHACL validation to renku doctor * Adds renku log tests * Adds --strict option to renku log for shape validation
1 parent ab65fa6 commit 3d1c53e

24 files changed

+1430
-72
lines changed

MANIFEST.in

+2-1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ recursive-include renku *.html
5656
recursive-include renku *.sh
5757
recursive-include renku *.txt
5858
recursive-include renku *.yml
59+
recursive-include renku *.json
5960
recursive-include renku Dockerfile
60-
recursive-include tests *.py *.gz *.yml
61+
recursive-include tests *.py *.gz *.yml *.json
6162
prune .github

renku/cli/log.py

+26-2
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,15 @@
5252
5353
* `ascii`
5454
* `dot`
55+
* `dot-full`
56+
* `dot-landscape`
57+
* `dot-full-landscape`
58+
* `dot-debug`
59+
* `json-ld`
60+
* `json-ld-graph`
61+
* `Makefile`
62+
* `nt`
63+
* `rdf`
5564
5665
You can generate a PNG of the full history of all files in the repository
5766
using the :program:`dot` program.
@@ -62,6 +71,15 @@
6271
$ renku log --format dot $FILES | dot -Tpng > /tmp/graph.png
6372
$ open /tmp/graph.png
6473
74+
Output validation
75+
~~~~~~~~~~~~~~~~~
76+
77+
The ``--strict`` option forces the output to be validated against the Renku
78+
SHACL schema, causing the command to fail if the generated output is not
79+
valid, as well as printing detailed information on all the issues found.
80+
The ``--strict`` option is only supported for the ``json-ld``, ``rdf`` and
81+
``nt`` output formats.
82+
6583
"""
6684

6785
import click
@@ -86,9 +104,15 @@
86104
default=False,
87105
help='Display commands without output files.'
88106
)
107+
@click.option(
108+
'--strict',
109+
is_flag=True,
110+
default=False,
111+
help='Validate triples before output.'
112+
)
89113
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
90114
@pass_local_client
91-
def log(client, revision, format, no_output, paths):
115+
def log(client, revision, format, no_output, strict, paths):
92116
"""Show logs for a file."""
93117
graph = Graph(client)
94118
if not paths:
@@ -108,4 +132,4 @@ def log(client, revision, format, no_output, paths):
108132
# NOTE shall we warn when "not no_output and not paths"?
109133
graph.build(paths=paths, revision=revision, can_be_cwl=no_output)
110134

111-
FORMATS[format](graph)
135+
FORMATS[format](graph, strict=strict)

renku/core/commands/checks/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,14 @@
1919

2020
from .migration import check_dataset_metadata, check_missing_files
2121
from .references import check_missing_references
22+
from .validate_shacl import check_project_structure, check_datasets_structure
2223

2324
# Checks will be executed in the order as they are listed in __all__.
2425
# They are mostly used in ``doctor`` command to inspect broken things.
2526
__all__ = (
2627
'check_dataset_metadata',
2728
'check_missing_files',
2829
'check_missing_references',
30+
'check_project_structure',
31+
'check_datasets_structure',
2932
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright 2019 - Swiss Data Science Center (SDSC)
4+
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
5+
# Eidgenössische Technische Hochschule Zürich (ETHZ).
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
"""Check KG structure using SHACL."""
19+
import yaml
20+
from rdflib.namespace import Namespace
21+
from rdflib.term import BNode
22+
23+
from renku.core.commands.echo import WARNING
24+
from renku.core.compat import pyld
25+
from renku.core.models.jsonld import NoDatesSafeLoader
26+
from renku.core.utils.shacl import validate_graph
27+
28+
29+
def _shacl_graph_to_string(graph):
    """Render the results of a SHACL validation graph as readable text.

    Every object of a ``sh:result`` triple in ``graph`` is turned into one
    message. When the result carries a ``sh:resultMessage`` that message is
    used; otherwise the constraint component and the focus node are shown.

    :param graph: validation graph, queried through ``subject_objects`` and
        ``value``.
    :returns: all messages joined by a newline followed by a tab.
    """
    sh = Namespace('http://www.w3.org/ns/shacl#')

    def _describe(result):
        """Build the message for a single validation result node."""
        path = graph.value(result, sh.resultPath)
        text = graph.value(result, sh.resultMessage)

        if text:
            return '{0}: {1}'.format(path, text)

        kind = graph.value(result, sh.sourceConstraintComponent)
        focus_node = graph.value(result, sh.focusNode)

        # Blank nodes are shown with a fixed placeholder instead of their id.
        if isinstance(focus_node, BNode):
            focus_node = '<Anonymous>'

        return '{0}: Type: {1}, Node ID: {2}'.format(path, kind, focus_node)

    return '\n\t'.join(
        _describe(result) for _, result in graph.subject_objects(sh.result)
    )
55+
56+
57+
def check_project_structure(client):
58+
"""Validate project metadata against SHACL."""
59+
project_path = client.renku_metadata_path
60+
61+
conform, graph, t = check_shacl_structure(project_path)
62+
63+
if conform:
64+
return True, None
65+
66+
problems = '{0}Invalid structure of project metadata\n\t{1}'.format(
67+
WARNING, _shacl_graph_to_string(graph)
68+
)
69+
70+
return False, problems
71+
72+
73+
def check_datasets_structure(client):
74+
"""Validate dataset metadata against SHACL."""
75+
ok = True
76+
77+
problems = ['{0}Invalid structure of dataset metadata'.format(WARNING)]
78+
79+
for path in client.renku_datasets_path.rglob(client.METADATA):
80+
try:
81+
conform, graph, t = check_shacl_structure(path)
82+
except (Exception, BaseException) as e:
83+
problems.append('Couldn\'t validate {0}: {1}\n\n'.format(path, e))
84+
continue
85+
86+
if conform:
87+
continue
88+
89+
ok = False
90+
91+
problems.append(
92+
'{0}\n\t{1}\n'.format(path, _shacl_graph_to_string(graph))
93+
)
94+
95+
if ok:
96+
return True, None
97+
98+
return False, '\n'.join(problems)
99+
100+
101+
def check_shacl_structure(path):
    """Validate the metadata file at ``path`` against the SHACL schema.

    The file is loaded as YAML, converted from JSON-LD to N-Quads and handed
    to ``validate_graph``.

    :param path: object exposing ``open``, pointing at the metadata file.
    :returns: the result of ``validate_graph`` for the converted metadata.
    """
    with path.open(mode='r') as fp:
        source = yaml.load(fp, Loader=NoDatesSafeLoader)

    # A falsy load result is replaced by an empty document.
    if not source:
        source = {}

    to_rdf_options = {
        'format': 'application/n-quads',
        'produceGeneralizedRdf': True,
    }
    quads = pyld.jsonld.to_rdf(source, options=to_rdf_options)

    return validate_graph(quads)

renku/core/commands/dataset.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -567,7 +567,7 @@ def update_datasets(
567567

568568
file_.dataset = dataset
569569
possible_updates.append(file_)
570-
unique_remotes.add(file_.based_on['url'])
570+
unique_remotes.add(file_.based_on.url)
571571

572572
if ref and len(unique_remotes) > 1:
573573
raise ParameterError(

renku/core/commands/format/graph.py

+61-34
Original file line numberDiff line numberDiff line change
@@ -21,43 +21,58 @@
2121

2222
import click
2323

24+
from renku.core.errors import SHACLValidationError
25+
from renku.core.utils.shacl import validate_graph
2426

25-
def ascii(graph):
27+
28+
def ascii(graph, strict=False):
    """Format graph as an ASCII art.

    :param graph: graph to render; it is wrapped in ``DAG`` and sent to the
        pager.
    :param strict: SHACL validation is not available for this format, so any
        truthy value is rejected.
    :raises SHACLValidationError: if ``strict`` is set.
    """
    from ..ascii import DAG
    from ..echo import echo_via_pager

    if strict:
        # Name the format actually requested so the error is not misleading.
        raise SHACLValidationError('--strict not supported for ascii')

    echo_via_pager(str(DAG(graph)))
3137

3238

3339
def _jsonld(graph, format, *args, **kwargs):
    """Serialize the activities of ``graph`` with a JSON-LD processor call.

    :param graph: object whose ``activities`` mapping holds the activities
        to serialize.
    :param format: name of the ``pyld.jsonld`` function to apply to the
        activities.
    :returns: the processed document dumped as JSON with an indent of 2.

    Extra positional and keyword arguments are accepted but not used.
    """
    import json

    from renku.core.compat import pyld
    from renku.core.models.jsonld import asjsonld

    processor = getattr(pyld.jsonld, format)
    documents = [asjsonld(activity) for activity in graph.activities.values()]

    return json.dumps(processor(documents), indent=2)
4450

4551

46-
def dot(graph, simple=True, debug=False, landscape=False):
47-
"""Format graph as a dot file."""
48-
import sys
49-
52+
def _conjunctive_graph(graph):
    """Build an rdflib ``ConjunctiveGraph`` from a renku ``Graph``.

    :param graph: renku graph; it is first serialized as expanded JSON-LD.
    :returns: the result of parsing that JSON-LD with a new
        ``ConjunctiveGraph``.
    """
    from rdflib import ConjunctiveGraph
    from rdflib.plugin import register, Parser

    # Register the JSON-LD parser plugin before parsing with that format.
    register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser')

    expanded = _jsonld(graph, 'expand')
    return ConjunctiveGraph().parse(data=expanded, format='json-ld')
6063

64+
65+
def dot(graph, simple=True, debug=False, landscape=False, strict=False):
66+
"""Format graph as a dot file."""
67+
import sys
68+
69+
from rdflib.tools.rdf2dot import rdf2dot
70+
71+
if strict:
72+
raise SHACLValidationError('--strict not supported for json-ld-graph')
73+
74+
g = _conjunctive_graph(graph)
75+
6176
g.bind('prov', 'http://www.w3.org/ns/prov#')
6277
g.bind('foaf', 'http://xmlns.com/foaf/0.1/')
6378
g.bind('wfdesc', 'http://purl.org/wf4ever/wfdesc#')
@@ -92,7 +107,7 @@ def _rdf2dot_simple(g, stream):
92107
import re
93108

94109
path_re = re.compile(
95-
r'file:///(?P<type>[a-zA-Z]+)/'
110+
r'(?P<prefix>file://|https://\w+/\w+/){0,1}(?P<type>[a-zA-Z]+)/'
96111
r'(?P<commit>\w+)'
97112
r'(?P<path>.+)?'
98113
)
@@ -293,10 +308,13 @@ def color(p):
293308
stream.write('}\n')
294309

295310

296-
def makefile(graph):
311+
def makefile(graph, strict=False):
297312
"""Format graph as Makefile."""
298313
from renku.core.models.provenance.activities import ProcessRun, WorkflowRun
299314

315+
if strict:
316+
raise SHACLValidationError('--strict not supported for json-ld-graph')
317+
300318
for activity in graph.activities.values():
301319
if not isinstance(activity, ProcessRun):
302320
continue
@@ -316,44 +334,53 @@ def makefile(graph):
316334
)
317335

318336

319-
def jsonld(graph):
337+
def jsonld(graph, strict=False):
    """Print the graph as expanded JSON-LD.

    :param graph: graph to serialize.
    :param strict: when set, the serialized output is passed to
        ``validate_graph`` before anything is printed.
    :raises SHACLValidationError: if ``strict`` is set and validation fails.
    """
    document = _jsonld(graph, 'expand')

    if strict:
        conforms, _, details = validate_graph(document, format='json-ld')

        if not conforms:
            message = (
                "{}\nCouldn't get log: Invalid Knowledge Graph data"
            ).format(details)
            raise SHACLValidationError(message)

    click.echo(document)
322349

323350

324-
def jsonld_graph(graph):
351+
def jsonld_graph(graph, strict=False):
    """Print the graph as flattened JSON-LD.

    :param graph: graph to serialize.
    :param strict: validation is not available for this format, so any
        truthy value is rejected.
    :raises SHACLValidationError: if ``strict`` is set.
    """
    if strict:
        raise SHACLValidationError('--strict not supported for json-ld-graph')

    flattened = _jsonld(graph, 'flatten')
    click.echo(flattened)
327356

328357

329-
def nt(graph):
358+
def nt(graph, strict=False):
    """Print the graph serialized as N-Triples.

    :param graph: graph to serialize.
    :param strict: when set, the serialized output is passed to
        ``validate_graph`` before anything is printed.
    :raises SHACLValidationError: if ``strict`` is set and validation fails.
    """
    triples = _conjunctive_graph(graph).serialize(format='nt')

    if strict:
        conforms, _, details = validate_graph(triples, format='nt')

        if not conforms:
            message = (
                "{}\nCouldn't get log: Invalid Knowledge Graph data"
            ).format(details)
            raise SHACLValidationError(message)

    click.echo(triples)
342370

343371

344-
def rdf(graph):
372+
def rdf(graph, strict=False):
    """Print the graph serialized as RDF/XML.

    :param graph: graph to serialize.
    :param strict: when set, the serialized output is passed to
        ``validate_graph`` before anything is printed.
    :raises SHACLValidationError: if ``strict`` is set and validation fails.
    """
    document = _conjunctive_graph(graph).serialize(
        format='application/rdf+xml'
    )

    if strict:
        conforms, _, details = validate_graph(document, format='xml')

        if not conforms:
            message = (
                "{}\nCouldn't get log: Invalid Knowledge Graph data"
            ).format(details)
            raise SHACLValidationError(message)

    click.echo(document)
357384

358385

359386
FORMATS = {

0 commit comments

Comments
 (0)