Skip to content

Commit 164d143

Browse files
committed
Add SciMeta validation to GMN
1 parent a1c99b4 commit 164d143

File tree

18 files changed

+499
-72
lines changed

18 files changed

+499
-72
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ Download the source from GitHub:
198198

199199
Add the DataONE packages to the Python path, and install their dependencies:
200200

201-
$ sh $D1ROOT/dev_tools/develop.sh
201+
$ cd ~/d1_python
202+
$ sudo dev_tools/src/d1_dev/setup-all.py --root . develop
202203

203204
Run the following commands, except, change the "createuser" line to:
204205

@@ -208,7 +209,7 @@ Run the following commands, except, change the "createuser" line to:
208209

209210
Run the following commands (all sections), except, change the location for openssl.cnf, so the line that copies it becomes:
210211

211-
$ sudo cp /home/dahl/d1_python/d1_mn_generic/src/deployment/openssl.cnf .
212+
$ sudo cp <your_d1_python_path>/d1_mn_generic/src/deployment/openssl.cnf .
212213

213214
https://pythonhosted.org/dataone.generic_member_node/setup-local-authn-ca.html
214215

client_cli/src/d1_cli/tests/test_session.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
"""
2323
from __future__ import absolute_import
2424

25-
import os
2625
import StringIO
2726
import sys
2827
import uuid
@@ -134,17 +133,16 @@ def test_1100(self):
134133
assert len(out) > 100
135134
assert type(out) is str
136135

137-
def test_1110(self):
136+
def test_1110(self, tmpdir):
    """Session is successfully saved and then loaded (pickled and unpickled)"""
    # Write the pickle below the per-test temporary directory supplied through
    # the {tmpdir} argument, so that no file is left in the working directory.
    tmp_pickle_path = str(tmpdir.join('session.pickle'))
    s1 = session.Session(nodes, format_ids)
    # A fresh UUID guarantees that the value read back can only have come
    # from the file written by s1.
    u = str(uuid.uuid1())
    s1.set('rights-holder', u)
    s1.save(tmp_pickle_path)
    s2 = session.Session(nodes, format_ids)
    s2.load(tmp_pickle_path)
    assert s2.get('rights-holder') == u

conftest.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@
5151
import django.db
5252
import django.db.utils
5353

54-
DEFAULT_DEBUG_PYCHARM_BIN_PATH = os.path.expanduser('~/bin/JetBrains/pycharm')
54+
DEFAULT_DEBUG_PYCHARM_BIN_PATH = os.path.expanduser(
55+
'~/bin/JetBrains/pycharm.sh'
56+
)
5557
D1_SKIP_LIST = 'skip_passed/list'
5658
D1_SKIP_COUNT = 'skip_passed/count'
5759

@@ -240,6 +242,8 @@ def _open_error_in_pycharm(call):
240242
logging.debug('Unable to find location of error')
241243
return
242244
try:
245+
assert os.path.isfile(DEFAULT_DEBUG_PYCHARM_BIN_PATH), \
246+
'Path to PyCharm is incorrect'
243247
subprocess.call(
244248
[DEFAULT_DEBUG_PYCHARM_BIN_PATH, '--line', str(src_line), str(src_path)]
245249
)

dev_tools/src/d1_dev/setup-all.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
PKG_PATH_LIST = [
4242
'dev_tools',
4343
'lib_common',
44+
'lib_scimeta',
4445
'lib_client',
4546
'client_cli',
4647
'client_onedrive',

gmn/src/d1_gmn/app/scimeta.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# This work was created by participants in the DataONE project, and is
4+
# jointly copyrighted by participating institutions in DataONE. For
5+
# more information on DataONE, see our web site at http://dataone.org.
6+
#
7+
# Copyright 2009-2016 DataONE
8+
#
9+
# Licensed under the Apache License, Version 2.0 (the "License");
10+
# you may not use this file except in compliance with the License.
11+
# You may obtain a copy of the License at
12+
#
13+
# http://www.apache.org/licenses/LICENSE-2.0
14+
#
15+
# Unless required by applicable law or agreed to in writing, software
16+
# distributed under the License is distributed on an "AS IS" BASIS,
17+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18+
# See the License for the specific language governing permissions and
19+
# limitations under the License.
20+
"""Utilities for Science Metadata
21+
"""
22+
23+
from __future__ import absolute_import
24+
25+
import d1_scimeta.xml_schema
26+
27+
import d1_common.const
28+
import d1_common.date_time
29+
import d1_common.types
30+
import d1_common.types.dataoneTypes
31+
import d1_common.types.dataoneTypes_v2_0
32+
import d1_common.types.exceptions
33+
import d1_common.util
34+
import d1_common.wrap.access_policy
35+
import d1_common.xml
36+
37+
import django.conf
38+
39+
40+
def assert_valid(sysmeta_pyxb, sciobj_path):
    """Validate the Science Metadata file at {sciobj_path} against the schema
    selected by the formatId in {sysmeta_pyxb}.

    The function returns without validating when:

    - _is_validation_enabled() is false
    - or _is_installed_scimeta_format_id() is false for the SysMeta
    - or _is_above_size_limit() is true and _is_action_accept() is true

    Raises InvalidRequest when:

    - _is_above_size_limit() is true and _is_action_accept() is false
    - or the validator raises SciMetaValidationError for the file contents
    """
    if not _is_validation_enabled():
        return
    if not _is_installed_scimeta_format_id(sysmeta_pyxb):
        return

    if _is_above_size_limit(sysmeta_pyxb):
        if _is_action_accept():
            return
        raise d1_common.types.exceptions.InvalidRequest(
            0, 'Science Metadata file is above size limit for validation and this '
            'node has been configured to reject unvalidated Science Metadata '
            'files. For more information, see the SCIMETA_VALIDATE* settings. '
            'size={} size_limit={}'.format(
                sysmeta_pyxb.size, django.conf.settings.SCIMETA_VALIDATION_MAX_SIZE
            )
        )

    # The complete file is read into memory and handed to the validator.
    with open(sciobj_path, 'rb') as sciobj_file:
        try:
            d1_scimeta.xml_schema.validate(
                sysmeta_pyxb.formatId, sciobj_file.read()
            )
        except d1_scimeta.xml_schema.SciMetaValidationError as validation_error:
            # Report the validator's message to the client as a DataONE
            # InvalidRequest exception.
            raise d1_common.types.exceptions.InvalidRequest(
                0, str(validation_error)
            )
77+
78+
def _is_validation_enabled():
    """Return the value of the SCIMETA_VALIDATION_ENABLED Django setting."""
    is_enabled = django.conf.settings.SCIMETA_VALIDATION_ENABLED
    return is_enabled
80+
81+
82+
def _is_installed_scimeta_format_id(sysmeta_pyxb):
    """Return the result of d1_scimeta.xml_schema.is_installed_scimeta_format_id()
    for the formatId of {sysmeta_pyxb}.
    """
    format_id = sysmeta_pyxb.formatId
    return d1_scimeta.xml_schema.is_installed_scimeta_format_id(format_id)
86+
87+
88+
def _is_above_size_limit(sysmeta_pyxb):
89+
return (
90+
django.conf.settings.SCIMETA_VALIDATION_MAX_SIZE == -1 or
91+
sysmeta_pyxb.size > django.conf.settings.SCIMETA_VALIDATION_MAX_SIZE
92+
)
93+
94+
95+
def _is_action_accept():
    """Return True if the SCIMETA_VALIDATION_OVER_SIZE_ACTION Django setting is
    'accept'.
    """
    over_size_action = django.conf.settings.SCIMETA_VALIDATION_OVER_SIZE_ACTION
    return over_size_action == 'accept'

gmn/src/d1_gmn/app/settings_default.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@
112112

113113
RESOURCE_MAP_CREATE = 'block'
114114

115+
SCIMETA_VALIDATION_ENABLED = True
116+
SCIMETA_VALIDATION_MAX_SIZE = 100 * 1024**2
117+
SCIMETA_VALIDATION_OVER_SIZE_ACTION = 'reject'
118+
115119
PROXY_MODE_BASIC_AUTH_ENABLED = False
116120
PROXY_MODE_BASIC_AUTH_USERNAME = ''
117121
PROXY_MODE_BASIC_AUTH_PASSWORD = ''

gmn/src/d1_gmn/app/startup.py

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,61 @@ class GMNStartupChecks(django.apps.AppConfig):
4343
name = 'd1_gmn.app.startup'
4444

4545
def ready(self):
    """Check settings, then run the remaining startup checks.

    The certificate path settings and the SCIMETA_VALIDATION_* settings are
    checked first, followed by the production safety warnings, the Resource
    Map setting check and creation of the science object store root.
    """
    for cert_setting_name in ('CLIENT_CERT_PATH', 'CLIENT_CERT_PRIVATE_KEY_PATH'):
        self._assert_readable_file_if_set(cert_setting_name)

    self._assert_is_type('SCIMETA_VALIDATION_ENABLED', bool)
    self._assert_is_type('SCIMETA_VALIDATION_MAX_SIZE', int)
    valid_action_tup = ('reject', 'accept')
    self._assert_is_in('SCIMETA_VALIDATION_OVER_SIZE_ACTION', valid_action_tup)

    self._warn_unsafe_for_prod()
    self._check_resource_map_create()
    self._create_sciobj_store_root()
5158

59+
def _assert_is_type(self, setting_name, valid_type):
    """Raise a configuration error unless the setting is of the given type.

    Args:
      setting_name: Name of the Django setting to check. A missing setting is
        checked as None.
      valid_type: Type (or tuple of types) accepted by isinstance().

    Raises:
      Whatever raise_config_error() raises, if the value is not an instance of
      {valid_type}.
    """
    v = getattr(django.conf.settings, setting_name, None)
    if not isinstance(v, valid_type):
        # raise_config_error() takes the current value as its second argument.
        # Leaving it out causes a TypeError instead of the configuration error.
        self.raise_config_error(setting_name, v, valid_type)
63+
64+
def _assert_is_in(self, setting_name, valid_list):
    """Raise a configuration error unless the setting is one of the given values.

    Args:
      setting_name: Name of the Django setting to check. A missing setting is
        checked as None.
      valid_list: List or tuple of the accepted values.

    Raises:
      Whatever raise_config_error() raises, if the value is not in
      {valid_list}.
    """
    v = getattr(django.conf.settings, setting_name, None)
    if v not in valid_list:
        # raise_config_error() takes the current value as its second argument.
        # Leaving it out causes a TypeError instead of the configuration error.
        self.raise_config_error(setting_name, v, valid_list)
67+
68+
def _assert_readable_file_if_set(self, setting_name):
    """Check that the setting, if present, holds the path to a readable file.

    A setting that is missing or None is accepted without further checks.
    Otherwise, the value must be a str, must name an existing file, and it must
    be possible to open the file and read from it. Any failure is reported
    through raise_config_error().
    """
    file_path = getattr(django.conf.settings, setting_name, None)
    if file_path is None:
        return

    self._assert_is_type(setting_name, str)

    if not os.path.isfile(file_path):
        self.raise_config_error(
            setting_name, file_path, str, valid_str='a path to a readable file'
        )

    # Existence does not imply read access, so try an actual read.
    try:
        with open(file_path, 'r') as f:
            f.read(1)
    except EnvironmentError as read_error:
        error_valid_str = 'a path to a readable file. error="{}"'.format(
            str(read_error)
        )
        self.raise_config_error(
            setting_name, file_path, str, valid_str=error_valid_str
        )
85+
86+
def raise_config_error(self, setting_name, cur_val, exp_type, valid_str=None):
    """Log and raise a configuration error for an invalid setting.

    Args:
      setting_name: Name of the setting that failed the check.
      cur_val: Current value of the setting. Included in the message.
      exp_type: The expected type (e.g., int or bool), or a list or tuple of
        the accepted values. Used for describing what is expected when
        {valid_str} is not provided.
      valid_str: Optional description of the expected value. Overrides the
        description derived from {exp_type}.

    Raises:
      django.core.exceptions.ImproperlyConfigured: Always.
    """
    if not valid_str:
        # {exp_type} is a type object, not a value, so it is compared by
        # identity. isinstance(exp_type, int) is False for the type int itself,
        # which would leave the friendly descriptions unused.
        if exp_type is bool:
            valid_str = 'True or False'
        elif exp_type is int:
            valid_str = 'a whole number'
        elif exp_type is float:
            valid_str = 'a number'
        elif exp_type is str:
            valid_str = 'a string'
        elif isinstance(exp_type, (list, tuple)):
            # str() each member so that non-string choices can be joined.
            valid_str = 'one of {}'.format(', '.join(str(v) for v in exp_type))
        else:
            valid_str = 'of type {}'.format(
                getattr(exp_type, '__name__', repr(exp_type))
            )
    msg_str = u'Configuration error: Setting {} must be {}. current="{}"'.format(
        setting_name, valid_str, str(cur_val)
    )
    logging.error(msg_str)
    raise django.core.exceptions.ImproperlyConfigured(msg_str)
100+
52101
def _warn_unsafe_for_prod(self):
53102
"""Warn on settings that are not safe for production"""
54103
safe_settings_list = [
@@ -65,21 +114,6 @@ def _warn_unsafe_for_prod(self):
65114
'safe="{}"'.format(setting_str, setting_current, setting_safe)
66115
)
67116

68-
def _check_cert_file(self, cert_pem_setting):
69-
cert_pem_path = getattr(django.conf.settings, cert_pem_setting, None)
70-
if cert_pem_path is None:
71-
logging.warn(
72-
'Certificate path not set. setting="{}"'.format(cert_pem_setting)
73-
)
74-
return
75-
try:
76-
self._assert_readable_file(cert_pem_path)
77-
except ValueError as e:
78-
raise django.core.exceptions.ImproperlyConfigured(
79-
u'Configuration error: Invalid certificate path. '
80-
u'setting="{}". msg="{}"'.format(cert_pem_setting, str(e))
81-
)
82-
83117
def _check_resource_map_create(self):
84118
if (
85119
django.conf.settings.RESOURCE_MAP_CREATE not in RESOURCE_MAP_CREATE_MODE_LIST
@@ -144,15 +178,3 @@ def _create_sciobj_store_root(self):
144178
d1_gmn.app.sciobj_store.get_gmn_version(),
145179
)
146180
)
147-
148-
def _assert_readable_file(self, file_path):
149-
if not os.path.isfile(file_path):
150-
raise ValueError('Not a valid file path. path="{}"'.format(file_path))
151-
try:
152-
with open(file_path, 'r') as f:
153-
f.read(1)
154-
except EnvironmentError as e:
155-
raise ValueError(
156-
'Unable to read file. path="{}" error="{}"'.
157-
format(file_path, e.message)
158-
)

gmn/src/d1_gmn/app/views/create.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import d1_gmn.app.event_log
2727
import d1_gmn.app.resource_map
2828
import d1_gmn.app.revision
29+
import d1_gmn.app.scimeta
2930
import d1_gmn.app.sciobj_store
3031
import d1_gmn.app.sysmeta
3132
import d1_gmn.app.util
@@ -75,6 +76,7 @@ def create_sciobj(request, sysmeta_pyxb):
7576
_create_resource_map(pid, request, sciobj_path, sysmeta_pyxb, url)
7677
else:
7778
_save_sciobj_bytes_from_request(request, sciobj_path)
79+
d1_gmn.app.scimeta.assert_valid(sysmeta_pyxb, sciobj_path)
7880

7981
d1_gmn.app.sysmeta.create_or_update(sysmeta_pyxb, url)
8082

gmn/src/d1_gmn/settings_template.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,53 @@
424424
# object is created by the Resource Map subject.
425425
RESOURCE_MAP_CREATE = 'block'
426426

427+
# Validate Science Metadata objects against local XML Schema (XSD)
428+
# True (default):
429+
# - When a SciMeta format that is recognized and parsed by CNs is
430+
# received in MNStorage.create() or MNStorage.update(), GMN rejects the
431+
# operation and returns an InvalidRequest with details to the client if the
432+
# object is not well formed, valid, and matching the SysMeta formatId.
433+
# False:
434+
# - SciMeta objects are not validated. This is not recommended, as any invalid
435+
# objects will be rejected by the CN during synchronization.
436+
# Notes:
437+
# - Objects affected by this setting have formatType of METADATA in the CN's
438+
# objectFormatList.
439+
# - Actual validation is performed by the d1_scimeta package, which may not
440+
# support all SciMeta formats. Validation is silently skipped for any
441+
# unsupported formats.
442+
# - Objects that are stored remotely (using GMN's proxy support), are not
443+
# validated.
444+
SCIMETA_VALIDATION_ENABLED = True
445+
446+
# The maximum size in bytes of SciMeta objects received in MNStorage.create()
447+
# and MNStorage.update() that will be validated. SciMeta objects larger than
448+
# this size are not validated and are handled according to the
449+
# SCIMETA_VALIDATION_OVER_SIZE_ACTION setting.
450+
#
451+
# This setting applies only when SCIMETA_VALIDATION_ENABLED is set to True.
452+
#
453+
# As SciMeta documents are read into memory for validation, limiting the maximum
454+
# size of objects that will be validated helps reduce the chance of the server
455+
# running out of memory.
456+
#
457+
# E.g.: 100 MiB = 100 * 1024**2 (default)
458+
# To validate SciMeta of any size, set to -1 (not recommended).
459+
SCIMETA_VALIDATION_MAX_SIZE = 100 * 1024**2
460+
461+
# The action to perform for SciMeta objects received in MNStorage.create()
462+
# and MNStorage.update() larger than size set in SCIMETA_VALIDATION_MAX_SIZE.
463+
#
464+
# This setting applies only when SCIMETA_VALIDATION_ENABLED is set to True and
465+
# SCIMETA_VALIDATION_MAX_SIZE is not set to -1.
466+
#
467+
# - 'reject' (default): SciMeta over Max Size is rejected and GMN returns an
468+
# InvalidRequest with explanation to the client.
469+
# - 'accept': SciMeta over Max Size is accepted but not validated. This is not
470+
# recommended, as any invalid objects will later be rejected by the CN during
471+
# synchronization.
472+
SCIMETA_VALIDATION_OVER_SIZE_ACTION = 'reject'
473+
427474
# GMN implements a vendor specific extension for MNStorage.create(). Instead of
428475
# providing an object for GMN to manage, the object can be left empty and the
429476
# URL of the object on a 3rd party server be provided instead. In that case, GMN

0 commit comments

Comments
 (0)