Skip to content

Commit e97b74d

Browse files
feat: Neo4j 4.x support (#1942)
* added more configuration to support newer neo4j Signed-off-by: Allison Suarez Miranda <[email protected]> * added condition to all drivers and added db option Signed-off-by: Allison Suarez Miranda <[email protected]> * fix driver Signed-off-by: Allison Suarez Miranda <[email protected]> * implemented feedback Signed-off-by: Allison Suarez Miranda <[email protected]> * implemented feedback Signed-off-by: Allison Suarez Miranda <[email protected]> * Implemented more feedback Signed-off-by: Allison Suarez Miranda <[email protected]> * fix imports Signed-off-by: Allison Suarez Miranda <[email protected]> * fixed unit tests Signed-off-by: Allison Suarez Miranda <[email protected]> * fixing neo4j config Signed-off-by: Allison Suarez Miranda <[email protected]> * property patch Signed-off-by: Allison Suarez Miranda <[email protected]> * struggling with how to patch the driver creation method * fixed patching Signed-off-by: Allison Suarez Miranda <[email protected]> * removed unused imports Signed-off-by: Allison Suarez Miranda <[email protected]> * typing fix Signed-off-by: Allison Suarez Miranda <[email protected]> * missing an any in tuple Signed-off-by: Allison Suarez Miranda <[email protected]> * sort imports Signed-off-by: Allison Suarez Miranda <[email protected]> * check URI scheme security and set default trust and encrypted values accordingly Signed-off-by: Allison Suarez Miranda <[email protected]> * self.conf needed in neo4j extractor Signed-off-by: Allison Suarez Miranda <[email protected]> * updating unit tests Signed-off-by: Allison Suarez Miranda <[email protected]> * fix driver Signed-off-by: Allison Suarez Miranda <[email protected]> * fixed uri in neo4j search data extractor test Signed-off-by: Allison Suarez Miranda <[email protected]> * fix imports and lint Signed-off-by: Allison Suarez Miranda <[email protected]>
1 parent c3dab36 commit e97b74d

File tree

8 files changed

+170
-83
lines changed

8 files changed

+170
-83
lines changed

databuilder/databuilder/extractor/neo4j_extractor.py

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99

1010
import neo4j
1111
from neo4j import GraphDatabase
12+
from neo4j.api import (
13+
SECURITY_TYPE_SECURE, SECURITY_TYPE_SELF_SIGNED_CERTIFICATE, parse_neo4j_uri,
14+
)
1215
from pyhocon import ConfigFactory, ConfigTree
1316

1417
from databuilder.extractor.base_extractor import Extractor
@@ -25,15 +28,19 @@ class Neo4jExtractor(Extractor):
2528
MODEL_CLASS_CONFIG_KEY = 'model_class'
2629
NEO4J_AUTH_USER = 'neo4j_auth_user'
2730
NEO4J_AUTH_PW = 'neo4j_auth_pw'
31+
# in Neo4j (v4.0+), we can create and use more than one active database at the same time
32+
NEO4J_DATABASE_NAME = 'neo4j_database'
2833
NEO4J_MAX_CONN_LIFE_TIME_SEC = 'neo4j_max_conn_life_time_sec'
2934
NEO4J_ENCRYPTED = 'neo4j_encrypted'
3035
"""NEO4J_ENCRYPTED is a boolean indicating whether to use SSL/TLS when connecting."""
3136
NEO4J_VALIDATE_SSL = 'neo4j_validate_ssl'
3237
"""NEO4J_VALIDATE_SSL is a boolean indicating whether to validate the server's SSL/TLS cert against system CAs."""
38+
NEO4J_DRIVER = 'neo4j_driver'
3339

34-
DEFAULT_CONFIG = ConfigFactory.from_dict({NEO4J_MAX_CONN_LIFE_TIME_SEC: 50,
35-
NEO4J_ENCRYPTED: True,
36-
NEO4J_VALIDATE_SSL: False})
40+
DEFAULT_CONFIG = ConfigFactory.from_dict({
41+
NEO4J_MAX_CONN_LIFE_TIME_SEC: 50,
42+
NEO4J_DATABASE_NAME: neo4j.DEFAULT_DATABASE
43+
})
3744

3845
def init(self, conf: ConfigTree) -> None:
3946
"""
@@ -43,8 +50,36 @@ def init(self, conf: ConfigTree) -> None:
4350
self.conf = conf.with_fallback(Neo4jExtractor.DEFAULT_CONFIG)
4451
self.graph_url = conf.get_string(Neo4jExtractor.GRAPH_URL_CONFIG_KEY)
4552
self.cypher_query = conf.get_string(Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY)
46-
self.driver = self._get_driver()
47-
53+
self.db_name = self.conf.get_string(Neo4jExtractor.NEO4J_DATABASE_NAME)
54+
driver = conf.get(Neo4jExtractor.NEO4J_DRIVER, None)
55+
if driver:
56+
self.driver = driver
57+
else:
58+
uri = conf.get_string(Neo4jExtractor.GRAPH_URL_CONFIG_KEY)
59+
driver_args = {
60+
'uri': uri,
61+
'max_connection_lifetime': self.conf.get_int(Neo4jExtractor.NEO4J_MAX_CONN_LIFE_TIME_SEC),
62+
'auth': (conf.get_string(Neo4jExtractor.NEO4J_AUTH_USER),
63+
conf.get_string(Neo4jExtractor.NEO4J_AUTH_PW)),
64+
}
65+
66+
# if the URI scheme is not secure, set `trust` and `encrypted` to default values
67+
# https://neo4j.com/docs/api/python-driver/current/api.html#uri
68+
_, security_type, _ = parse_neo4j_uri(uri=uri)
69+
if security_type not in [SECURITY_TYPE_SELF_SIGNED_CERTIFICATE, SECURITY_TYPE_SECURE]:
70+
default_security_conf = {'trust': neo4j.TRUST_ALL_CERTIFICATES, 'encrypted': True}
71+
driver_args.update(default_security_conf)
72+
73+
# if NEO4J_VALIDATE_SSL or NEO4J_ENCRYPTED are set in config pass them to the driver
74+
validate_ssl_conf = conf.get(Neo4jExtractor.NEO4J_VALIDATE_SSL, None)
75+
encrypted_conf = conf.get(Neo4jExtractor.NEO4J_ENCRYPTED, None)
76+
if validate_ssl_conf is not None:
77+
driver_args['trust'] = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if validate_ssl_conf \
78+
else neo4j.TRUST_ALL_CERTIFICATES
79+
if encrypted_conf is not None:
80+
driver_args['encrypted'] = encrypted_conf
81+
82+
self.driver = GraphDatabase.driver(**driver_args)
4883
self._extract_iter: Union[None, Iterator] = None
4984

5085
model_class = conf.get(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY, None)
@@ -62,20 +97,6 @@ def close(self) -> None:
6297
except Exception as e:
6398
LOGGER.error("Exception encountered while closing the graph driver", e)
6499

65-
def _get_driver(self) -> Any:
66-
"""
67-
Create a Neo4j connection to Database
68-
"""
69-
trust = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if self.conf.get_bool(Neo4jExtractor.NEO4J_VALIDATE_SSL) \
70-
else neo4j.TRUST_ALL_CERTIFICATES
71-
return GraphDatabase.driver(uri=self.graph_url,
72-
max_connection_lifetime=self.conf.get_int(
73-
Neo4jExtractor.NEO4J_MAX_CONN_LIFE_TIME_SEC),
74-
auth=(self.conf.get_string(Neo4jExtractor.NEO4J_AUTH_USER),
75-
self.conf.get_string(Neo4jExtractor.NEO4J_AUTH_PW)),
76-
encrypted=self.conf.get_bool(Neo4jExtractor.NEO4J_ENCRYPTED),
77-
trust=trust)
78-
79100
def _execute_query(self, tx: Any) -> Any:
80101
"""
81102
Create an iterator to execute sql.
@@ -88,7 +109,9 @@ def _get_extract_iter(self) -> Iterator[Any]:
88109
"""
89110
Execute {cypher_query} and yield result one at a time
90111
"""
91-
with self.driver.session() as session:
112+
with self.driver.session(
113+
database=self.db_name
114+
) as session:
92115
if not hasattr(self, 'results'):
93116
self.results = session.read_transaction(self._execute_query)
94117

databuilder/databuilder/publisher/neo4j_csv_publisher.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
import pandas
1717
from jinja2 import Template
1818
from neo4j import GraphDatabase, Transaction
19+
from neo4j.api import (
20+
SECURITY_TYPE_SECURE, SECURITY_TYPE_SELF_SIGNED_CERTIFICATE, parse_neo4j_uri,
21+
)
1922
from neo4j.exceptions import Neo4jError, TransientError
2023
from pyhocon import ConfigFactory, ConfigTree
2124

@@ -51,13 +54,17 @@
5154

5255
NEO4J_USER = 'neo4j_user'
5356
NEO4J_PASSWORD = 'neo4j_password'
57+
# in Neo4j (v4.0+), we can create and use more than one active database at the same time
58+
NEO4J_DATABASE_NAME = 'neo4j_database'
59+
60+
NEO4J_DRIVER = 'neo4j_driver'
61+
5462
# NEO4J_ENCRYPTED is a boolean indicating whether to use SSL/TLS when connecting
5563
NEO4J_ENCRYPTED = 'neo4j_encrypted'
5664
# NEO4J_VALIDATE_SSL is a boolean indicating whether to validate the server's SSL/TLS
5765
# cert against system CAs
5866
NEO4J_VALIDATE_SSL = 'neo4j_validate_ssl'
5967

60-
6168
# This will be used to provide unique tag to the node and relationship
6269
JOB_PUBLISH_TAG = 'job_publish_tag'
6370

@@ -109,8 +116,7 @@
109116
NEO4J_PROGRESS_REPORT_FREQUENCY: 500,
110117
NEO4J_RELATIONSHIP_CREATION_CONFIRM: False,
111118
NEO4J_MAX_CONN_LIFE_TIME_SEC: 50,
112-
NEO4J_ENCRYPTED: True,
113-
NEO4J_VALIDATE_SSL: False,
119+
NEO4J_DATABASE_NAME: neo4j.DEFAULT_DATABASE,
114120
ADDITIONAL_FIELDS: {},
115121
ADD_PUBLISHER_METADATA: True,
116122
RELATION_PREPROCESSOR: NoopRelationPreprocessor()})
@@ -148,16 +154,39 @@ def init(self, conf: ConfigTree) -> None:
148154
self._relation_files = self._list_files(conf, RELATION_FILES_DIR)
149155
self._relation_files_iter = iter(self._relation_files)
150156

151-
trust = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if conf.get_bool(NEO4J_VALIDATE_SSL) \
152-
else neo4j.TRUST_ALL_CERTIFICATES
153-
self._driver = \
154-
GraphDatabase.driver(uri=conf.get_string(NEO4J_END_POINT_KEY),
155-
max_connection_lifetime=conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
156-
auth=(conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)),
157-
encrypted=conf.get_bool(NEO4J_ENCRYPTED),
158-
trust=trust)
157+
driver = conf.get(NEO4J_DRIVER, None)
158+
if driver:
159+
self._driver = driver
160+
else:
161+
uri = conf.get_string(NEO4J_END_POINT_KEY)
162+
driver_args = {
163+
'uri': uri,
164+
'max_connection_lifetime': conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
165+
'auth': (conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)),
166+
}
167+
168+
# if the URI scheme is not secure, set `trust` and `encrypted` to default values
169+
# https://neo4j.com/docs/api/python-driver/current/api.html#uri
170+
_, security_type, _ = parse_neo4j_uri(uri=uri)
171+
if security_type not in [SECURITY_TYPE_SELF_SIGNED_CERTIFICATE, SECURITY_TYPE_SECURE]:
172+
default_security_conf = {'trust': neo4j.TRUST_ALL_CERTIFICATES, 'encrypted': True}
173+
driver_args.update(default_security_conf)
174+
175+
# if NEO4J_VALIDATE_SSL or NEO4J_ENCRYPTED are set in config pass them to the driver
176+
validate_ssl_conf = conf.get(NEO4J_VALIDATE_SSL, None)
177+
encrypted_conf = conf.get(NEO4J_ENCRYPTED, None)
178+
if validate_ssl_conf is not None:
179+
driver_args['trust'] = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if validate_ssl_conf \
180+
else neo4j.TRUST_ALL_CERTIFICATES
181+
if encrypted_conf is not None:
182+
driver_args['encrypted'] = encrypted_conf
183+
184+
self._driver = GraphDatabase.driver(**driver_args)
185+
186+
self._db_name = conf.get_string(NEO4J_DATABASE_NAME)
187+
self._session = self._driver.session(database=self._db_name)
188+
159189
self._transaction_size = conf.get_int(NEO4J_TRANSACTION_SIZE)
160-
self._session = self._driver.session()
161190
self._confirm_rel_created = conf.get_bool(NEO4J_RELATIONSHIP_CREATION_CONFIRM)
162191

163192
# config is list of node label.
@@ -488,7 +517,7 @@ def _try_create_index(self, label: str) -> None:
488517
""").render(LABEL=label)
489518

490519
LOGGER.info(f'Trying to create index for label {label} if not exist: {stmt}')
491-
with self._driver.session() as session:
520+
with self._driver.session(self._db_name) as session:
492521
try:
493522
session.run(stmt)
494523
except Neo4jError as e:

databuilder/databuilder/task/neo4j_staleness_removal_task.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010

1111
import neo4j
1212
from neo4j import GraphDatabase
13+
from neo4j.api import (
14+
SECURITY_TYPE_SECURE, SECURITY_TYPE_SELF_SIGNED_CERTIFICATE, parse_neo4j_uri,
15+
)
1316
from pyhocon import ConfigFactory, ConfigTree
1417

1518
from databuilder import Scoped
@@ -21,11 +24,13 @@
2124
NEO4J_MAX_CONN_LIFE_TIME_SEC = 'neo4j_max_conn_life_time_sec'
2225
NEO4J_USER = 'neo4j_user'
2326
NEO4J_PASSWORD = 'neo4j_password'
27+
# in Neo4j (v4.0+), we can create and use more than one active database at the same time
28+
NEO4J_DATABASE_NAME = 'neo4j_database'
29+
NEO4J_DRIVER = 'neo4j_driver'
2430
NEO4J_ENCRYPTED = 'neo4j_encrypted'
2531
"""NEO4J_ENCRYPTED is a boolean indicating whether to use SSL/TLS when connecting."""
2632
NEO4J_VALIDATE_SSL = 'neo4j_validate_ssl'
2733
"""NEO4J_VALIDATE_SSL is a boolean indicating whether to validate the server's SSL/TLS cert against system CAs."""
28-
2934
TARGET_NODES = "target_nodes"
3035
TARGET_RELATIONS = "target_relations"
3136
BATCH_SIZE = "batch_size"
@@ -41,8 +46,7 @@
4146

4247
DEFAULT_CONFIG = ConfigFactory.from_dict({BATCH_SIZE: 100,
4348
NEO4J_MAX_CONN_LIFE_TIME_SEC: 50,
44-
NEO4J_ENCRYPTED: True,
45-
NEO4J_VALIDATE_SSL: False,
49+
NEO4J_DATABASE_NAME: neo4j.DEFAULT_DATABASE,
4650
STALENESS_MAX_PCT: 5,
4751
TARGET_NODES: [],
4852
TARGET_RELATIONS: [],
@@ -127,14 +131,36 @@ def init(self, conf: ConfigTree) -> None:
127131
else:
128132
self.marker = conf.get_string(JOB_PUBLISH_TAG)
129133

130-
trust = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if conf.get_bool(NEO4J_VALIDATE_SSL) \
131-
else neo4j.TRUST_ALL_CERTIFICATES
132-
self._driver = \
133-
GraphDatabase.driver(uri=conf.get_string(NEO4J_END_POINT_KEY),
134-
max_connection_lifetime=conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
135-
auth=(conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)),
136-
encrypted=conf.get_bool(NEO4J_ENCRYPTED),
137-
trust=trust)
134+
driver = conf.get(NEO4J_DRIVER, None)
135+
if driver:
136+
self._driver = driver
137+
else:
138+
uri = conf.get_string(NEO4J_END_POINT_KEY)
139+
driver_args = {
140+
'uri': uri,
141+
'max_connection_lifetime': conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
142+
'auth': (conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)),
143+
}
144+
145+
# if the URI scheme is not secure, set `trust` and `encrypted` to default values
146+
# https://neo4j.com/docs/api/python-driver/current/api.html#uri
147+
_, security_type, _ = parse_neo4j_uri(uri=uri)
148+
if security_type not in [SECURITY_TYPE_SELF_SIGNED_CERTIFICATE, SECURITY_TYPE_SECURE]:
149+
default_security_conf = {'trust': neo4j.TRUST_ALL_CERTIFICATES, 'encrypted': True}
150+
driver_args.update(default_security_conf)
151+
152+
# if NEO4J_VALIDATE_SSL or NEO4J_ENCRYPTED are set in config pass them to the driver
153+
validate_ssl_conf = conf.get(NEO4J_VALIDATE_SSL, None)
154+
encrypted_conf = conf.get(NEO4J_ENCRYPTED, None)
155+
if validate_ssl_conf is not None:
156+
driver_args['trust'] = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if validate_ssl_conf \
157+
else neo4j.TRUST_ALL_CERTIFICATES
158+
if encrypted_conf is not None:
159+
driver_args['encrypted'] = encrypted_conf
160+
161+
self._driver = GraphDatabase.driver(**driver_args)
162+
163+
self.db_name = conf.get(NEO4J_DATABASE_NAME)
138164

139165
def run(self) -> None:
140166
"""
@@ -304,7 +330,7 @@ def _execute_cypher_query(self,
304330

305331
start = time.time()
306332
try:
307-
with self._driver.session() as session:
333+
with self._driver.session(database=self.db_name) as session:
308334
result = session.run(statement, **param_dict)
309335
return [record for record in result]
310336

databuilder/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from setuptools import find_packages, setup
77

8-
__version__ = '7.0.0'
8+
__version__ = '7.1.0'
99

1010
requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
1111
'requirements.txt')

databuilder/tests/unit/extractor/test_neo4j_extractor.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import Any
66

77
from mock import patch
8+
from neo4j import GraphDatabase
89
from pyhocon import ConfigFactory
910

1011
from databuilder import Scoped
@@ -16,10 +17,11 @@ class TestNeo4jExtractor(unittest.TestCase):
1617

1718
def setUp(self) -> None:
1819
config_dict = {
19-
f'extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'TEST_GRAPH_URL',
20+
f'extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'bolt://example.com:7687',
2021
f'extractor.neo4j.{Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY}': 'TEST_QUERY',
2122
f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 'TEST_USER',
22-
f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'TEST_PW'
23+
f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'TEST_PW',
24+
f'extractor.neo4j.{Neo4jExtractor.NEO4J_MAX_CONN_LIFE_TIME_SEC}': 50,
2325
}
2426

2527
self.conf = ConfigFactory.from_dict(config_dict)
@@ -28,7 +30,7 @@ def text_extraction_with_empty_query_result(self: Any) -> None:
2830
"""
2931
Test Extraction with empty results from query
3032
"""
31-
with patch.object(Neo4jExtractor, '_get_driver'):
33+
with patch.object(GraphDatabase, 'driver'):
3234
extractor = Neo4jExtractor()
3335
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
3436
scope=extractor.get_scope()))
@@ -41,7 +43,7 @@ def test_extraction_with_single_query_result(self: Any) -> None:
4143
"""
4244
Test Extraction with single result from query
4345
"""
44-
with patch.object(Neo4jExtractor, '_get_driver'):
46+
with patch.object(GraphDatabase, 'driver'):
4547
extractor = Neo4jExtractor()
4648
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
4749
scope=extractor.get_scope()))
@@ -58,7 +60,7 @@ def test_extraction_with_multiple_query_result(self: Any) -> None:
5860
"""
5961
Test Extraction with multiple result from query
6062
"""
61-
with patch.object(Neo4jExtractor, '_get_driver'):
63+
with patch.object(GraphDatabase, 'driver'):
6264
extractor = Neo4jExtractor()
6365
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
6466
scope=extractor.get_scope()))
@@ -83,17 +85,18 @@ def test_extraction_with_model_class(self: Any) -> None:
8385
Test Extraction using model class
8486
"""
8587
config_dict = {
86-
f'extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'TEST_GRAPH_URL',
88+
f'extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': 'bolt://example.com:7687',
8789
f'extractor.neo4j.{Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY}': 'TEST_QUERY',
8890
f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': 'TEST_USER',
8991
f'extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': 'TEST_PW',
92+
f'extractor.neo4j.{Neo4jExtractor.NEO4J_MAX_CONN_LIFE_TIME_SEC}': 50,
9093
f'extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
9194
'databuilder.models.table_elasticsearch_document.TableESDocument'
9295
}
9396

9497
self.conf = ConfigFactory.from_dict(config_dict)
9598

96-
with patch.object(Neo4jExtractor, '_get_driver'):
99+
with patch.object(GraphDatabase, 'driver'):
97100
extractor = Neo4jExtractor()
98101
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
99102
scope=extractor.get_scope()))

0 commit comments

Comments
 (0)