From 700bda2657510ec36de133b536e2c407980e353c Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Fri, 17 Jun 2016 11:52:26 -0400 Subject: [PATCH 1/7] PEDANTRY: Sort requirements.txt. --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a37f44c..d476c99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -tox>=2.3 SQLAlchemy>=1.0.5 alembic>=0.7.6 +click>=3.3 psycopg2>=2.6.1 requests>=2.7.0 six>=1.9.0 -click>=3.3 +tox>=2.3 From d37982cdb31d2d5c398d8ce15620b683c351b9b0 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 15 Jun 2016 15:37:16 -0400 Subject: [PATCH 2/7] ENH: Add encryption support. Adds new configurable ``crypto`` attributes to PostgresContentsManager and PostgresCheckpoints. ``crypto`` objects must provide ``encrypt`` and ``decrypt`` methods, which will be called on data being written to and read from the database, respectively. The default ``crypto`` implementation is ``NoEncryption``, which returns its inputs unchanged. The built-in ``crypto`` implementation is ``FernetEncryption``, which uses ``cryptography.fernet`` to implement symmetric-key encryption of all notebooks and files in the database. --- pgcontents/api_utils.py | 19 +++- pgcontents/checkpoints.py | 19 +++- pgcontents/crypto.py | 67 +++++++++++++ pgcontents/db_utils.py | 37 +++++-- pgcontents/error.py | 4 + pgcontents/managerbase.py | 23 ++++- pgcontents/pgmanager.py | 56 +++++++---- pgcontents/query.py | 107 ++++++++++++++------ pgcontents/tests/test_hybrid_manager.py | 3 + pgcontents/tests/test_pgcontents_api.py | 118 ++++++++++++++++++++--- pgcontents/tests/test_pgmanager.py | 59 ++++++++++-- pgcontents/tests/test_synchronization.py | 2 + pgcontents/tests/utils.py | 6 ++ pgcontents/utils/ipycompat.py | 3 + pgcontents/utils/sync.py | 3 +- requirements.txt | 1 + 16 files changed, 442 insertions(+), 85 deletions(-) create mode 100644 pgcontents/crypto.py diff --git a/pgcontents/api_utils.py b/pgcontents/api_utils.py index d5daa19..3911ba8 100644 --- a/pgcontents/api_utils.py +++ b/pgcontents/api_utils.py @@ -12,7 +12,7 @@ import posixpath from tornado.web import HTTPError -from .error import PathOutsideRoot +from .error import CorruptedFile, PathOutsideRoot from .utils.ipycompat import reads, writes NBFORMAT_VERSION = 4 @@ -117,7 +117,10 @@ def reads_base64(nb, as_version=NBFORMAT_VERSION): """ Read a notebook from base64. """ - return reads(b64decode(nb).decode('utf-8'), as_version=as_version) + try: + return reads(b64decode(nb).decode('utf-8'), as_version=as_version) + except Exception as e: + raise CorruptedFile(e) def _decode_text_from_base64(path, bcontent): @@ -161,7 +164,17 @@ def from_b64(path, bcontent, format): 'text': _decode_text_from_base64, None: _decode_unknown_from_base64, } - content, real_format = decoders[format](path, bcontent) + + try: + content, real_format = decoders[format](path, bcontent) + except HTTPError: + # Pass through HTTPErrors, since we intend for them to bubble all the + # way back to the API layer. + raise + except Exception as e: + # Anything else should be wrapped in a CorruptedFile, since it likely + # indicates misconfiguration of encryption. 
+ raise CorruptedFile(e) default_mimes = { 'text': 'text/plain', diff --git a/pgcontents/checkpoints.py b/pgcontents/checkpoints.py index 13d4e4d..324fba5 100644 --- a/pgcontents/checkpoints.py +++ b/pgcontents/checkpoints.py @@ -40,7 +40,14 @@ def create_notebook_checkpoint(self, nb, path): """ b64_content = writes_base64(nb) with self.engine.begin() as db: - return save_remote_checkpoint(db, self.user_id, path, b64_content) + return save_remote_checkpoint( + db, + self.user_id, + path, + b64_content, + self.crypto.encrypt, + self.max_file_size_bytes, + ) @outside_root_to_404 def create_file_checkpoint(self, content, format, path): @@ -53,7 +60,14 @@ def create_file_checkpoint(self, content, format, path): except ValueError as e: self.do_400(str(e)) with self.engine.begin() as db: - return save_remote_checkpoint(db, self.user_id, path, b64_content) + return save_remote_checkpoint( + db, + self.user_id, + path, + b64_content, + self.crypto.encrypt, + self.max_file_size_bytes, + ) @outside_root_to_404 def delete_checkpoint(self, checkpoint_id, path): @@ -71,6 +85,7 @@ def _get_checkpoint(self, checkpoint_id, path): self.user_id, path, checkpoint_id, + self.crypto.decrypt, )['content'] @outside_root_to_404 diff --git a/pgcontents/crypto.py b/pgcontents/crypto.py new file mode 100644 index 0000000..6998b98 --- /dev/null +++ b/pgcontents/crypto.py @@ -0,0 +1,67 @@ +""" +Interface definition for encryption/decryption plugins for +PostgresContentsManager. + +Encryption backends should raise pgcontents.error.CorruptedFile if they +encounter an input that they cannot decrypt. +""" +from .error import CorruptedFile + + +class NoEncryption(object): + """ + No-op encryption backend. + + encrypt() and decrypt() simply return their inputs. + + Methods + ------- + encrypt : callable[bytes -> bytes] + decrypt : callable[bytes -> bytes] + """ + def encrypt(self, b): + return b + + def decrypt(self, b): + return b + + +class FernetEncryption(object): + """ + Notebook encryption using cryptography.fernet for symmetric-key encryption. + + Parameters + ---------- + fernet : cryptography.fernet.Fernet + The Fernet object to use for encryption. + + Methods + ------- + encrypt : callable[bytes -> bytes] + decrypt : callable[bytes -> bytes] + """ + __slots__ = ('_fernet',) + + def __init__(self, fernet): + self._fernet = fernet + + def encrypt(self, s): + return self._fernet.encrypt(s) + + def decrypt(self, s): + try: + return self._fernet.decrypt(s) + except Exception as e: + raise CorruptedFile(e) + + def __copy__(self, memo): + # Any value that appears in an IPython/Jupyter Config object needs to + # be deepcopy-able. Cryptography's Fernet objects aren't deepcopy-able, + # so we copy our underlying state to a new FernetEncryption object. + return FernetEncryption(self._fernet) + + def __deepcopy__(self, memo): + # Any value that appears in an IPython/Jupyter Config object needs to + # be deepcopy-able. Cryptography's Fernet objects aren't deepcopy-able, + # so we copy our underlying state to a new FernetEncryption object. 
+ return FernetEncryption(self._fernet) diff --git a/pgcontents/db_utils.py b/pgcontents/db_utils.py index 2be0aca..91a68d4 100644 --- a/pgcontents/db_utils.py +++ b/pgcontents/db_utils.py @@ -17,7 +17,7 @@ """ from contextlib import contextmanager -from six.moves import zip +from six.moves import map, zip from psycopg2.errorcodes import ( FOREIGN_KEY_VIOLATION, @@ -65,14 +65,37 @@ def _get_name(column_like): return column_like.clause.name -def to_dict(fields, row): +def to_dict_no_content(fields, row): """ - Convert a SQLAlchemy row to a dict. + Convert a SQLAlchemy row that does not contain a 'content' field to a dict. If row is None, return None. + + Raises AssertionError if there is a field named 'content' in ``fields``. + """ + assert(len(fields) == len(row)) + + field_names = list(map(_get_name, fields)) + assert 'content' not in field_names, "Unexpected content field." + + return dict(zip(field_names, row)) + + +def to_dict_with_content(fields, row, decrypt_func): + """ + Convert a SQLAlchemy row that contains a 'content' field to a dict. + + ``decrypt_func`` will be applied to the ``content`` field of the row. + + If row is None, return None. + + Raises AssertionError if there is no field named 'content' in ``fields``. """ assert(len(fields) == len(row)) - return { - _get_name(field): value - for field, value in zip(fields, row) - } + + field_names = list(map(_get_name, fields)) + assert 'content' in field_names, "Missing content field." + + result = dict(zip(field_names, row)) + result['content'] = decrypt_func(result['content']) + return result diff --git a/pgcontents/error.py b/pgcontents/error.py index 2ba000a..46113be 100644 --- a/pgcontents/error.py +++ b/pgcontents/error.py @@ -37,3 +37,7 @@ class FileTooLarge(Exception): class RenameRoot(Exception): pass + + +class CorruptedFile(Exception): + pass diff --git a/pgcontents/managerbase.py b/pgcontents/managerbase.py index 9df3f09..e1f97aa 100644 --- a/pgcontents/managerbase.py +++ b/pgcontents/managerbase.py @@ -8,15 +8,16 @@ from sqlalchemy.engine.base import Engine from tornado.web import HTTPError +from .constants import UNLIMITED +from .crypto import NoEncryption from .query import ensure_db_user -from .utils.ipycompat import Bool, Instance, HasTraits, Unicode +from .utils.ipycompat import Any, Bool, Instance, Integer, HasTraits, Unicode class PostgresManagerMixin(HasTraits): """ - Shared for Postgres-backed ContentsManagers. + Shared behavior for Postgres-backed ContentsManagers. """ - db_url = Unicode( default_value="postgresql://{user}@/pgcontents".format( user=getuser(), @@ -38,6 +39,22 @@ class PostgresManagerMixin(HasTraits): help="Create a user for user_id automatically?", ) + max_file_size_bytes = Integer( + default_value=UNLIMITED, + config=True, + help="Maximum size in bytes of a file that will be saved.", + ) + + crypto = Any( + default_value=NoEncryption(), + allow_none=False, + config=True, + help=( + "Object with encrypt() and decrypt() methods to " + "call on data entering/exiting the database.", + ) + ) + engine = Instance(Engine) def _engine_default(self): diff --git a/pgcontents/pgmanager.py b/pgcontents/pgmanager.py index bff7e28..3faf756 100644 --- a/pgcontents/pgmanager.py +++ b/pgcontents/pgmanager.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -PostgreSQL implementation of IPython ContentsManager API. +PostgreSQL implementation of IPython/Jupyter ContentsManager API. 
""" from __future__ import unicode_literals from itertools import chain @@ -30,8 +30,8 @@ writes_base64, ) from .checkpoints import PostgresCheckpoints -from .constants import UNLIMITED from .error import ( + CorruptedFile, DirectoryExists, DirectoryNotEmpty, FileExists, @@ -56,7 +56,7 @@ rename_file, save_file, ) -from .utils.ipycompat import Bool, ContentsManager, Integer, from_dict +from .utils.ipycompat import Bool, ContentsManager, from_dict class PostgresContentsManager(PostgresManagerMixin, ContentsManager): @@ -64,12 +64,6 @@ class PostgresContentsManager(PostgresManagerMixin, ContentsManager): ContentsManager that persists to a postgres database rather than to the local filesystem. """ - max_file_size_bytes = Integer( - default_value=UNLIMITED, - config=True, - help="Maximum size in bytes of a file that will be saved.", - ) - create_directory_on_startup = Bool( config=True, help="Create a user for user_id automatically?", @@ -79,14 +73,14 @@ def _checkpoints_class_default(self): return PostgresCheckpoints def _checkpoints_kwargs_default(self): - kw = super(PostgresContentsManager, self)._checkpoints_kwargs_default() - kw.update( - { - 'db_url': self.db_url, - 'user_id': self.user_id, - 'create_user_on_startup': self.create_user_on_startup, - } - ) + kw = super(PostgresContentsManager, self)._checkpoint_kwargs_default() + kw.update({ + 'create_user_on_startup': self.create_user_on_startup, + 'crypto': self.crypto, + 'db_url': self.db_url, + 'max_file_size_bytes': self.max_file_size_bytes, + 'user_id': self.user_id, + }) return kw def _create_directory_on_startup_default(self): @@ -149,7 +143,15 @@ def get(self, path, content=True, type=None, format=None): }[type] except KeyError: raise ValueError("Unknown type passed: '{}'".format(type)) - return fn(path=path, content=content, format=format) + + try: + return fn(path=path, content=content, format=format) + except CorruptedFile as e: + self.log.error( + u'Corrupted file encountered at path %r. %s', + path, e, exc_info=True, + ) + self.do_500("Unable to read stored content at path %r." % path) @outside_root_to_404 def get_file_id(self, path): @@ -171,7 +173,13 @@ def _get_notebook(self, path, content, format): """ with self.engine.begin() as db: try: - record = get_file(db, self.user_id, path, content) + record = get_file( + db, + self.user_id, + path, + content, + self.crypto.decrypt, + ) except NoSuchFile: self.no_such_entity(path) @@ -265,7 +273,13 @@ def _file_model_from_db(self, record, content, format): def _get_file(self, path, content, format): with self.engine.begin() as db: try: - record = get_file(db, self.user_id, path, content) + record = get_file( + db, + self.user_id, + path, + content, + self.crypto.decrypt, + ) except NoSuchFile: if self.dir_exists(path): # TODO: It's awkward/expensive to have to check this to @@ -288,6 +302,7 @@ def _save_notebook(self, db, model, path): self.user_id, path, writes_base64(nb_contents), + self.crypto.encrypt, self.max_file_size_bytes, ) # It's awkward that this writes to the model instead of returning. 
@@ -303,6 +318,7 @@ def _save_file(self, db, model, path): self.user_id, path, to_b64(model['content'], model.get('format', None)), + self.crypto.encrypt, self.max_file_size_bytes, ) return None diff --git a/pgcontents/query.py b/pgcontents/query.py index 0afd024..4e6b9cd 100644 --- a/pgcontents/query.py +++ b/pgcontents/query.py @@ -25,7 +25,8 @@ ignore_unique_violation, is_unique_violation, is_foreign_key_violation, - to_dict, + to_dict_no_content, + to_dict_with_content, ) from .error import ( DirectoryNotEmpty, @@ -37,13 +38,40 @@ NoSuchFile, RenameRoot, ) -from .schema import( +from .schema import ( directories, files, remote_checkpoints, users, ) +# =============================== +# Encryption/Decryption Utilities +# =============================== + + +def preprocess_incoming_content(content, encrypt_func, max_size_bytes): + """ + Apply preprocessing steps to file/notebook content that we're going to + write to the database. + + Applies ``encrypt_func`` to ``content`` and checks that the result is + smaller than ``max_size_bytes``. + """ + encrypted = encrypt_func(content) + if max_size_bytes != UNLIMITED and len(encrypted) > max_size_bytes: + raise FileTooLarge() + return encrypted + + +def unused_decrypt_func(s): + """ + Used by invocations of ``get_file`` that don't expect decrypt_func to be + called. + """ + raise AssertionError("Unexpected decrypt call.") + + # ===== # Users # ===== @@ -201,7 +229,7 @@ def files_in_directory(db, user_id, db_dirname): files.c.user_id, files.c.parent_name, files.c.name, ) ) - return [to_dict(fields, row) for row in rows] + return [to_dict_no_content(fields, row) for row in rows] def directories_in_directory(db, user_id, db_dirname): @@ -216,7 +244,7 @@ def directories_in_directory(db, user_id, db_dirname): _is_in_directory(directories, user_id, db_dirname), ) ) - return [to_dict(fields, row) for row in rows] + return [to_dict_no_content(fields, row) for row in rows] def get_directory(db, user_id, api_dirname, content): @@ -300,7 +328,7 @@ def _file_default_fields(): ] -def _get_file(db, user_id, api_path, query_fields): +def _get_file(db, user_id, api_path, query_fields, decrypt_func): """ Get file data for the given user_id, path, and query_fields. The query_fields parameter specifies which database fields should be @@ -312,10 +340,14 @@ def _get_file(db, user_id, api_path, query_fields): if result is None: raise NoSuchFile(api_path) - return to_dict(query_fields, result) + + if files.c.content in query_fields: + return to_dict_with_content(query_fields, result, decrypt_func) + else: + return to_dict_no_content(query_fields, result) -def get_file(db, user_id, api_path, include_content): +def get_file(db, user_id, api_path, include_content, decrypt_func): """ Get file data for the given user_id and path. @@ -325,7 +357,7 @@ def get_file(db, user_id, api_path, include_content): if include_content: query_fields.append(files.c.content) - return _get_file(db, user_id, api_path, query_fields) + return _get_file(db, user_id, api_path, query_fields, decrypt_func) def get_file_id(db, user_id, api_path): @@ -333,7 +365,13 @@ def get_file_id(db, user_id, api_path): Get the value in the 'id' column for the file with the given user_id and path. """ - return _get_file(db, user_id, api_path, [files.c.id])['id'] + return _get_file( + db, + user_id, + api_path, + [files.c.id], + unused_decrypt_func, + )['id'] def delete_file(db, user_id, api_path): @@ -360,7 +398,13 @@ def file_exists(db, user_id, path): Check if a file exists. 
""" try: - get_file(db, user_id, path, include_content=False) + get_file( + db, + user_id, + path, + include_content=False, + decrypt_func=unused_decrypt_func, + ) return True except NoSuchFile: return False @@ -458,21 +502,17 @@ def rename_directory(db, user_id, old_api_path, new_api_path): ) -def check_content(content, max_size_bytes): - """ - Check that the content to be saved isn't too large to store. - """ - if max_size_bytes != UNLIMITED and len(content) > max_size_bytes: - raise FileTooLarge() - - -def save_file(db, user_id, path, content, max_size_bytes): +def save_file(db, user_id, path, content, encrypt_func, max_size_bytes): """ Save a file. TODO: Update-then-insert is probably cheaper than insert-then-update. """ - check_content(content, max_size_bytes) + content = preprocess_incoming_content( + content, + encrypt_func, + max_size_bytes, + ) directory, name = split_api_filepath(path) with db.begin_nested() as savepoint: try: @@ -556,7 +596,7 @@ def list_remote_checkpoints(db, user_id, api_path): ), ) - return [to_dict(fields, row) for row in results] + return [to_dict_no_content(fields, row) for row in results] def move_single_remote_checkpoint(db, @@ -597,7 +637,7 @@ def move_remote_checkpoints(db, user_id, src_api_path, dest_api_path): ) -def get_remote_checkpoint(db, user_id, api_path, checkpoint_id): +def get_remote_checkpoint(db, user_id, api_path, checkpoint_id, decrypt_func): db_path = from_api_filename(api_path) fields = [remote_checkpoints.c.content] result = db.execute( @@ -610,15 +650,25 @@ def get_remote_checkpoint(db, user_id, api_path, checkpoint_id): remote_checkpoints.c.id == int(checkpoint_id), ), ) - ).first() + ).first() # NOTE: This applies a LIMIT 1 to the query. if result is None: raise NoSuchCheckpoint(api_path, checkpoint_id) - return to_dict(fields, result) + return to_dict_with_content(fields, result, decrypt_func) -def save_remote_checkpoint(db, user_id, api_path, content): +def save_remote_checkpoint(db, + user_id, + api_path, + content, + encrypt_func, + max_size_bytes): + content = preprocess_incoming_content( + content, + encrypt_func, + max_size_bytes, + ) return_fields = _remote_checkpoint_default_fields() result = db.execute( remote_checkpoints.insert().values( @@ -630,7 +680,7 @@ def save_remote_checkpoint(db, user_id, api_path, content): ), ).first() - return to_dict(return_fields, result) + return to_dict_no_content(return_fields, result) def purge_remote_checkpoints(db, user_id): @@ -662,4 +712,7 @@ def latest_remote_checkpoints(db, user_id): remote_checkpoints.c.path, ) results = db.execute(query) - return (to_dict(query_fields, row) for row in results) + return ( + to_dict_no_content(query_fields, row) + for row in results + ) diff --git a/pgcontents/tests/test_hybrid_manager.py b/pgcontents/tests/test_hybrid_manager.py index c090eb7..38ab665 100644 --- a/pgcontents/tests/test_hybrid_manager.py +++ b/pgcontents/tests/test_hybrid_manager.py @@ -24,6 +24,7 @@ from .test_pgmanager import PostgresContentsManagerTestCase from .utils import ( assertRaisesHTTPError, + make_fernet, remigrate_test_schema, TEST_DB_URL, ) @@ -67,9 +68,11 @@ def make_dir(self, api_path): class PostgresTestCase(PostgresContentsManagerTestCase): def setUp(self): + self.crypto = make_fernet() self._pgmanager = PostgresContentsManager( user_id='test', db_url=TEST_DB_URL, + crypto=self.crypto, ) self._pgmanager.ensure_user() self._pgmanager.ensure_root_directory() diff --git a/pgcontents/tests/test_pgcontents_api.py b/pgcontents/tests/test_pgcontents_api.py index 
026de2d..804f266 100644 --- a/pgcontents/tests/test_pgcontents_api.py +++ b/pgcontents/tests/test_pgcontents_api.py @@ -27,6 +27,7 @@ from requests import HTTPError from ..constants import UNLIMITED +from ..crypto import FernetEncryption, NoEncryption from ..hybridmanager import HybridContentsManager from ..pgmanager import ( PostgresContentsManager, @@ -43,6 +44,7 @@ ) from .utils import ( clear_test_db, + make_fernet, _norm_unicode, remigrate_test_schema, TEST_DB_URL, @@ -162,12 +164,20 @@ def test_list_checkpoints_sorting(self): ) -class PostgresContentsAPITest(_APITestBase): - +def postgres_contents_config(): + """ + Shared setup code for PostgresContentsAPITest and subclasses. + """ config = Config() config.NotebookApp.contents_manager_class = PostgresContentsManager config.PostgresContentsManager.user_id = 'test' config.PostgresContentsManager.db_url = TEST_DB_URL + return config + + +class PostgresContentsAPITest(_APITestBase): + + config = postgres_contents_config() # Don't support hidden directories. hidden_dirs = [] @@ -196,6 +206,10 @@ def user_id(self): def engine(self): return self.pg_manager.engine + @property + def crypto(self): + return self.pg_manager.crypto + # Superclass method overrides. def make_dir(self, api_path): with self.engine.begin() as db: @@ -208,16 +222,31 @@ def make_txt(self, api_path, txt): self.user_id, api_path, b64encode(txt.encode('utf-8')), + self.crypto.encrypt, UNLIMITED, ) def make_blob(self, api_path, blob): with self.engine.begin() as db: - save_file(db, self.user_id, api_path, b64encode(blob), UNLIMITED) + save_file( + db, + self.user_id, + api_path, + b64encode(blob), + self.crypto.encrypt, + UNLIMITED, + ) def make_nb(self, api_path, nb): with self.engine.begin() as db: - save_file(db, self.user_id, api_path, writes_base64(nb), UNLIMITED) + save_file( + db, + self.user_id, + api_path, + writes_base64(nb), + self.crypto.encrypt, + UNLIMITED, + ) def delete_dir(self, api_path, db=None): if self.isdir(api_path): @@ -257,9 +286,28 @@ def test_mkdir_hidden_400(self): def test_checkpoints_separate_root(self): pass + def test_crypto_types(self): + self.assertIsInstance(self.pg_manager.crypto, NoEncryption) + self.assertIsInstance(self.pg_manager.checkpoints.crypto, NoEncryption) -class PostgresContentsFileCheckpointsAPITest(PostgresContentsAPITest): +class EncryptedPostgresContentsAPITest(PostgresContentsAPITest): + config = postgres_contents_config() + # The key for this needs to be a b64-encoded 32-byte string. + config.PostgresContentsManager.crypto = make_fernet() + + def test_crypto_types(self): + self.assertIsInstance(self.pg_manager.crypto, FernetEncryption) + self.assertIsInstance( + self.pg_manager.checkpoints.crypto, + FernetEncryption, + ) + + +class PostgresContentsFileCheckpointsAPITest(PostgresContentsAPITest): + """ + Test using PostgresContents and FileCheckpoints. + """ config = Config() config.NotebookApp.contents_manager_class = PostgresContentsManager config.PostgresContentsManager.checkpoints_class = GenericFileCheckpoints @@ -281,17 +329,25 @@ def teardown_class(cls): cls.td.cleanup() -class PostgresCheckpointsAPITest(_APITestBase): +def postgres_checkpoints_config(): """ - Test using PostgresCheckpoints with the built-in FileContentsManager. + Shared setup for PostgresCheckpointsAPITest and subclasses. 
""" - config = Config() config.NotebookApp.contents_manager_class = FileContentsManager config.ContentsManager.checkpoints_class = PostgresCheckpoints config.PostgresCheckpoints.user_id = 'test' config.PostgresCheckpoints.db_url = TEST_DB_URL + return config + + +class PostgresCheckpointsAPITest(_APITestBase): + """ + Test using PostgresCheckpoints with the built-in FileContentsManager. + """ + config = postgres_checkpoints_config() + @property def checkpoints(self): return self.notebook.contents_manager.checkpoints @@ -312,6 +368,14 @@ def test_checkpoints_separate_root(self): pass +class EncryptedPostgresCheckpointsAPITest(PostgresCheckpointsAPITest): + config = postgres_checkpoints_config() + config.PostgresCheckpoints.crypto = make_fernet() + + def test_crypto_types(self): + self.assertIsInstance(self.checkpoints.crypto, FernetEncryption) + + class HybridContentsPGRootAPITest(PostgresContentsAPITest): """ Test using a HybridContentsManager splitting between files and Postgres. @@ -320,19 +384,23 @@ class HybridContentsPGRootAPITest(PostgresContentsAPITest): files_test_cls = APITest @classmethod - def setup_class(cls): - cls.td = TemporaryDirectory() - - cls.config = Config() - cls.config.NotebookApp.contents_manager_class = HybridContentsManager - cls.config.HybridContentsManager.manager_classes = { + def make_config(cls, td): + config = Config() + config.NotebookApp.contents_manager_class = HybridContentsManager + config.HybridContentsManager.manager_classes = { '': PostgresContentsManager, cls.files_prefix: FileContentsManager, } - cls.config.HybridContentsManager.manager_kwargs = { + config.HybridContentsManager.manager_kwargs = { '': {'user_id': 'test', 'db_url': TEST_DB_URL}, - cls.files_prefix: {'root_dir': cls.td.name}, + cls.files_prefix: {'root_dir': td.name}, } + return config + + @classmethod + def setup_class(cls): + cls.td = TemporaryDirectory() + cls.config = cls.make_config(cls.td) super(HybridContentsPGRootAPITest, cls).setup_class() @property @@ -379,6 +447,7 @@ def _method(self, api_path, *args): l[method_name] = __api_path_dispatch(method_name) del __methods_to_multiplex del __api_path_dispatch + del l # Override to not delete the root of the file subsystem. def test_delete_dirs(self): @@ -400,5 +469,22 @@ def test_delete_dirs(self): self.assertEqual(listing[0]['path'], self.files_prefix) +class EncryptedHybridContentsAPITest(HybridContentsPGRootAPITest): + + @classmethod + def make_config(cls, td): + config = super(EncryptedHybridContentsAPITest, cls).make_config(td) + config.HybridContentsManager.manager_kwargs['']['crypto'] = ( + make_fernet() + ) + return config + + def test_crypto_types(self): + self.assertIsInstance(self.pg_manager.crypto, FernetEncryption) + self.assertIsInstance( + self.pg_manager.checkpoints.crypto, + FernetEncryption, + ) + # This needs to be removed or else we'll run the main IPython tests as well. 
del APITest diff --git a/pgcontents/tests/test_pgmanager.py b/pgcontents/tests/test_pgmanager.py index f4d7a65..7821b54 100644 --- a/pgcontents/tests/test_pgmanager.py +++ b/pgcontents/tests/test_pgmanager.py @@ -18,15 +18,18 @@ from __future__ import unicode_literals from base64 import b64encode +from cryptography.fernet import Fernet from itertools import combinations from pgcontents.pgmanager import PostgresContentsManager from .utils import ( assertRaisesHTTPError, clear_test_db, + make_fernet, TEST_DB_URL, remigrate_test_schema, ) +from ..crypto import FernetEncryption from ..utils.ipycompat import TestContentsManager @@ -41,9 +44,11 @@ def tearDownClass(cls): pass def setUp(self): + self.crypto = make_fernet() self.contents_manager = PostgresContentsManager( user_id='test', db_url=TEST_DB_URL, + crypto=self.crypto, ) self.contents_manager.ensure_user() self.contents_manager.ensure_root_directory() @@ -55,7 +60,8 @@ def set_pgmgr_attribute(self, name, value): """ Overridable method for setting attributes on our pgmanager. - This exists so that HybridContentsManager can use + This exists so that we can re-use the tests here in + test_hybrid_manager. """ setattr(self.contents_manager, name, value) @@ -207,11 +213,16 @@ def test_rename_directory(self): def test_max_file_size(self): cm = self.contents_manager - max_size = 68 + max_size = 120 self.set_pgmgr_attribute('max_file_size_bytes', max_size) - good = 'a' * 51 - self.assertEqual(len(b64encode(good.encode('utf-8'))), max_size) + def size_in_db(s): + return len(self.crypto.encrypt(b64encode(s.encode('utf-8')))) + + # max_file_size_bytes should be based on the size in the database, not + # the size of the input. + good = 'a' * 10 + self.assertEqual(size_in_db(good), max_size) cm.save( model={ 'content': good, @@ -223,8 +234,8 @@ def test_max_file_size(self): result = cm.get('good.txt') self.assertEqual(result['content'], good) - bad = 'a' * 52 - self.assertGreater(len(b64encode(bad.encode('utf-8'))), max_size) + bad = 'a' * 30 + self.assertGreater(size_in_db(bad), max_size) with assertRaisesHTTPError(self, 413): cm.save( model={ @@ -235,6 +246,42 @@ def test_max_file_size(self): path='bad.txt', ) + def test_changing_crypto_disables_ability_to_read(self): + cm = self.contents_manager + + _, _, nb_path = self.new_notebook() + nb_model = cm.get(nb_path) + + file_path = 'file.txt' + cm.save( + model={ + 'content': 'not encrypted', + 'format': 'text', + 'type': 'file', + }, + path=file_path, + ) + file_model = cm.get(file_path) + + alt_key = b64encode(b'fizzbuzz' * 4) + self.set_pgmgr_attribute('crypto', FernetEncryption(Fernet(alt_key))) + + with assertRaisesHTTPError(self, 500): + cm.get(nb_path) + + with assertRaisesHTTPError(self, 500): + cm.get(file_path) + + # Restore the original crypto instance and verify that we can still + # decrypt. 
+ self.set_pgmgr_attribute('crypto', self.crypto) + + decrypted_nb_model = cm.get(nb_path) + self.assertEqual(nb_model, decrypted_nb_model) + + decrypted_file_model = cm.get(file_path) + self.assertEqual(file_model, decrypted_file_model) + def test_relative_paths(self): cm = self.contents_manager diff --git a/pgcontents/tests/test_synchronization.py b/pgcontents/tests/test_synchronization.py index cb7cf1c..1afbde0 100644 --- a/pgcontents/tests/test_synchronization.py +++ b/pgcontents/tests/test_synchronization.py @@ -18,6 +18,7 @@ from six import iteritems from ..checkpoints import PostgresCheckpoints +from ..crypto import NoEncryption from .utils import ( clear_test_db, _norm_unicode, @@ -95,6 +96,7 @@ def test_download_checkpoints(self): self.checkpoints.db_url, td, user='test', + crypto=NoEncryption(), ) fm = FileContentsManager(root_dir=td) diff --git a/pgcontents/tests/utils.py b/pgcontents/tests/utils.py index ea6480b..994e654 100644 --- a/pgcontents/tests/utils.py +++ b/pgcontents/tests/utils.py @@ -4,6 +4,7 @@ """ from __future__ import unicode_literals from contextlib import contextmanager +from cryptography.fernet import Fernet from getpass import getuser from itertools import starmap import posixpath @@ -15,6 +16,7 @@ from tornado.web import HTTPError from ..api_utils import api_path_join +from ..crypto import FernetEncryption from ..schema import metadata from ..utils.ipycompat import ( new_code_cell, @@ -29,6 +31,10 @@ ) +def make_fernet(): + return FernetEncryption(Fernet(Fernet.generate_key())) + + def _norm_unicode(s): """Normalize unicode strings""" return normalize('NFC', py3compat.cast_unicode(s)) diff --git a/pgcontents/utils/ipycompat.py b/pgcontents/utils/ipycompat.py index 9342207..f8c0e67 100644 --- a/pgcontents/utils/ipycompat.py +++ b/pgcontents/utils/ipycompat.py @@ -37,6 +37,7 @@ ) from IPython.nbformat.v4.rwbase import strip_transient from IPython.utils.traitlets import ( + Any, Bool, Dict, Instance, @@ -71,6 +72,7 @@ ) from nbformat.v4.rwbase import strip_transient from traitlets import ( + Any, Bool, Dict, Instance, @@ -81,6 +83,7 @@ __all__ = [ 'APITest', + 'Any', 'Bool', 'Checkpoints', 'Config', diff --git a/pgcontents/utils/sync.py b/pgcontents/utils/sync.py index 5498d18..1995b8f 100644 --- a/pgcontents/utils/sync.py +++ b/pgcontents/utils/sync.py @@ -24,7 +24,7 @@ def create_user(db_url, user): ) -def download_checkpoints(db_url, directory, user): +def download_checkpoints(db_url, directory, user, crypto): """ Download users' most recent checkpoints to the given directory. """ @@ -37,6 +37,7 @@ def download_checkpoints(db_url, directory, user): db_url=db_url, user_id=user, create_user_on_startup=False, + crypto=crypto, ) cp_mgr.dump(contents_mgr) print("Done") diff --git a/requirements.txt b/requirements.txt index d476c99..babf0b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ SQLAlchemy>=1.0.5 alembic>=0.7.6 click>=3.3 +cryptography>=1.4 psycopg2>=2.6.1 requests>=2.7.0 six>=1.9.0 From a333ef49875753060dba6a6e20876b6819ccc560 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 20 Jun 2016 11:19:56 -0400 Subject: [PATCH 3/7] DOC: Note usage of MultiFernet. 
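
``FernetEncryption`` only requires an object exposing ``encrypt`` and
``decrypt``, so a ``cryptography.fernet.MultiFernet`` can be passed wherever
a plain ``Fernet`` is expected. A rough sketch of key rotation (the key
names below are illustrative, not part of this change):

    from cryptography.fernet import Fernet, MultiFernet
    from pgcontents.crypto import FernetEncryption

    # MultiFernet encrypts with the first key and tries each key in order
    # when decrypting, so new writes use new_key while rows written under
    # old_key stay readable until they are re-encrypted.
    crypto = FernetEncryption(
        MultiFernet([Fernet(new_key), Fernet(old_key)])
    )
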
--- pgcontents/crypto.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pgcontents/crypto.py b/pgcontents/crypto.py index 6998b98..9f21a44 100644 --- a/pgcontents/crypto.py +++ b/pgcontents/crypto.py @@ -39,6 +39,15 @@ class FernetEncryption(object): ------- encrypt : callable[bytes -> bytes] decrypt : callable[bytes -> bytes] + + Notes + ----- + ``cryptography.fernet.MultiFernet`` can be used instead of a vanilla + ``Fernet`` to allow zero-downtime key rotation. + + See Also + -------- + :func:`pgcontents.utils.sync.reencrypt_user` """ __slots__ = ('_fernet',) From 4a2069eeaa22416789e46773d122bfc563cf538a Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Sat, 18 Jun 2016 18:56:34 -0400 Subject: [PATCH 4/7] ENH: Replace checkpoint download with re-encryption utilities. --- bin/pgcontents | 36 ---- pgcontents/checkpoints.py | 35 +--- pgcontents/managerbase.py | 2 +- pgcontents/pgmanager.py | 4 +- pgcontents/query.py | 114 ++++++++++-- pgcontents/tests/test_synchronization.py | 227 +++++++++++------------ pgcontents/utils/sync.py | 116 +++++++----- tox.ini | 2 + 8 files changed, 280 insertions(+), 256 deletions(-) diff --git a/bin/pgcontents b/bin/pgcontents index 3c8d411..1111ba9 100755 --- a/bin/pgcontents +++ b/bin/pgcontents @@ -1,7 +1,6 @@ #!/usr/bin/env python from getpass import getuser from os import getcwd -from os.path import join import subprocess from textwrap import dedent @@ -16,10 +15,6 @@ from pgcontents.utils.migrate import ( temp_alembic_ini, upgrade, ) -from pgcontents.utils.sync import ( - checkpoint_all, - download_checkpoints, -) @click.group(context_settings=dict(help_option_names=['-h', '--help'])) @@ -114,36 +109,5 @@ def gen_migration(db_url): ) -@main.command('download_checkpoints') -@_db_url -@_directory -@_users -def _download_checkpoints(db_url, directory, users): - """ - Download checkpoints to a directory. - """ - users = users.split(',') - if len(users) == 1: - download_checkpoints(db_url, directory, users[0]) - else: - for user in users: - download_checkpoints(db_url, join(directory, user), user) - - -@main.command('checkpoint_all') -@_db_url -@_directory -@_users -def _checkpoint_all(db_url, directory, users): - """ - Upload a checkpoint for every file in a directory. 
- """ - users = users.split(',') - if len(users) == 1: - checkpoint_all(db_url, directory, users[0]) - else: - for user in users: - checkpoint_all(db_url, join(directory, user), user) - if __name__ == "__main__": main() diff --git a/pgcontents/checkpoints.py b/pgcontents/checkpoints.py index 324fba5..9c8aef1 100644 --- a/pgcontents/checkpoints.py +++ b/pgcontents/checkpoints.py @@ -6,7 +6,6 @@ from .api_utils import ( _decode_unknown_from_base64, outside_root_to_404, - prefix_dirs, reads_base64, to_b64, writes_base64, @@ -16,7 +15,6 @@ delete_remote_checkpoints, delete_single_remote_checkpoint, get_remote_checkpoint, - latest_remote_checkpoints, list_remote_checkpoints, move_remote_checkpoints, purge_remote_checkpoints, @@ -77,7 +75,7 @@ def delete_checkpoint(self, checkpoint_id, path): db, self.user_id, path, checkpoint_id, ) - def _get_checkpoint(self, checkpoint_id, path): + def get_checkpoint_content(self, checkpoint_id, path): """Get the content of a checkpoint.""" with self.engine.begin() as db: return get_remote_checkpoint( @@ -90,7 +88,7 @@ def _get_checkpoint(self, checkpoint_id, path): @outside_root_to_404 def get_notebook_checkpoint(self, checkpoint_id, path): - b64_content = self._get_checkpoint(checkpoint_id, path) + b64_content = self.get_checkpoint_content(checkpoint_id, path) return { 'type': 'notebook', 'content': reads_base64(b64_content), @@ -98,7 +96,7 @@ def get_notebook_checkpoint(self, checkpoint_id, path): @outside_root_to_404 def get_file_checkpoint(self, checkpoint_id, path): - b64_content = self._get_checkpoint(checkpoint_id, path) + b64_content = self.get_checkpoint_content(checkpoint_id, path) content, format = _decode_unknown_from_base64(path, b64_content) return { 'type': 'file', @@ -135,30 +133,3 @@ def purge_db(self): """ with self.engine.begin() as db: purge_remote_checkpoints(db, self.user_id) - - def dump(self, contents_mgr): - """ - Synchronize the state of our database with the specified - ContentsManager. - - Gets the most recent checkpoint for each file and passes it to the - supplied ContentsManager to be saved. 
- """ - with self.engine.begin() as db: - records = latest_remote_checkpoints(db, self.user_id) - for record in records: - path = record['path'] - if not path.endswith('.ipynb'): - self.log.warn('Ignoring non-notebook file: {}', path) - continue - for dirname in prefix_dirs(path): - self.log.info("Ensuring directory [%s]" % dirname) - contents_mgr.save( - model={'type': 'directory'}, - path=dirname, - ) - self.log.info("Writing notebook [%s]" % path) - contents_mgr.save( - self.get_notebook_checkpoint(record['id'], path), - path, - ) diff --git a/pgcontents/managerbase.py b/pgcontents/managerbase.py index e1f97aa..088ef54 100644 --- a/pgcontents/managerbase.py +++ b/pgcontents/managerbase.py @@ -58,7 +58,7 @@ class PostgresManagerMixin(HasTraits): engine = Instance(Engine) def _engine_default(self): - return create_engine(self.db_url) + return create_engine(self.db_url, echo=True) def __init__(self, *args, **kwargs): super(PostgresManagerMixin, self).__init__(*args, **kwargs) diff --git a/pgcontents/pgmanager.py b/pgcontents/pgmanager.py index 3faf756..75246d6 100644 --- a/pgcontents/pgmanager.py +++ b/pgcontents/pgmanager.py @@ -66,14 +66,14 @@ class PostgresContentsManager(PostgresManagerMixin, ContentsManager): """ create_directory_on_startup = Bool( config=True, - help="Create a user for user_id automatically?", + help="Create a root directory automatically?", ) def _checkpoints_class_default(self): return PostgresCheckpoints def _checkpoints_kwargs_default(self): - kw = super(PostgresContentsManager, self)._checkpoint_kwargs_default() + kw = super(PostgresContentsManager, self)._checkpoints_kwargs_default() kw.update({ 'create_user_on_startup': self.create_user_on_startup, 'crypto': self.crypto, diff --git a/pgcontents/query.py b/pgcontents/query.py index 4e6b9cd..15c6c85 100644 --- a/pgcontents/query.py +++ b/pgcontents/query.py @@ -76,6 +76,9 @@ def unused_decrypt_func(s): # Users # ===== +def list_users(db): + return db.execute(select([users.c.id])) + def ensure_db_user(db, user_id): """ @@ -664,6 +667,9 @@ def save_remote_checkpoint(db, content, encrypt_func, max_size_bytes): + # IMPORTANT NOTE: Read the long comment at the top of + # ``reencrypt_user_content`` before you change this function. + content = preprocess_incoming_content( content, encrypt_func, @@ -694,25 +700,99 @@ def purge_remote_checkpoints(db, user_id): ) -def latest_remote_checkpoints(db, user_id): +########################## +# Reencryption Utilities # +########################## +def reencrypt_row_content(db, + table, + row_id, + decrypt_func, + encrypt_func, + logger): """ - Get the latest version of each file for the user. + Re-encrypt a row from ``table`` with ``id`` of ``row_id``. """ - query_fields = [ - remote_checkpoints.c.id, - remote_checkpoints.c.path, - ] + q = select([table.c.content]).where(table.c.id == row_id) + [(content,)] = db.execute(q) - query = select(query_fields).where( - remote_checkpoints.c.user_id == user_id, - ).order_by( - remote_checkpoints.c.path, - desc(remote_checkpoints.c.last_modified), - ).distinct( - remote_checkpoints.c.path, + logger.info("Begin encrypting %s row %s.", table.name, row_id) + db.execute( + table + .update() + .where(table.c.id == row_id) + .values(content=encrypt_func(decrypt_func(content))) + ) + logger.info("Done encrypting %s row %s.", table.name, row_id) + + +def select_file_ids(db, user_id): + """ + Get all file ids for a user. 
+ """ + return list( + db.execute( + select([files.c.id]) + .where(files.c.user_id == user_id) + ) ) - results = db.execute(query) - return ( - to_dict_no_content(query_fields, row) - for row in results + + +def select_remote_checkpoint_ids(db, user_id): + """ + Get all file ids for a user. + """ + return list( + db.execute( + select([remote_checkpoints.c.id]) + .where(remote_checkpoints.c.user_id == user_id) + ) ) + + +def reencrypt_user_content(engine, + user_id, + old_decrypt_func, + new_encrypt_func, + logger): + """ + Re-encrypt all of the files and checkpoints for a single user. + """ + logger.info("Begin re-encryption for user %s", user_id) + with engine.begin() as db: + # NOTE: Doing both of these operations in one transaction depends for + # correctness on the fact that the creation of new checkpoints always + # involves writing new data into the database from Python, rather than + # simply copying data inside the DB. + + # If we change checkpoint creation so that it does an in-database copy, + # then we need to split this transaction to ensure that + # file-reencryption is complete before checkpoint-reencryption starts. + + # If that doesn't happen, it will be possible for a user to create a + # new checkpoint in a transaction that hasn't seen the completed + # file-reencryption process, but we might not see that checkpoint here, + # which means that we would never update the content of that checkpoint + # to the new encryption key. + + logger.info("Re-encrypting files for %s", user_id) + for (file_id,) in select_file_ids(db, user_id): + reencrypt_row_content( + db, + files, + file_id, + old_decrypt_func, + new_encrypt_func, + logger, + ) + + logger.info("Re-encrypting checkpoints for %s", user_id) + for (cp_id,) in select_remote_checkpoint_ids(db, user_id): + reencrypt_row_content( + db, + remote_checkpoints, + cp_id, + old_decrypt_func, + new_encrypt_func, + logger, + ) + logger.info("Finished re-encryption for user %s", user_id) diff --git a/pgcontents/tests/test_synchronization.py b/pgcontents/tests/test_synchronization.py index 1afbde0..8d8567e 100644 --- a/pgcontents/tests/test_synchronization.py +++ b/pgcontents/tests/test_synchronization.py @@ -2,56 +2,33 @@ Tests for synchronization tools. 
""" from __future__ import unicode_literals +from base64 import b64encode +from logging import Logger from unittest import TestCase -try: - from nbformat.v4 import new_markdown_cell - from nbformat.v4.rwbase import strip_transient - from notebook.services.contents.filemanager import \ - FileContentsManager -except ImportError: - from IPython.nbformat.v4 import new_markdown_cell - from IPython.nbformat.v4.rwbase import strip_transient - from IPython.html.services.contents.filemanager import \ - FileContentsManager -from IPython.utils.tempdir import TemporaryDirectory -from six import iteritems - -from ..checkpoints import PostgresCheckpoints -from ..crypto import NoEncryption +from cryptography.fernet import Fernet +from sqlalchemy import create_engine + +from pgcontents import PostgresContentsManager +from pgcontents.crypto import FernetEncryption, NoEncryption +from pgcontents.utils.ipycompat import new_markdown_cell + from .utils import ( + assertRaisesHTTPError, clear_test_db, - _norm_unicode, remigrate_test_schema, populate, TEST_DB_URL, ) -from ..utils.sync import ( - checkpoint_all, - download_checkpoints, -) - -setup_module = remigrate_test_schema +from ..utils.sync import reencrypt_all_users -class TestUploadDownload(TestCase): +class TestReEncryption(TestCase): def setUp(self): - - self.td = TemporaryDirectory() - self.checkpoints = PostgresCheckpoints( - user_id='test', - db_url=TEST_DB_URL, - ) - self.contents = FileContentsManager( - root_dir=self.td.name, - checkpoints=self.checkpoints, - ) - - self.checkpoints.ensure_user() + remigrate_test_schema() def tearDown(self): - self.td.cleanup() clear_test_db() def add_markdown_cell(self, path): @@ -65,98 +42,108 @@ def add_markdown_cell(self, path): self.contents.save(model, path=path) return model - def test_download_checkpoints(self): + def test_reencryption(self): """ - Create two checkpoints for two notebooks, then call - download_checkpoints. - - Assert that we get the correct version of both notebooks. + Create two unencrypted notebooks and a file, create checkpoints for + each, then encrypt and check that content is unchanged, then re-encrypt + and check the same. """ - self.contents.new({'type': 'directory'}, 'subdir') - paths = ('a.ipynb', 'subdir/a.ipynb') - expected_content = {} - for path in paths: - # Create and checkpoint. - self.contents.new(path=path) + db_url = TEST_DB_URL + user_id = 'test_reencryption' + + no_crypto = NoEncryption() + no_crypto_manager = PostgresContentsManager( + user_id=user_id, + db_url=db_url, + crypto=no_crypto, + create_user_on_startup=True, + ) - self.contents.create_checkpoint(path) + key1 = b'fizzbuzz' * 4 + crypto1 = FernetEncryption(Fernet(b64encode(key1))) + manager1 = PostgresContentsManager( + user_id=user_id, + db_url=db_url, + crypto=crypto1, + ) - model = self.add_markdown_cell(path) - self.contents.create_checkpoint(path) + key2 = key1[::-1] + crypto2 = FernetEncryption(Fernet(b64encode(key2))) + manager2 = PostgresContentsManager( + user_id=user_id, + db_url=db_url, + crypto=crypto2, + ) - # Assert greater because FileContentsManager creates a checkpoint - # on creation, but this isn't part of the spec. - self.assertGreater(len(self.contents.list_checkpoints(path)), 2) + # Populate an unencrypted user. + paths = populate(no_crypto_manager) - # Store the content to verify correctness after download. - expected_content[path] = model['content'] + original_content = {} + for path in paths: + # Create a checkpoint of the original content and store what we + # expect it to look like. 
+ no_crypto_manager.create_checkpoint(path) + original_content[path] = no_crypto_manager.get(path)['content'] - with TemporaryDirectory() as td: - download_checkpoints( - self.checkpoints.db_url, - td, - user='test', - crypto=NoEncryption(), + updated_content = {} + for path in paths: + # Create a new version of each notebook with a cell appended. + model = no_crypto_manager.get(path=path) + model['content'].cells.append( + new_markdown_cell('Created by test: ' + path) ) + no_crypto_manager.save(model, path=path) - fm = FileContentsManager(root_dir=td) - root_entries = sorted(m['path'] for m in fm.get('')['content']) - self.assertEqual(root_entries, ['a.ipynb', 'subdir']) - subdir_entries = sorted( - m['path'] for m in fm.get('subdir')['content'] - ) - self.assertEqual(subdir_entries, ['subdir/a.ipynb']) - for path in paths: - content = fm.get(path)['content'] - self.assertEqual(expected_content[path], content) + # Store the updated content. + updated_content[path] = no_crypto_manager.get(path)['content'] - def test_checkpoint_all(self): - """ - Test that checkpoint_all correctly makes a checkpoint for all files. - """ - paths = populate(self.contents) - original_content_minus_trust = { - # Remove metadata that we expect to have dropped - path: strip_transient(self.contents.get(path)['content']) - for path in paths - } - - original_cps = {} - for path in paths: - # Create a checkpoint, then update the file. - original_cps[path] = self.contents.create_checkpoint(path) - self.add_markdown_cell(path) - - # Verify that we still have the old version checkpointed. - cp_content = { - path: self.checkpoints.get_notebook_checkpoint( - cp['id'], - path, - )['content'] - for path, cp in iteritems(original_cps) - } - self.assertEqual(original_content_minus_trust, cp_content) - - new_cps = checkpoint_all( - self.checkpoints.db_url, - self.td.name, - self.checkpoints.user_id, - ) + # Create a checkpoint of the new content. + no_crypto_manager.create_checkpoint(path) - new_cp_content = { - path: self.checkpoints.get_notebook_checkpoint( - cp['id'], - path, - )['content'] - for path, cp in iteritems(new_cps) - } - for path, new_content in iteritems(new_cp_content): - old_content = original_content_minus_trust[_norm_unicode(path)] - self.assertEqual( - new_content['cells'][:-1], - old_content['cells'], - ) - self.assertEqual( - new_content['cells'][-1], - new_markdown_cell('Created by test: ' + _norm_unicode(path)), - ) + def check_path_content(path, mgr, expected): + retrieved = mgr.get(path)['content'] + self.assertEqual(retrieved, expected[path]) + + def check_reencryption(old, new): + for path in paths: + # We should no longer be able to retrieve notebooks from the + # no-crypto manager. + with assertRaisesHTTPError(self, 500): + old.get(path) + + # The new manager should read the latest version of each file. + check_path_content(path, new, updated_content) + + # We should have two checkpoints available, one from the + # original version of the file, and one for the updated + # version. + (new_cp, old_cp) = new.list_checkpoints(path) + self.assertGreater( + new_cp['last_modified'], + old_cp['last_modified'], + ) + + # The old checkpoint should restore us to the original state. + new.restore_checkpoint(old_cp['id'], path) + check_path_content(path, new, original_content) + + # The new checkpoint should put us back into our updated state. + # state. 
+ new.restore_checkpoint(new_cp['id'], path) + check_path_content(path, new, updated_content) + + engine = create_engine(db_url) + logger = Logger('Reencryption Testing') + + no_crypto_factory = {user_id: no_crypto}.__getitem__ + crypto1_factory = {user_id: crypto1}.__getitem__ + crypto2_factory = {user_id: crypto2}.__getitem__ + + reencrypt_all_users(engine, no_crypto_factory, crypto1_factory, logger) + check_reencryption(no_crypto_manager, manager1) + + reencrypt_all_users(engine, crypto1_factory, crypto2_factory, logger) + check_reencryption(manager1, manager2) + + reencrypt_all_users(engine, crypto2_factory, no_crypto_factory, logger) + check_reencryption(manager2, no_crypto_manager) diff --git a/pgcontents/utils/sync.py b/pgcontents/utils/sync.py index 1995b8f..d1aaf9c 100644 --- a/pgcontents/utils/sync.py +++ b/pgcontents/utils/sync.py @@ -6,11 +6,11 @@ unicode_literals, ) - -from IPython.utils.path import ensure_dir_exists - from ..checkpoints import PostgresCheckpoints -from ..utils.ipycompat import FileContentsManager +from ..query import ( + list_users, + reencrypt_user_content, +) def create_user(db_url, user): @@ -24,50 +24,6 @@ def create_user(db_url, user): ) -def download_checkpoints(db_url, directory, user, crypto): - """ - Download users' most recent checkpoints to the given directory. - """ - print("Synchronizing user {user} to {directory}".format( - user=user, directory=directory, - )) - ensure_dir_exists(directory) - contents_mgr = FileContentsManager(root_dir=directory) - cp_mgr = PostgresCheckpoints( - db_url=db_url, - user_id=user, - create_user_on_startup=False, - crypto=crypto, - ) - cp_mgr.dump(contents_mgr) - print("Done") - - -def checkpoint_all(db_url, directory, user): - """ - Upload the current state of a directory for each user. - """ - print("Checkpointing directory {directory} for user {user}".format( - directory=directory, user=user, - )) - - cp_mgr = PostgresCheckpoints( - db_url=db_url, - user_id=user, - create_user_on_startup=False, - ) - contents_mgr = FileContentsManager( - root_dir=directory, - checkpoints=cp_mgr, - ) - cps = {} - for dirname, subdirs, files in walk(contents_mgr): - for fname in files: - if fname.endswith('.ipynb'): - cps[fname] = contents_mgr.create_checkpoint(fname) - return cps - - def _separate_dirs_files(models): """ Split an iterable of models into a list of file paths and a list of @@ -108,3 +64,67 @@ def walk_dirs(mgr, dirs): if dirs: for entry in walk_dirs(mgr, dirs): yield entry + + +def walk_files(mgr): + """ + Iterate over all files visible to ``mgr``. + """ + for dir_, subdirs, files in walk_files(mgr): + for file_ in files: + yield file_ + + +def all_user_ids(engine): + """ + Get a list of user_ids from an engine. + """ + with engine.begin() as db: + return [row[0] for row in list_users(db)] + + +def reencrypt_all_users(engine, + old_crypto_factory, + new_crypto_factory, + logger): + """ + Re-encrypt data for all users. + + Parameters + ---------- + engine : SQLAlchemy.engine + Engine encapsulating database connections. + old_crypto_factory : function[str -> Any] + A function from user_id to an object providing the interface required + by PostgresContentsManager.crypto. Results of this will be used for + decryption of existing database content. + new_crypto_factory : function[str -> Any] + A function from user_id to an object providing the interface required + by PostgresContentsManager.crypto. Results of this will be used for + re-encryption of database content. 
+ logger : logging.Logger, optional + A logger to user during re-encryption. + """ + logger.info("Beginning re-encryption for all users.") + for user_id in all_user_ids(engine): + reencrypt_user( + engine, + user_id, + old_crypto=old_crypto_factory(user_id), + new_crypto=new_crypto_factory(user_id), + logger=logger, + ) + logger.info("Finished re-encryption for all users.") + + +def reencrypt_user(engine, user_id, old_crypto, new_crypto, logger): + """ + Re-encrypt all files and checkpoints for a single user. + """ + reencrypt_user_content( + engine, + user_id, + old_crypto.decrypt, + new_crypto.encrypt, + logger=logger, + ) diff --git a/tox.ini b/tox.ini index 83000c3..dabfcb8 100644 --- a/tox.ini +++ b/tox.ini @@ -10,8 +10,10 @@ deps = py{27,34}-ipython3: .[test,ipy3] py{27,34}-ipython4: .[test,ipy4] flake8: flake8 + notest: .[ipy4] commands = py{27,34}-ipython{3,4}: -createdb pgcontents_testing py{27,34}-ipython{3,4}: nosetests pgcontents/tests flake8: flake8 pgcontents + notest: python -c 'from pgcontents.utils.ipycompat import *' From 23b0edc3f838fa975a34951fb7307c3e74f1327f Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 23 Jun 2016 12:39:18 -0400 Subject: [PATCH 5/7] ENH: Make re-encryption reentrant. --- pgcontents/crypto.py | 53 +++++++++++++++++ pgcontents/managerbase.py | 2 +- pgcontents/tests/test_synchronization.py | 21 ++++++- pgcontents/utils/sync.py | 76 ++++++++++++++++++++++-- 4 files changed, 143 insertions(+), 9 deletions(-) diff --git a/pgcontents/crypto.py b/pgcontents/crypto.py index 9f21a44..9925cd1 100644 --- a/pgcontents/crypto.py +++ b/pgcontents/crypto.py @@ -74,3 +74,56 @@ def __deepcopy__(self, memo): # be deepcopy-able. Cryptography's Fernet objects aren't deepcopy-able, # so we copy our underlying state to a new FernetEncryption object. return FernetEncryption(self._fernet) + + +class FallbackCrypto(object): + """ + Notebook encryption that accepts a list of crypto instances and decrypts by + trying them in order. + + Sub-cryptos should raise ``CorruptedFile`` if they're unable to decrypt an + input. + + This is conceptually similar to the technique used by + ``cryptography.fernet.MultiFernet`` for implementing key rotation. + + Parameters + ---------- + cryptos : list[object] + A sequence of cryptos to use for decryption. cryptos[0] will always be + used for encryption. + + Methods + ------- + encrypt : callable[bytes -> bytes] + decrypt : callable[bytes -> bytes] + + Notes + ----- + Since NoEncryption will always succeed, it is only supported as the last + entry in ``cryptos``. Passing a list with a NoEncryption not in the last + location will raise a ValueError. + """ + __slots__ = ('_cryptos',) + + def __init__(self, cryptos): + # Only the last crypto can be a ``NoEncryption``. + for c in cryptos[:-1]: + if isinstance(c, NoEncryption): + raise ValueError( + "NoEncryption is only supported as the last fallback." 
+ ) + + self._cryptos = cryptos + + def encrypt(self, s): + return self._cryptos[0].encrypt(s) + + def decrypt(self, s): + errors = [] + for c in self._cryptos: + try: + return c.decrypt(s) + except CorruptedFile as e: + errors.append(e) + raise CorruptedFile(errors) diff --git a/pgcontents/managerbase.py b/pgcontents/managerbase.py index 088ef54..fc75604 100644 --- a/pgcontents/managerbase.py +++ b/pgcontents/managerbase.py @@ -58,7 +58,7 @@ class PostgresManagerMixin(HasTraits): engine = Instance(Engine) def _engine_default(self): - return create_engine(self.db_url, echo=True) + return create_engine(self.db_url, echo=False) def __init__(self, *args, **kwargs): super(PostgresManagerMixin, self).__init__(*args, **kwargs) diff --git a/pgcontents/tests/test_synchronization.py b/pgcontents/tests/test_synchronization.py index 8d8567e..a851501 100644 --- a/pgcontents/tests/test_synchronization.py +++ b/pgcontents/tests/test_synchronization.py @@ -20,7 +20,10 @@ populate, TEST_DB_URL, ) -from ..utils.sync import reencrypt_all_users +from ..utils.sync import ( + reencrypt_all_users, + unencrypt_all_users, +) class TestReEncryption(TestCase): @@ -145,5 +148,19 @@ def check_reencryption(old, new): reencrypt_all_users(engine, crypto1_factory, crypto2_factory, logger) check_reencryption(manager1, manager2) - reencrypt_all_users(engine, crypto2_factory, no_crypto_factory, logger) + with self.assertRaises(ValueError): + # Using reencrypt_all_users with a no-encryption target isn't + # supported. + reencrypt_all_users( + engine, + crypto2_factory, + no_crypto_factory, + logger, + ) + # There should have been no changes from the failed attempt. + check_reencryption(manager1, manager2) + + # Unencrypt and verify that we can now read everything with the no + # crypto manager. + unencrypt_all_users(engine, crypto2_factory, logger) check_reencryption(manager2, no_crypto_manager) diff --git a/pgcontents/utils/sync.py b/pgcontents/utils/sync.py index d1aaf9c..d58e7e0 100644 --- a/pgcontents/utils/sync.py +++ b/pgcontents/utils/sync.py @@ -7,6 +7,7 @@ ) from ..checkpoints import PostgresCheckpoints +from ..crypto import FallbackCrypto from ..query import ( list_users, reencrypt_user_content, @@ -90,6 +91,19 @@ def reencrypt_all_users(engine, """ Re-encrypt data for all users. + This function is idempotent, meaning that it should be possible to apply + the same re-encryption process multiple times without having any effect on + the database. Idempotency is achieved by first attempting to decrypt with + the old crypto and falling back to the new crypto on failure. + + An important consequence of this strategy is that **decrypting** a database + is not supported with this function, because ``NoEncryption.decrypt`` + always succeeds. To decrypt an already-encrypted database, use + ``unencrypt_all_users`` instead. + + It is, however, possible to perform an initial encryption of a database by + passing a function returning a ``NoEncryption`` as ``old_crypto_factory``. + Parameters ---------- engine : SQLAlchemy.engine @@ -102,12 +116,20 @@ def reencrypt_all_users(engine, A function from user_id to an object providing the interface required by PostgresContentsManager.crypto. Results of this will be used for re-encryption of database content. + + This **must not** return instances of ``NoEncryption``. Use + ``unencrypt_all_users`` if you want to unencrypt a database. logger : logging.Logger, optional A logger to user during re-encryption. 
+ + See Also + -------- + reencrypt_user + unencrypt_all_users """ logger.info("Beginning re-encryption for all users.") for user_id in all_user_ids(engine): - reencrypt_user( + reencrypt_single_user( engine, user_id, old_crypto=old_crypto_factory(user_id), @@ -117,14 +139,56 @@ def reencrypt_all_users(engine, logger.info("Finished re-encryption for all users.") -def reencrypt_user(engine, user_id, old_crypto, new_crypto, logger): +def reencrypt_single_user(engine, user_id, old_crypto, new_crypto, logger): """ Re-encrypt all files and checkpoints for a single user. """ + # Use FallbackCrypto so that we're re-entrant if we halt partway through. + crypto = FallbackCrypto([new_crypto, old_crypto]) + + reencrypt_user_content( + engine=engine, + user_id=user_id, + old_decrypt_func=crypto.decrypt, + new_encrypt_func=crypto.encrypt, + logger=logger, + ) + + +def unencrypt_all_users(engine, old_crypto_factory, logger): + """ + Unencrypt data for all users. + + Parameters + ---------- + engine : SQLAlchemy.engine + Engine encapsulating database connections. + old_crypto_factory : function[str -> Any] + A function from user_id to an object providing the interface required + by PostgresContentsManager.crypto. Results of this will be used for + decryption of existing database content. + logger : logging.Logger, optional + A logger to user during re-encryption. + """ + logger.info("Beginning re-encryption for all users.") + for user_id in all_user_ids(engine): + unencrypt_single_user( + engine=engine, + user_id=user_id, + old_crypto=old_crypto_factory(user_id), + logger=logger, + ) + logger.info("Finished re-encryption for all users.") + + +def unencrypt_single_user(engine, user_id, old_crypto, logger): + """ + Unencrypt all files and checkpoints for a single user. + """ reencrypt_user_content( - engine, - user_id, - old_crypto.decrypt, - new_crypto.encrypt, + engine=engine, + user_id=user_id, + old_decrypt_func=old_crypto.decrypt, + new_encrypt_func=lambda s: s, logger=logger, ) From 144f195da54aea7c74db024783b31454fe39ab82 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 23 Jun 2016 12:42:34 -0400 Subject: [PATCH 6/7] TEST: Explicitly test re-entrancy. --- pgcontents/tests/test_synchronization.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pgcontents/tests/test_synchronization.py b/pgcontents/tests/test_synchronization.py index a851501..3d10d1a 100644 --- a/pgcontents/tests/test_synchronization.py +++ b/pgcontents/tests/test_synchronization.py @@ -142,11 +142,24 @@ def check_reencryption(old, new): crypto1_factory = {user_id: crypto1}.__getitem__ crypto2_factory = {user_id: crypto2}.__getitem__ - reencrypt_all_users(engine, no_crypto_factory, crypto1_factory, logger) - check_reencryption(no_crypto_manager, manager1) + # Verify that reencryption is idempotent: + for _ in range(2): + reencrypt_all_users( + engine, + no_crypto_factory, + crypto1_factory, + logger, + ) + check_reencryption(no_crypto_manager, manager1) - reencrypt_all_users(engine, crypto1_factory, crypto2_factory, logger) - check_reencryption(manager1, manager2) + for _ in range(2): + reencrypt_all_users( + engine, + crypto1_factory, + crypto2_factory, + logger, + ) + check_reencryption(manager1, manager2) with self.assertRaises(ValueError): # Using reencrypt_all_users with a no-encryption target isn't From 7b8bad87336183f8335231d68f5612f9121b1779 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 23 Jun 2016 16:16:19 -0400 Subject: [PATCH 7/7] DOC: Remove old docstring. 
--- pgcontents/tests/test_pgcontents_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pgcontents/tests/test_pgcontents_api.py b/pgcontents/tests/test_pgcontents_api.py index 804f266..3eeffc9 100644 --- a/pgcontents/tests/test_pgcontents_api.py +++ b/pgcontents/tests/test_pgcontents_api.py @@ -293,7 +293,6 @@ def test_crypto_types(self): class EncryptedPostgresContentsAPITest(PostgresContentsAPITest): config = postgres_contents_config() - # The key for this needs to be a b64-encoded 32-byte string. config.PostgresContentsManager.crypto = make_fernet() def test_crypto_types(self):
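
A minimal sketch of how a deployment might enable the encryption support
added in this series from ``jupyter_notebook_config.py``. The environment
variable name is illustrative; any URL-safe base64-encoded 32-byte key
accepted by ``Fernet`` works:

    import os

    from cryptography.fernet import Fernet
    from pgcontents import PostgresContentsManager
    from pgcontents.crypto import FernetEncryption

    # ``get_config`` is injected by Jupyter/IPython when it loads this file.
    c = get_config()
    c.NotebookApp.contents_manager_class = PostgresContentsManager
    # Placeholder URL; the default is postgresql://<current user>@/pgcontents.
    c.PostgresContentsManager.db_url = 'postgresql://user@/pgcontents'
    # The key must be supplied out-of-band and kept stable; losing it makes
    # previously stored notebooks and files unreadable.
    key = os.environ['PGCONTENTS_FERNET_KEY'].encode()
    c.PostgresContentsManager.crypto = FernetEncryption(Fernet(key))

The same ``crypto`` object is forwarded to the manager's
``PostgresCheckpoints`` via ``_checkpoints_kwargs_default`` (PATCH 2/7), so
checkpoints are encrypted with the same key and no separate
``PostgresCheckpoints.crypto`` setting is needed in this configuration.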