diff --git a/RELEASE.md b/RELEASE.md index afec39d6a6..de34aeca18 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -7,6 +7,7 @@ * Allowed modern versions of JupyterLab and Jupyter Notebooks. * Removed setuptools dependency * Added `source_dir` explicitly in `pyproject.toml` for non-src layout project. +* `MemoryDataset` entries are now included in free outputs. ## Breaking changes to the API * Added logging about not using async mode in `SequentiallRunner` and `ParallelRunner`. diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 0570f64444..0aa9f07e13 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -94,9 +94,16 @@ def run( f"Pipeline input(s) {unsatisfied} not found in the DataCatalog" ) + # Identify MemoryDataset entries in the catalog + memory_datasets = { + ds_name + for ds_name, ds in catalog._datasets.items() + if isinstance(ds, MemoryDataset) + } + # Check if there's any output datasets that aren't in the catalog and don't match a pattern - # in the catalog. - free_outputs = pipeline.outputs() - set(registered_ds) + # in the catalog; registered MemoryDataset entries are still treated as free outputs.
+ free_outputs = pipeline.outputs() - (set(registered_ds) - memory_datasets) # Register the default dataset pattern with the catalog catalog = catalog.shallow_copy( diff --git a/tests/runner/conftest.py b/tests/runner/conftest.py index 0ce581e624..25ca233e97 100644 --- a/tests/runner/conftest.py +++ b/tests/runner/conftest.py @@ -165,3 +165,13 @@ def two_branches_crossed_pipeline(): node(identity, "ds3_B", "ds4_B", name="node4_B"), ] ) + + +@pytest.fixture +def pipeline_with_memory_datasets(): + return pipeline( + [ + node(func=identity, inputs="Input1", outputs="MemOutput1", name="node1"), + node(func=identity, inputs="Input2", outputs="MemOutput2", name="node2"), + ] + ) diff --git a/tests/runner/test_sequential_runner.py b/tests/runner/test_sequential_runner.py index 3ee297c0a7..0e28feed6d 100644 --- a/tests/runner/test_sequential_runner.py +++ b/tests/runner/test_sequential_runner.py @@ -7,7 +7,13 @@ import pytest from kedro.framework.hooks import _create_hook_manager -from kedro.io import AbstractDataset, DataCatalog, DatasetError, LambdaDataset +from kedro.io import ( + AbstractDataset, + DataCatalog, + DatasetError, + LambdaDataset, + MemoryDataset, +) from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import SequentialRunner @@ -279,3 +285,29 @@ def test_suggest_resume_scenario( hook_manager=_create_hook_manager(), ) assert re.search(expected_pattern, caplog.text) + + +class TestMemoryDatasetBehaviour: + def test_run_includes_memory_datasets(self, pipeline_with_memory_datasets): + # Create a catalog with MemoryDataset entries and inputs for the pipeline + catalog = DataCatalog( + { + "Input1": LambdaDataset(load=lambda: "data1", save=lambda data: None), + "Input2": LambdaDataset(load=lambda: "data2", save=lambda data: None), + "MemOutput1": MemoryDataset(), + "MemOutput2": MemoryDataset(), + } + ) + + # Add a regular dataset to the catalog + catalog.add("RegularOutput", 
LambdaDataset(None, None, lambda: True)) + + # Run the pipeline + output = SequentialRunner().run(pipeline_with_memory_datasets, catalog) + + # Check that MemoryDataset outputs are included in the run results + assert "MemOutput1" in output + assert "MemOutput2" in output + assert ( + "RegularOutput" not in output + ) # This output is registered in DataCatalog and so should not be in free outputs