Calling ducklake_rewrite_data_files on partitioned table throws exception "DuckLakeCompactor: Files have different hive partition path" #692

@svevang

Description

What happens?

I'm developing an incremental update pattern that deletes and inserts data into a partitioned DuckLake table.

When I call ducklake_rewrite_data_files, I get an exception: INTERNAL Error: DuckLakeCompactor: Files have different hive partition path

Here is the full stack trace:

INTERNAL Error: DuckLakeCompactor: Files have different hive partition path

Stack Trace:

0        duckdb::Exception::Exception(duckdb::ExceptionType, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) + 52
1        duckdb::InternalException::InternalException(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) + 20
2        duckdb::DuckLakeCompactor::GenerateCompactionCommand(duckdb::vector<duckdb::DuckLakeCompactionFileEntry, true, std::__1::allocator<duckdb::DuckLakeCompactionFileEntry>>) + 5232
3        duckdb::DuckLakeCompactor::GenerateCompactions(duckdb::DuckLakeTableEntry&, duckdb::vector<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>, true, std::__1::allocator<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>>>&) + 924
4        duckdb::GenerateCompaction(duckdb::ClientContext&, duckdb::DuckLakeTransaction&, duckdb::DuckLakeCatalog&, duckdb::TableFunctionBindInput&, duckdb::DuckLakeTableEntry&, duckdb::CompactionType, double, unsigned long long, duckdb::optional_idx, duckdb::optional_idx, duckdb::vector<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>, true, std::__1::allocator<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>>>&) + 204
5        duckdb::BindCompaction(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, unsigned long long, duckdb::CompactionType) + 1624
6        duckdb::RewriteFilesBind(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, unsigned long long, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>&) + 124
7        duckdb::Binder::BindTableFunctionInternal(duckdb::TableFunction&, duckdb::TableFunctionRef const&, duckdb::vector<duckdb::Value, true, std::__1::allocator<duckdb::Value>>, std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, duckdb::Value, duckdb::CaseInsensitiveStringHashFunction, duckdb::CaseInsensitiveStringEquality, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const, duckdb::Value>>>, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType>>, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>) + 368
8        duckdb::Binder::Bind(duckdb::TableFunctionRef&) + 2552
9        duckdb::Binder::Bind(duckdb::TableRef&) + 360
10       duckdb::Binder::BindNode(duckdb::SelectNode&) + 68
11       duckdb::Binder::BindNode(duckdb::QueryNode&) + 524
12       duckdb::Binder::Bind(duckdb::CallStatement&) + 736
13       duckdb::Planner::CreatePlan(duckdb::SQLStatement&) + 156
14       duckdb::ClientContext::CreatePreparedStatementInternal(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters) + 544
15       duckdb::ClientContext::CreatePreparedStatement(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters, duckdb::PreparedStatementMode) + 1048
16       duckdb::ClientContext::PendingStatementInternal(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters const&) + 132
17       duckdb::ClientContext::PendingStatementOrPreparedStatement(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::shared_ptr<duckdb::PreparedStatementData, true>&, duckdb::PendingQueryParameters const&) + 276
18       duckdb::ClientContext::PendingStatementOrPreparedStatementInternal(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::shared_ptr<duckdb::PreparedStatementData, true>&, duckdb::PendingQueryParameters const&) + 1648
19       duckdb::ClientContext::PendingQueryInternal(duckdb::ClientContextLock&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters const&, bool) + 132
20       duckdb::ClientContext::Query(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::QueryParameters) + 368
21       duckdb::Command::ExecuteQuery(duckdb::ExecuteContext&, duckdb::Connection*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, unsigned long long) const + 152
22       duckdb::Statement::ExecuteInternal(duckdb::ExecuteContext&) const + 284
23       duckdb::SQLLogicTestRunner::ExecuteCommand(duckdb::unique_ptr<duckdb::Command, std::__1::default_delete<duckdb::Command>, true>) + 112
24       duckdb::SQLLogicTestRunner::ExecuteFile(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>) + 16768
25       void testRunner<false, false>() + 1808
26       Catch::RunContext::invokeActiveTestCase() + 216
27       Catch::RunContext::runCurrentTest(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>&) + 484
28       Catch::RunContext::runTest(Catch::TestCase const&) + 320
29       Catch::Session::runInternal() + 4012
30       Catch::Session::run() + 156
31       main + 1688
32       start + 6076

The problem appears to be in how DuckLakeCompactor::GenerateCompactions handles partition groups: in the code path that handles files with outstanding deletes, an early return appears to skip the step that groups candidate files by partition, which would explain why GenerateCompactionCommand then sees files with different hive partition paths.
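
As a sanity check, a metadata query along the following lines can be run just before the failing CALL in the repro below; it lists the partition value of every live data file that still has an outstanding delete file, which are exactly the files the compactor should be grouping. The tables and columns are the ones the test below already queries, except that joining ducklake_delete_file through data_file_id is my assumption about how delete files reference their data files:

-- Diagnostic sketch: partition value of each live data file with an outstanding delete file.
-- Assumption: ducklake_delete_file references its data file via a data_file_id column.
SELECT data_file_id, p.partition_value
FROM ducklake_metadata.ducklake_data_file d
JOIN ducklake_metadata.ducklake_file_partition_value p USING (data_file_id)
JOIN ducklake_metadata.ducklake_delete_file del USING (data_file_id)
WHERE d.end_snapshot IS NULL AND del.end_snapshot IS NULL
ORDER BY p.partition_value;

If this returns rows with more than one distinct partition_value, the rewrite call has files from different partitions to deal with, which is the case the grouping step should be handling.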

To Reproduce

I generated a test to reproduce the problem. To set it up, create a new test file test/sql/rewrite_data_files/test_rewrite_partitioned_table.test with the following:

# name: test/sql/rewrite_data_files/test_rewrite_partitioned_table.test
# description: Test rewrite_data_files on partitioned tables with delete files created via MERGE
# group: [rewrite_data_files]

require ducklake

require parquet

test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db

test-env DATA_PATH __TEST_DIR__


statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/rewrite_partitioned', METADATA_CATALOG 'ducklake_metadata')

statement ok
USE ducklake

# Create a partitioned table
statement ok
CREATE TABLE partitioned(part_key INTEGER, id INTEGER, value INTEGER);

statement ok
ALTER TABLE partitioned SET PARTITIONED BY (part_key);

# Initial data load into multiple partitions
statement ok
INSERT INTO partitioned VALUES (1, 1, 10), (1, 2, 20);

statement ok
INSERT INTO partitioned VALUES (2, 1, 100), (2, 2, 200);

# We should have 2 data files (one per INSERT, each covering a single partition)
query I
SELECT COUNT(*) FROM GLOB('${DATA_PATH}/rewrite_partitioned/**/*.parquet')
----
2

# Simulate incremental load with MERGE (upsert pattern)
# This updates existing rows and inserts new ones, creating delete files
statement ok
CREATE TEMP TABLE _merge_source AS
SELECT * FROM (VALUES (1, 1, 15), (1, 3, 30), (2, 1, 150), (2, 3, 300)) AS t(part_key, id, value);

statement ok
MERGE INTO partitioned AS target
USING _merge_source AS source
ON (target.part_key = source.part_key AND target.id = source.id)
WHEN MATCHED THEN UPDATE SET value = source.value
WHEN NOT MATCHED THEN INSERT *;

# Verify delete files were created by MERGE
query I
SELECT COUNT(*) FROM ducklake_metadata.ducklake_delete_file WHERE end_snapshot IS NULL
----
2

# Verify data is correct after merge
query III
SELECT * FROM partitioned ORDER BY part_key, id
----
1	1	15
1	2	20
1	3	30
2	1	150
2	2	200
2	3	300

# This should rewrite files with deletes, grouping by partition
# Currently this fails with: "DuckLakeCompactor: Files have different hive partition path"
statement ok
CALL ducklake_rewrite_data_files('ducklake', 'partitioned', delete_threshold => 0);

# After rewrite, delete files should be gone (applied to new data files)
query I
SELECT COUNT(*) FROM ducklake_metadata.ducklake_delete_file WHERE end_snapshot IS NULL
----
0

# Verify data is still correct after rewrite
query III
SELECT * FROM partitioned ORDER BY part_key, id
----
1	1	15
1	2	20
1	3	30
2	1	150
2	2	200
2	3	300

# Merge adjacent files to consolidate (rewrite doesn't merge, just applies deletes)
statement ok
CALL ducklake_merge_adjacent_files('ducklake', 'partitioned');

# Cleanup old files
statement ok
CALL ducklake_cleanup_old_files('ducklake', cleanup_all => true);

# Verify partition info is preserved correctly - should have 2 files (one per partition)
query II
SELECT partition_id, partition_value
FROM ducklake_metadata.ducklake_data_file
JOIN ducklake_metadata.ducklake_file_partition_value USING (data_file_id)
WHERE end_snapshot IS NULL
ORDER BY partition_value
----
2	1
2	2
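
For experimenting outside the sqllogictest harness (e.g. from the DuckDB CLI or the Python client), here is the same reproduction condensed to plain SQL; the attach path and DATA_PATH are placeholders:

-- placeholder paths; adjust to a local metadata file and data directory
ATTACH 'ducklake:metadata.db' AS ducklake (DATA_PATH 'rewrite_partitioned', METADATA_CATALOG 'ducklake_metadata');
USE ducklake;

CREATE TABLE partitioned(part_key INTEGER, id INTEGER, value INTEGER);
ALTER TABLE partitioned SET PARTITIONED BY (part_key);

INSERT INTO partitioned VALUES (1, 1, 10), (1, 2, 20);
INSERT INTO partitioned VALUES (2, 1, 100), (2, 2, 200);

CREATE TEMP TABLE _merge_source AS
SELECT * FROM (VALUES (1, 1, 15), (1, 3, 30), (2, 1, 150), (2, 3, 300)) AS t(part_key, id, value);

MERGE INTO partitioned AS target
USING _merge_source AS source
ON (target.part_key = source.part_key AND target.id = source.id)
WHEN MATCHED THEN UPDATE SET value = source.value
WHEN NOT MATCHED THEN INSERT *;

-- fails with: INTERNAL Error: DuckLakeCompactor: Files have different hive partition path
CALL ducklake_rewrite_data_files('ducklake', 'partitioned', delete_threshold => 0);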

OS:

MacOS 15.7.3: Darwin Mac.lan 24.6.0 Darwin Kernel Version 24.6.0: Wed Nov 5 21:32:38 PST 2025; root:xnu-11417.140.69.705.2~1/RELEASE_ARM64_T6031 arm64

DuckDB Version:

using 1.4.3 in project

DuckLake Version:

confirmed in ducklake main 06e9377

DuckDB Client:

Python

Hardware:

No response

Full Name:

Sam Vevang

Affiliation:

PRX

What is the latest build you tested with? If possible, we recommend testing with the latest nightly build.

I have tested with a source build

Did you include all relevant data sets for reproducing the issue?

Yes

Did you include all code required to reproduce the issue?

  • Yes, I have

Did you include all relevant configuration (e.g., CPU architecture, Python version, Linux distribution) to reproduce the issue?

  • Yes, I have
