-
Notifications
You must be signed in to change notification settings - Fork 132
Description
What happens?
I'm developing an incremental update pattern that deletes+inserts data into a partitioned table in ducklake.
When I call ducklake_rewrite_data_files, I get an exception: INTERNAL Error: DuckLakeCompactor: Files have different hive partition path
Here is the full stack trace:
INTERNAL Error: DuckLakeCompactor: Files have different hive partition path
Stack Trace:
0 duckdb::Exception::Exception(duckdb::ExceptionType, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) + 52
1 duckdb::InternalException::InternalException(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) + 20
2 duckdb::DuckLakeCompactor::GenerateCompactionCommand(duckdb::vector<duckdb::DuckLakeCompactionFileEntry, true, std::__1::allocator<duckdb::DuckLakeCompactionFileEntry>>) + 5232
3 duckdb::DuckLakeCompactor::GenerateCompactions(duckdb::DuckLakeTableEntry&, duckdb::vector<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>, true, std::__1::allocator<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>>>&) + 924
4 duckdb::GenerateCompaction(duckdb::ClientContext&, duckdb::DuckLakeTransaction&, duckdb::DuckLakeCatalog&, duckdb::TableFunctionBindInput&, duckdb::DuckLakeTableEntry&, duckdb::CompactionType, double, unsigned long long, duckdb::optional_idx, duckdb::optional_idx, duckdb::vector<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>, true, std::__1::allocator<duckdb::unique_ptr<duckdb::LogicalOperator, std::__1::default_delete<duckdb::LogicalOperator>, true>>>&) + 204
5 duckdb::BindCompaction(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, unsigned long long, duckdb::CompactionType) + 1624
6 duckdb::RewriteFilesBind(duckdb::ClientContext&, duckdb::TableFunctionBindInput&, unsigned long long, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>&) + 124
7 duckdb::Binder::BindTableFunctionInternal(duckdb::TableFunction&, duckdb::TableFunctionRef const&, duckdb::vector<duckdb::Value, true, std::__1::allocator<duckdb::Value>>, std::__1::unordered_map<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, duckdb::Value, duckdb::CaseInsensitiveStringHashFunction, duckdb::CaseInsensitiveStringEquality, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const, duckdb::Value>>>, duckdb::vector<duckdb::LogicalType, true, std::__1::allocator<duckdb::LogicalType>>, duckdb::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, true, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>) + 368
8 duckdb::Binder::Bind(duckdb::TableFunctionRef&) + 2552
9 duckdb::Binder::Bind(duckdb::TableRef&) + 360
10 duckdb::Binder::BindNode(duckdb::SelectNode&) + 68
11 duckdb::Binder::BindNode(duckdb::QueryNode&) + 524
12 duckdb::Binder::Bind(duckdb::CallStatement&) + 736
13 duckdb::Planner::CreatePlan(duckdb::SQLStatement&) + 156
14 duckdb::ClientContext::CreatePreparedStatementInternal(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters) + 544
15 duckdb::ClientContext::CreatePreparedStatement(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters, duckdb::PreparedStatementMode) + 1048
16 duckdb::ClientContext::PendingStatementInternal(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters const&) + 132
17 duckdb::ClientContext::PendingStatementOrPreparedStatement(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::shared_ptr<duckdb::PreparedStatementData, true>&, duckdb::PendingQueryParameters const&) + 276
18 duckdb::ClientContext::PendingStatementOrPreparedStatementInternal(duckdb::ClientContextLock&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::shared_ptr<duckdb::PreparedStatementData, true>&, duckdb::PendingQueryParameters const&) + 1648
19 duckdb::ClientContext::PendingQueryInternal(duckdb::ClientContextLock&, duckdb::unique_ptr<duckdb::SQLStatement, std::__1::default_delete<duckdb::SQLStatement>, true>, duckdb::PendingQueryParameters const&, bool) + 132
20 duckdb::ClientContext::Query(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, duckdb::QueryParameters) + 368
21 duckdb::Command::ExecuteQuery(duckdb::ExecuteContext&, duckdb::Connection*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, unsigned long long) const + 152
22 duckdb::Statement::ExecuteInternal(duckdb::ExecuteContext&) const + 284
23 duckdb::SQLLogicTestRunner::ExecuteCommand(duckdb::unique_ptr<duckdb::Command, std::__1::default_delete<duckdb::Command>, true>) + 112
24 duckdb::SQLLogicTestRunner::ExecuteFile(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>) + 16768
25 void testRunner<false, false>() + 1808
26 Catch::RunContext::invokeActiveTestCase() + 216
27 Catch::RunContext::runCurrentTest(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>&) + 484
28 Catch::RunContext::runTest(Catch::TestCase const&) + 320
29 Catch::Session::runInternal() + 4012
30 Catch::Session::run() + 156
31 main + 1688
32 start + 6076
The problem appears to be in how DuckLakeCompactor::GenerateCompactions handles partition groups. Basically, in the code path that handles deletes, an early return appears to skip the step that groups files by partition, so files from different partitions end up in the same compaction group.
To Reproduce
I generated a test to reproduce the problem. To set it up, create a new test file test/sql/rewrite_data_files/test_rewrite_partitioned_table.test with the following:
# name: test/sql/rewrite_data_files/test_rewrite_partitioned_table.test
# description: Test rewrite_data_files on partitioned tables with delete files created via MERGE
# group: [rewrite_data_files]
require ducklake
require parquet
test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db
test-env DATA_PATH __TEST_DIR__
statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/rewrite_partitioned', METADATA_CATALOG 'ducklake_metadata')
statement ok
USE ducklake
# Create a partitioned table
statement ok
CREATE TABLE partitioned(part_key INTEGER, id INTEGER, value INTEGER);
statement ok
ALTER TABLE partitioned SET PARTITIONED BY (part_key);
# Initial data load into multiple partitions
statement ok
INSERT INTO partitioned VALUES (1, 1, 10), (1, 2, 20);
statement ok
INSERT INTO partitioned VALUES (2, 1, 100), (2, 2, 200);
# We should have 2 data files (one per INSERT statement; each INSERT wrote multiple rows)
query I
SELECT COUNT(*) FROM GLOB('${DATA_PATH}/rewrite_partitioned/**/*.parquet')
----
2
# Simulate incremental load with MERGE (upsert pattern)
# This updates existing rows and inserts new ones, creating delete files
statement ok
CREATE TEMP TABLE _merge_source AS
SELECT * FROM (VALUES (1, 1, 15), (1, 3, 30), (2, 1, 150), (2, 3, 300)) AS t(part_key, id, value);
statement ok
MERGE INTO partitioned AS target
USING _merge_source AS source
ON (target.part_key = source.part_key AND target.id = source.id)
WHEN MATCHED THEN UPDATE SET value = source.value
WHEN NOT MATCHED THEN INSERT *;
# Verify delete files were created by MERGE
query I
SELECT COUNT(*) FROM ducklake_metadata.ducklake_delete_file WHERE end_snapshot IS NULL
----
2
# Verify data is correct after merge
query III
SELECT * FROM partitioned ORDER BY part_key, id
----
1 1 15
1 2 20
1 3 30
2 1 150
2 2 200
2 3 300
# This should rewrite the files that have pending deletes, grouping them by partition.
# Currently this fails with: "INTERNAL Error: DuckLakeCompactor: Files have different hive partition path"
statement ok
CALL ducklake_rewrite_data_files('ducklake', 'partitioned', delete_threshold => 0);
# After rewrite, delete files should be gone (applied to new data files)
query I
SELECT COUNT(*) FROM ducklake_metadata.ducklake_delete_file WHERE end_snapshot IS NULL
----
0
# Verify data is still correct after rewrite
query III
SELECT * FROM partitioned ORDER BY part_key, id
----
1 1 15
1 2 20
1 3 30
2 1 150
2 2 200
2 3 300
# Merge adjacent files to consolidate (rewrite doesn't merge, just applies deletes)
statement ok
CALL ducklake_merge_adjacent_files('ducklake', 'partitioned');
# Cleanup old files
statement ok
CALL ducklake_cleanup_old_files('ducklake', cleanup_all => true);
# Verify partition info is preserved correctly - should have 2 files (one per partition)
query II
SELECT partition_id, partition_value
FROM ducklake_metadata.ducklake_data_file
JOIN ducklake_metadata.ducklake_file_partition_value USING (data_file_id)
WHERE end_snapshot IS NULL
ORDER BY partition_value
----
2 1
2 2
OS:
MacOS 15.7.3: Darwin Mac.lan 24.6.0 Darwin Kernel Version 24.6.0: Wed Nov 5 21:32:38 PST 2025; root:xnu-11417.140.69.705.2~1/RELEASE_ARM64_T6031 arm64
DuckDB Version:
using 1.4.3 in project
DuckLake Version:
confirmed in ducklake main 06e9377
DuckDB Client:
Python
Hardware:
No response
Full Name:
Sam Vevang
Affiliation:
PRX
What is the latest build you tested with? If possible, we recommend testing with the latest nightly build.
I have tested with a source build
Did you include all relevant data sets for reproducing the issue?
Yes
Did you include all code required to reproduce the issue?
- Yes, I have
Did you include all relevant configuration (e.g., CPU architecture, Python version, Linux distribution) to reproduce the issue?
- Yes, I have