Skip to content

Commit 960f44e

Browse files
authored
fix(query): fix incorrect order of group by items with CTE or subquery (#18692)
* fix(query): fix incorrect order of group by items with CTE or subquery
1 parent 846b83c commit 960f44e

File tree

4 files changed

+89
-16
lines changed

4 files changed

+89
-16
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -80,4 +80,5 @@ benchmark/clickbench/results
8080
# lychee
8181
.lycheecache
8282

83-
tests/nox/cache
83+
# tmp
84+
tmp

src/query/expression/src/aggregate/aggregate_hashtable.rs

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ use std::sync::atomic::Ordering;
1818
use std::sync::Arc;
1919

2020
use bumpalo::Bump;
21+
use databend_common_exception::ErrorCode;
2122
use databend_common_exception::Result;
2223

2324
use super::partitioned_payload::PartitionedPayload;
@@ -176,6 +177,20 @@ impl AggregateHashTable {
176177
agg_states: ProjectedBlock,
177178
row_count: usize,
178179
) -> Result<usize> {
180+
#[cfg(debug_assertions)]
181+
{
182+
for (i, group_column) in group_columns.iter().enumerate() {
183+
if group_column.data_type() != self.payload.group_types[i] {
184+
return Err(ErrorCode::UnknownException(format!(
185+
"group_column type not match in index {}, expect: {:?}, actual: {:?}",
186+
i,
187+
self.payload.group_types[i],
188+
group_column.data_type()
189+
)));
190+
}
191+
}
192+
}
193+
179194
state.row_count = row_count;
180195
group_hash_columns(group_columns, &mut state.group_hashes);
181196

src/query/sql/src/planner/optimizer/optimizers/rule/agg_rules/rule_hierarchical_grouping_sets.rs

Lines changed: 37 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -66,10 +66,16 @@ const ID: RuleID = RuleID::HierarchicalGroupingSetsToUnion;
6666
pub struct RuleHierarchicalGroupingSetsToUnion {
6767
id: RuleID,
6868
matchers: Vec<Matcher>,
69+
cte_channel_size: usize,
6970
}
7071

7172
impl RuleHierarchicalGroupingSetsToUnion {
72-
pub fn new(_ctx: Arc<OptimizerContext>) -> Self {
73+
pub fn new(ctx: Arc<OptimizerContext>) -> Self {
74+
let cte_channel_size = ctx
75+
.get_table_ctx()
76+
.get_settings()
77+
.get_grouping_sets_channel_size()
78+
.unwrap();
7379
Self {
7480
id: ID,
7581
matchers: vec![Matcher::MatchOp {
@@ -79,21 +85,36 @@ impl RuleHierarchicalGroupingSetsToUnion {
7985
children: vec![Matcher::Leaf],
8086
}],
8187
}],
88+
cte_channel_size: cte_channel_size as usize,
8289
}
8390
}
8491

8592
/// Analyzes grouping sets to build a true hierarchical dependency DAG
86-
fn build_hierarchy_dag(&self, grouping_sets: &[Vec<IndexType>]) -> HierarchyDAG {
93+
fn build_hierarchy_dag(
94+
&self,
95+
grouping_sets: &[Vec<IndexType>],
96+
agg: &Aggregate,
97+
) -> HierarchyDAG {
8798
let mut levels: Vec<GroupingLevel> = grouping_sets
8899
.iter()
89100
.enumerate()
90-
.map(|(idx, set)| GroupingLevel {
91-
set_index: idx,
92-
columns: set.clone(),
93-
direct_children: Vec::new(),
94-
possible_parents: Vec::new(),
95-
chosen_parent: None,
96-
level: set.len(),
101+
.map(|(idx, set)| {
102+
// Sort columns according to their order in group_items for consistent schema ordering
103+
let mut sorted_columns = set.clone();
104+
sorted_columns.sort_by_key(|&col_idx| {
105+
agg.group_items
106+
.iter()
107+
.position(|item| item.index == col_idx)
108+
.unwrap_or(usize::MAX) // Put unknown columns at the end
109+
});
110+
GroupingLevel {
111+
set_index: idx,
112+
columns: sorted_columns,
113+
direct_children: Vec::new(),
114+
possible_parents: Vec::new(),
115+
chosen_parent: None,
116+
level: set.len(),
117+
}
97118
})
98119
.collect();
99120

@@ -406,7 +427,7 @@ impl RuleHierarchicalGroupingSetsToUnion {
406427
cte_name: cte_name.to_string(),
407428
cte_output_columns: None,
408429
ref_count: 1,
409-
channel_size: None,
430+
channel_size: Some(self.cte_channel_size),
410431
}
411432
.into(),
412433
),
@@ -457,7 +478,7 @@ impl RuleHierarchicalGroupingSetsToUnion {
457478
cte_name: cte_name.to_string(),
458479
cte_output_columns: None,
459480
ref_count: 1,
460-
channel_size: None,
481+
channel_size: Some(self.cte_channel_size),
461482
}
462483
.into(),
463484
),
@@ -497,7 +518,7 @@ impl RuleHierarchicalGroupingSetsToUnion {
497518
cte_name: cte_name.to_string(),
498519
cte_output_columns: None,
499520
ref_count: 1,
500-
channel_size: None,
521+
channel_size: Some(self.cte_channel_size),
501522
}
502523
.into(),
503524
),
@@ -632,10 +653,11 @@ impl RuleHierarchicalGroupingSetsToUnion {
632653
// Create parent CTE consumer
633654
let parent_output_columns: Vec<IndexType> = {
634655
let mut output_cols = Vec::new();
635-
// Then: aggregate function output columns
656+
// First: aggregate function output columns
636657
for agg_item in &agg.aggregate_functions {
637658
output_cols.push(agg_item.index);
638659
}
660+
// Then: parent level columns (already sorted from build_hierarchy_dag)
639661
for &col_idx in &parent_level.columns {
640662
output_cols.push(col_idx);
641663
}
@@ -666,7 +688,7 @@ impl RuleHierarchicalGroupingSetsToUnion {
666688
cte_name: cte_name.to_string(),
667689
cte_output_columns: None,
668690
ref_count: 1,
669-
channel_size: None,
691+
channel_size: Some(self.cte_channel_size),
670692
}
671693
.into(),
672694
),
@@ -850,7 +872,7 @@ impl Rule for RuleHierarchicalGroupingSetsToUnion {
850872
}
851873

852874
// Build hierarchy DAG
853-
let hierarchy = self.build_hierarchy_dag(&grouping_sets.sets);
875+
let hierarchy = self.build_hierarchy_dag(&grouping_sets.sets, &agg);
854876
// Check if we have meaningful hierarchical structure
855877
let hierarchical_levels = hierarchy
856878
.levels

tests/sqllogictests/suites/duckdb/sql/aggregate/group/group_by_grouping_sets.test

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -261,6 +261,41 @@ a A 1 4 NULL A
261261
a B 1 5 NULL B
262262
a A 1 5 NULL NULL
263263

264+
265+
## group with CTE
266+
query ?TTT
267+
WITH cte_0 AS
268+
(
269+
SELECT try_cast(number % 10 AS String) AS media_source,
270+
try_cast(number % 12 AS String) AS site_name,
271+
try_cast(number AS Float64) AS bi_cost,
272+
try_cast((today() + ( number % 11)) AS Date) AS created_at
273+
FROM numbers(1000)
274+
),
275+
cte_1 AS
276+
(
277+
SELECT sum(bi_cost) AS bi_cost_agg,
278+
media_source,
279+
created_at,
280+
site_name
281+
FROM cte_0
282+
GROUP BY cube (media_source, created_at, site_name)
283+
HAVING 1 = 1)SELECT *
284+
FROM cte_1 order by bi_cost_agg desc LIMIT 10;
285+
---
286+
----
287+
499500.0 NULL NULL NULL
288+
50400.0 9 NULL NULL
289+
50300.0 8 NULL NULL
290+
50200.0 7 NULL NULL
291+
50100.0 6 NULL NULL
292+
50000.0 5 NULL NULL
293+
49900.0 4 NULL NULL
294+
49800.0 3 NULL NULL
295+
49700.0 2 NULL NULL
296+
49600.0 1 NULL NULL
297+
298+
264299
statement ok
265300
drop table t all;
266301

0 commit comments

Comments (0)