|
7 | 7 |
|
8 | 8 | namespace duckdb {
|
9 | 9 |
|
10 |
| -void ListFlattenFunction(DataChunk &args, ExpressionState &state, Vector &result) { |
11 |
| - D_ASSERT(args.ColumnCount() == 1); |
| 10 | +static void ListFlattenFunction(DataChunk &args, ExpressionState &, Vector &result) { |
12 | 11 |
|
13 |
| - Vector &input = args.data[0]; |
14 |
| - if (input.GetType().id() == LogicalTypeId::SQLNULL) { |
15 |
| - result.Reference(input); |
| 12 | + const auto flat_list_data = FlatVector::GetData<list_entry_t>(result); |
| 13 | + auto &flat_list_mask = FlatVector::Validity(result); |
| 14 | + |
| 15 | + UnifiedVectorFormat outer_format; |
| 16 | + UnifiedVectorFormat inner_format; |
| 17 | + UnifiedVectorFormat items_format; |
| 18 | + |
| 19 | + // Setup outer vec |
| 20 | + auto &outer_vec = args.data[0]; |
| 21 | + const auto outer_count = args.size(); |
| 22 | + outer_vec.ToUnifiedFormat(outer_count, outer_format); |
| 23 | + |
| 24 | + // Special case: outer list is all-null |
| 25 | + if (outer_vec.GetType().id() == LogicalTypeId::SQLNULL) { |
| 26 | + result.Reference(outer_vec); |
16 | 27 | return;
|
17 | 28 | }
|
18 | 29 |
|
19 |
| - idx_t count = args.size(); |
20 |
| - |
21 |
| - // Prepare the result vector |
22 |
| - result.SetVectorType(VectorType::FLAT_VECTOR); |
23 |
| - // This holds the new offsets and lengths |
24 |
| - auto result_entries = FlatVector::GetData<list_entry_t>(result); |
25 |
| - auto &result_validity = FlatVector::Validity(result); |
26 |
| - |
27 |
| - // The outermost list in each row |
28 |
| - UnifiedVectorFormat row_data; |
29 |
| - input.ToUnifiedFormat(count, row_data); |
30 |
| - auto row_entries = UnifiedVectorFormat::GetData<list_entry_t>(row_data); |
31 |
| - |
32 |
| - // The list elements in each row: [HERE, ...] |
33 |
| - auto &row_lists = ListVector::GetEntry(input); |
34 |
| - UnifiedVectorFormat row_lists_data; |
35 |
| - idx_t total_row_lists = ListVector::GetListSize(input); |
36 |
| - row_lists.ToUnifiedFormat(total_row_lists, row_lists_data); |
37 |
| - auto row_lists_entries = UnifiedVectorFormat::GetData<list_entry_t>(row_lists_data); |
38 |
| - |
39 |
| - if (row_lists.GetType().id() == LogicalTypeId::SQLNULL) { |
40 |
| - for (idx_t row_cnt = 0; row_cnt < count; row_cnt++) { |
41 |
| - auto row_idx = row_data.sel->get_index(row_cnt); |
42 |
| - if (!row_data.validity.RowIsValid(row_idx)) { |
43 |
| - result_validity.SetInvalid(row_cnt); |
| 30 | + // Setup inner vec |
| 31 | + auto &inner_vec = ListVector::GetEntry(outer_vec); |
| 32 | + const auto inner_count = ListVector::GetListSize(outer_vec); |
| 33 | + inner_vec.ToUnifiedFormat(inner_count, inner_format); |
| 34 | + |
| 35 | + // Special case: inner list is all-null |
| 36 | + if (inner_vec.GetType().id() == LogicalTypeId::SQLNULL) { |
| 37 | + for (idx_t outer_raw_idx = 0; outer_raw_idx < outer_count; outer_raw_idx++) { |
| 38 | + const auto outer_idx = outer_format.sel->get_index(outer_raw_idx); |
| 39 | + if (!outer_format.validity.RowIsValid(outer_idx)) { |
| 40 | + flat_list_mask.SetInvalid(outer_raw_idx); |
44 | 41 | continue;
|
45 | 42 | }
|
46 |
| - result_entries[row_cnt].offset = 0; |
47 |
| - result_entries[row_cnt].length = 0; |
| 43 | + flat_list_data[outer_raw_idx].offset = 0; |
| 44 | + flat_list_data[outer_raw_idx].length = 0; |
48 | 45 | }
|
49 | 46 | if (args.AllConstant()) {
|
50 | 47 | result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
51 | 48 | }
|
52 | 49 | return;
|
53 | 50 | }
|
54 | 51 |
|
55 |
| - // The actual elements inside each row list: [[HERE, ...], []] |
56 |
| - // This one becomes the child vector of the result. |
57 |
| - auto &elem_vector = ListVector::GetEntry(row_lists); |
| 52 | + // Setup items vec |
| 53 | + auto &items_vec = ListVector::GetEntry(inner_vec); |
| 54 | + const auto items_count = ListVector::GetListSize(inner_vec); |
| 55 | + items_vec.ToUnifiedFormat(items_count, items_format); |
| 56 | + |
| 57 | + // First pass: Figure out the total amount of items. |
| 58 | + // This can be more than items_count if the inner lists reference the same item(s) multiple times. |
| 59 | + |
| 60 | + idx_t total_items = 0; |
| 61 | + |
| 62 | + const auto outer_data = UnifiedVectorFormat::GetData<list_entry_t>(outer_format); |
| 63 | + const auto inner_data = UnifiedVectorFormat::GetData<list_entry_t>(inner_format); |
| 64 | + |
| 65 | + for (idx_t outer_raw_idx = 0; outer_raw_idx < outer_count; outer_raw_idx++) { |
| 66 | + const auto outer_idx = outer_format.sel->get_index(outer_raw_idx); |
| 67 | + |
| 68 | + if (!outer_format.validity.RowIsValid(outer_idx)) { |
| 69 | + continue; |
| 70 | + } |
| 71 | + |
| 72 | + const auto &outer_entry = outer_data[outer_idx]; |
| 73 | + |
| 74 | + for (idx_t inner_raw_idx = outer_entry.offset; inner_raw_idx < outer_entry.offset + outer_entry.length; |
| 75 | + inner_raw_idx++) { |
| 76 | + const auto inner_idx = inner_format.sel->get_index(inner_raw_idx); |
58 | 77 |
|
59 |
| - // We'll use this selection vector to slice the elem_vector. |
60 |
| - idx_t child_elem_cnt = ListVector::GetListSize(row_lists); |
61 |
| - SelectionVector sel(child_elem_cnt); |
| 78 | + if (!inner_format.validity.RowIsValid(inner_idx)) { |
| 79 | + continue; |
| 80 | + } |
| 81 | + |
| 82 | + const auto &inner_entry = inner_data[inner_idx]; |
| 83 | + |
| 84 | + total_items += inner_entry.length; |
| 85 | + } |
| 86 | + } |
| 87 | + |
| 88 | + // Now we know the total amount of items, we can create our selection vector. |
| 89 | + SelectionVector sel(total_items); |
62 | 90 | idx_t sel_idx = 0;
|
63 | 91 |
|
64 |
| - // HERE, [[]], ... |
65 |
| - for (idx_t row_cnt = 0; row_cnt < count; row_cnt++) { |
66 |
| - auto row_idx = row_data.sel->get_index(row_cnt); |
| 92 | + // Second pass: Fill the selection vector (and the result list entries) |
| 93 | + |
| 94 | + for (idx_t outer_raw_idx = 0; outer_raw_idx < outer_count; outer_raw_idx++) { |
| 95 | + const auto outer_idx = outer_format.sel->get_index(outer_raw_idx); |
67 | 96 |
|
68 |
| - if (!row_data.validity.RowIsValid(row_idx)) { |
69 |
| - result_validity.SetInvalid(row_cnt); |
| 97 | + if (!outer_format.validity.RowIsValid(outer_idx)) { |
| 98 | + flat_list_mask.SetInvalid(outer_raw_idx); |
70 | 99 | continue;
|
71 | 100 | }
|
72 | 101 |
|
73 |
| - idx_t list_offset = sel_idx; |
74 |
| - idx_t list_length = 0; |
| 102 | + const auto &outer_entry = outer_data[outer_idx]; |
| 103 | + |
| 104 | + list_entry_t list_entry = {sel_idx, 0}; |
75 | 105 |
|
76 |
| - // [HERE, [...], ...] |
77 |
| - auto row_entry = row_entries[row_idx]; |
78 |
| - for (idx_t row_lists_cnt = 0; row_lists_cnt < row_entry.length; row_lists_cnt++) { |
79 |
| - auto row_lists_idx = row_lists_data.sel->get_index(row_entry.offset + row_lists_cnt); |
| 106 | + for (idx_t inner_raw_idx = outer_entry.offset; inner_raw_idx < outer_entry.offset + outer_entry.length; |
| 107 | + inner_raw_idx++) { |
| 108 | + const auto inner_idx = inner_format.sel->get_index(inner_raw_idx); |
80 | 109 |
|
81 |
| - // Skip invalid lists |
82 |
| - if (!row_lists_data.validity.RowIsValid(row_lists_idx)) { |
| 110 | + if (!inner_format.validity.RowIsValid(inner_idx)) { |
83 | 111 | continue;
|
84 | 112 | }
|
85 | 113 |
|
86 |
| - // [[HERE, ...], [.., ...]] |
87 |
| - auto list_entry = row_lists_entries[row_lists_idx]; |
88 |
| - list_length += list_entry.length; |
| 114 | + const auto &inner_entry = inner_data[inner_idx]; |
| 115 | + |
| 116 | + list_entry.length += inner_entry.length; |
| 117 | + |
| 118 | + for (idx_t elem_raw_idx = inner_entry.offset; elem_raw_idx < inner_entry.offset + inner_entry.length; |
| 119 | + elem_raw_idx++) { |
| 120 | + const auto elem_idx = items_format.sel->get_index(elem_raw_idx); |
89 | 121 |
|
90 |
| - for (idx_t elem_cnt = 0; elem_cnt < list_entry.length; elem_cnt++) { |
91 |
| - // offset of the element in the elem_vector. |
92 |
| - idx_t offset = list_entry.offset + elem_cnt; |
93 |
| - sel.set_index(sel_idx, offset); |
| 122 | + sel.set_index(sel_idx, elem_idx); |
94 | 123 | sel_idx++;
|
95 | 124 | }
|
96 | 125 | }
|
97 | 126 |
|
98 |
| - result_entries[row_cnt].offset = list_offset; |
99 |
| - result_entries[row_cnt].length = list_length; |
| 127 | + // Assign the result list entry |
| 128 | + flat_list_data[outer_raw_idx] = list_entry; |
100 | 129 | }
|
101 | 130 |
|
| 131 | + // Now assign the result |
102 | 132 | ListVector::SetListSize(result, sel_idx);
|
103 | 133 |
|
104 | 134 | auto &result_child_vector = ListVector::GetEntry(result);
|
105 |
| - result_child_vector.Slice(elem_vector, sel, sel_idx); |
| 135 | + result_child_vector.Slice(items_vec, sel, sel_idx); |
106 | 136 | result_child_vector.Flatten(sel_idx);
|
107 | 137 |
|
108 | 138 | if (args.AllConstant()) {
|
|
0 commit comments