@@ -17,8 +17,8 @@ static string GenerateColumnName(const idx_t total_cols, const idx_t col_number,
17
17
// Helper function for UTF-8 aware space trimming
18
18
static string TrimWhitespace (const string &col_name) {
19
19
utf8proc_int32_t codepoint;
20
- auto str = reinterpret_cast <const utf8proc_uint8_t *>(col_name.c_str ());
21
- idx_t size = col_name.size ();
20
+ const auto str = reinterpret_cast <const utf8proc_uint8_t *>(col_name.c_str ());
21
+ const idx_t size = col_name.size ();
22
22
// Find the first character that is not left trimmed
23
23
idx_t begin = 0 ;
24
24
while (begin < size) {
@@ -96,6 +96,44 @@ static string NormalizeColumnName(const string &col_name) {
96
96
return col_name_cleaned;
97
97
}
98
98
99
+ static void ReplaceNames (vector<string> &detected_names, CSVStateMachine &state_machine,
100
+ unordered_map<idx_t , vector<LogicalType>> &best_sql_types_candidates_per_column_idx,
101
+ CSVReaderOptions &options, const vector<HeaderValue> &best_header_row,
102
+ CSVErrorHandler &error_handler) {
103
+ auto &dialect_options = state_machine.dialect_options ;
104
+ if (!options.columns_set ) {
105
+ if (options.file_options .hive_partitioning || options.file_options .union_by_name || options.multi_file_reader ) {
106
+ // Just do the replacement
107
+ for (idx_t i = 0 ; i < MinValue<idx_t >(detected_names.size (), options.name_list .size ()); i++) {
108
+ detected_names[i] = options.name_list [i];
109
+ }
110
+ return ;
111
+ }
112
+ if (options.name_list .size () > dialect_options.num_cols ) {
113
+ if (options.null_padding ) {
114
+ // we increase our types
115
+ idx_t col = 0 ;
116
+ for (idx_t i = dialect_options.num_cols ; i < options.name_list .size (); i++) {
117
+ detected_names.push_back (GenerateColumnName (options.name_list .size (), col++));
118
+ best_sql_types_candidates_per_column_idx[i] = {LogicalType::VARCHAR};
119
+ }
120
+
121
+ dialect_options.num_cols = options.name_list .size ();
122
+
123
+ } else {
124
+ // we throw an error
125
+ const auto error = CSVError::HeaderSniffingError (
126
+ options, best_header_row, options.name_list .size (),
127
+ state_machine.dialect_options .state_machine_options .delimiter .GetValue ());
128
+ error_handler.Error (error);
129
+ }
130
+ }
131
+ for (idx_t i = 0 ; i < options.name_list .size (); i++) {
132
+ detected_names[i] = options.name_list [i];
133
+ }
134
+ }
135
+ }
136
+
99
137
// If our columns were set by the user, we verify if their names match with the first row
100
138
bool CSVSniffer::DetectHeaderWithSetColumn (ClientContext &context, vector<HeaderValue> &best_header_row,
101
139
const SetColumns &set_columns, CSVReaderOptions &options) {
@@ -181,11 +219,8 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector<HeaderValue> &be
181
219
detected_names.push_back (GenerateColumnName (dialect_options.num_cols , col));
182
220
}
183
221
// If the user provided names, we must replace our header with the user provided names
184
- if (!options.columns_set ) {
185
- for (idx_t i = 0 ; i < MinValue<idx_t >(best_header_row.size (), options.name_list .size ()); i++) {
186
- detected_names[i] = options.name_list [i];
187
- }
188
- }
222
+ ReplaceNames (detected_names, state_machine, best_sql_types_candidates_per_column_idx, options, best_header_row,
223
+ error_handler);
189
224
return detected_names;
190
225
}
191
226
// information for header detection
@@ -199,11 +234,8 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector<HeaderValue> &be
199
234
detected_names.push_back (GenerateColumnName (dialect_options.num_cols , col));
200
235
}
201
236
dialect_options.rows_until_header += 1 ;
202
- if (!options.columns_set ) {
203
- for (idx_t i = 0 ; i < MinValue<idx_t >(detected_names.size (), options.name_list .size ()); i++) {
204
- detected_names[i] = options.name_list [i];
205
- }
206
- }
237
+ ReplaceNames (detected_names, state_machine, best_sql_types_candidates_per_column_idx, options,
238
+ best_header_row, error_handler);
207
239
return detected_names;
208
240
}
209
241
auto error =
@@ -295,16 +327,17 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector<HeaderValue> &be
295
327
}
296
328
297
329
// If the user provided names, we must replace our header with the user provided names
298
- if (!options.columns_set ) {
299
- for (idx_t i = 0 ; i < MinValue<idx_t >(detected_names.size (), options.name_list .size ()); i++) {
300
- detected_names[i] = options.name_list [i];
301
- }
302
- }
330
+ ReplaceNames (detected_names, state_machine, best_sql_types_candidates_per_column_idx, options, best_header_row,
331
+ error_handler);
303
332
return detected_names;
304
333
}
305
334
void CSVSniffer::DetectHeader () {
306
335
auto &sniffer_state_machine = best_candidate->GetStateMachine ();
307
336
names = DetectHeaderInternal (buffer_manager->context , best_header_row, sniffer_state_machine, set_columns,
308
337
best_sql_types_candidates_per_column_idx, options, *error_handler);
338
+ for (idx_t i = max_columns_found; i < names.size (); i++) {
339
+ detected_types.push_back (LogicalType::VARCHAR);
340
+ }
341
+ max_columns_found = names.size ();
309
342
}
310
343
} // namespace duckdb
0 commit comments