@@ -95,17 +95,46 @@ impl From<i64> for JsonPath<'_> {
95
95
}
96
96
}
97
97
98
- impl < ' s > JsonPath < ' s > {
99
- pub fn extract_path ( args : & ' s [ ColumnarValue ] ) -> Vec < Self > {
100
- args[ 1 ..]
98
+ enum JsonPathArgs < ' a > {
99
+ Array ( & ' a ArrayRef ) ,
100
+ Scalars ( Vec < JsonPath < ' a > > ) ,
101
+ }
102
+
103
+ impl < ' s > JsonPathArgs < ' s > {
104
+ fn extract_path ( path_args : & ' s [ ColumnarValue ] ) -> DataFusionResult < Self > {
105
+ // If there is a single argument as an array, we know how to handle it
106
+ if let Some ( ( ColumnarValue :: Array ( array) , & [ ] ) ) = path_args. split_first ( ) {
107
+ return Ok ( Self :: Array ( array) ) ;
108
+ }
109
+
110
+ path_args
101
111
. iter ( )
102
- . map ( |arg| match arg {
103
- ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( s) ) | ScalarValue :: LargeUtf8 ( Some ( s) ) ) => Self :: Key ( s) ,
104
- ColumnarValue :: Scalar ( ScalarValue :: UInt64 ( Some ( i) ) ) => ( * i) . into ( ) ,
105
- ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( i) ) ) => ( * i) . into ( ) ,
106
- _ => Self :: None ,
112
+ . enumerate ( )
113
+ . map ( |( pos, arg) | match arg {
114
+ ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( s) ) | ScalarValue :: LargeUtf8 ( Some ( s) ) ) => {
115
+ Ok ( JsonPath :: Key ( s) )
116
+ }
117
+ ColumnarValue :: Scalar ( ScalarValue :: UInt64 ( Some ( i) ) ) => Ok ( ( * i) . into ( ) ) ,
118
+ ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( i) ) ) => Ok ( ( * i) . into ( ) ) ,
119
+ ColumnarValue :: Scalar (
120
+ ScalarValue :: Null
121
+ | ScalarValue :: Utf8 ( None )
122
+ | ScalarValue :: LargeUtf8 ( None )
123
+ | ScalarValue :: UInt64 ( None )
124
+ | ScalarValue :: Int64 ( None ) ,
125
+ ) => Ok ( JsonPath :: None ) ,
126
+ ColumnarValue :: Array ( _) => {
127
+ // if there was a single arg, which is an array, handled above in the
128
+ // split_first case. So this is multiple args of which one is an array
129
+ exec_err ! ( "More than 1 path element is not supported when querying JSON using an array." )
130
+ }
131
+ ColumnarValue :: Scalar ( arg) => exec_err ! (
132
+ "Unexpected argument type at position {}, expected string or int, got {arg:?}." ,
133
+ pos + 1
134
+ ) ,
107
135
} )
108
- . collect ( )
136
+ . collect :: < DataFusionResult < _ > > ( )
137
+ . map ( JsonPathArgs :: Scalars )
109
138
}
110
139
}
111
140
@@ -116,154 +145,173 @@ pub fn invoke<C: FromIterator<Option<I>> + 'static, I>(
116
145
to_scalar : impl Fn ( Option < I > ) -> ScalarValue ,
117
146
return_dict : bool ,
118
147
) -> DataFusionResult < ColumnarValue > {
119
- let Some ( first_arg) = args. first ( ) else {
120
- // I think this can't happen, but I assumed the same about args[1] and I was wrong, so better to be safe
148
+ let Some ( ( json_arg, path_args) ) = args. split_first ( ) else {
121
149
return exec_err ! ( "expected at least one argument" ) ;
122
150
} ;
123
- match first_arg {
124
- ColumnarValue :: Array ( json_array) => {
125
- let array = match args. get ( 1 ) {
126
- Some ( ColumnarValue :: Array ( a) ) => {
127
- if args. len ( ) > 2 {
128
- // TODO perhaps we could support this by zipping the arrays, but it's not trivial, #23
129
- exec_err ! ( "More than 1 path element is not supported when querying JSON using an array." )
130
- } else {
131
- invoke_array ( json_array, a, to_array, jiter_find, return_dict)
132
- }
133
- }
134
- Some ( ColumnarValue :: Scalar ( _) ) => scalar_apply (
135
- json_array,
136
- & JsonPath :: extract_path ( args) ,
137
- to_array,
138
- jiter_find,
139
- return_dict,
140
- ) ,
141
- None => scalar_apply ( json_array, & [ ] , to_array, jiter_find, return_dict) ,
142
- } ;
143
- array. map ( ColumnarValue :: from)
151
+
152
+ let path = JsonPathArgs :: extract_path ( path_args) ?;
153
+ match ( json_arg, path) {
154
+ ( ColumnarValue :: Array ( json_array) , JsonPathArgs :: Array ( path_array) ) => {
155
+ invoke_array_array ( json_array, path_array, to_array, jiter_find, return_dict) . map ( ColumnarValue :: Array )
156
+ }
157
+ ( ColumnarValue :: Array ( json_array) , JsonPathArgs :: Scalars ( path) ) => {
158
+ invoke_array_scalars ( json_array, & path, to_array, jiter_find, return_dict) . map ( ColumnarValue :: Array )
159
+ }
160
+ ( ColumnarValue :: Scalar ( s) , JsonPathArgs :: Array ( path_array) ) => {
161
+ invoke_scalar_array ( s, path_array, jiter_find, to_array)
162
+ }
163
+ ( ColumnarValue :: Scalar ( s) , JsonPathArgs :: Scalars ( path) ) => {
164
+ invoke_scalar_scalars ( s, & path, jiter_find, to_scalar)
144
165
}
145
- ColumnarValue :: Scalar ( s) => invoke_scalar ( s, args, jiter_find, to_scalar) ,
146
166
}
147
167
}
148
168
149
- fn invoke_array < C : FromIterator < Option < I > > + ' static , I > (
169
+ fn invoke_array_array < C : FromIterator < Option < I > > + ' static , I > (
150
170
json_array : & ArrayRef ,
151
- needle_array : & ArrayRef ,
171
+ path_array : & ArrayRef ,
152
172
to_array : impl Fn ( C ) -> DataFusionResult < ArrayRef > ,
153
173
jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
154
174
return_dict : bool ,
155
175
) -> DataFusionResult < ArrayRef > {
156
176
downcast_dictionary_array ! (
157
- needle_array => match needle_array. values( ) . data_type( ) {
158
- DataType :: Utf8 => zip_apply( json_array, needle_array. downcast_dict:: <StringArray >( ) . unwrap( ) , to_array, jiter_find, true , return_dict) ,
159
- DataType :: LargeUtf8 => zip_apply( json_array, needle_array. downcast_dict:: <LargeStringArray >( ) . unwrap( ) , to_array, jiter_find, true , return_dict) ,
160
- DataType :: Utf8View => zip_apply( json_array, needle_array. downcast_dict:: <StringViewArray >( ) . unwrap( ) , to_array, jiter_find, true , return_dict) ,
161
- DataType :: Int64 => zip_apply( json_array, needle_array. downcast_dict:: <Int64Array >( ) . unwrap( ) , to_array, jiter_find, false , return_dict) ,
162
- DataType :: UInt64 => zip_apply( json_array, needle_array. downcast_dict:: <UInt64Array >( ) . unwrap( ) , to_array, jiter_find, false , return_dict) ,
163
- other => exec_err!( "unexpected second argument type, expected string or int array, got {:?}" , other) ,
164
- } ,
165
- DataType :: Utf8 => zip_apply( json_array, needle_array. as_string:: <i32 >( ) , to_array, jiter_find, true , return_dict) ,
166
- DataType :: LargeUtf8 => zip_apply( json_array, needle_array. as_string:: <i64 >( ) , to_array, jiter_find, true , return_dict) ,
167
- DataType :: Utf8View => zip_apply( json_array, needle_array. as_string_view( ) , to_array, jiter_find, true , return_dict) ,
168
- DataType :: Int64 => zip_apply( json_array, needle_array. as_primitive:: <Int64Type >( ) , to_array, jiter_find, false , return_dict) ,
169
- DataType :: UInt64 => zip_apply( json_array, needle_array. as_primitive:: <UInt64Type >( ) , to_array, jiter_find, false , return_dict) ,
170
- other => exec_err!( "unexpected second argument type, expected string or int array, got {:?}" , other)
177
+ json_array => {
178
+ let values = invoke_array_array( json_array. values( ) , path_array, to_array, jiter_find, return_dict) ?;
179
+ post_process_dict( json_array, values, return_dict)
180
+ }
181
+ DataType :: Utf8 => zip_apply( json_array. as_string:: <i32 >( ) . iter( ) , path_array, to_array, jiter_find) ,
182
+ DataType :: LargeUtf8 => zip_apply( json_array. as_string:: <i64 >( ) . iter( ) , path_array, to_array, jiter_find) ,
183
+ DataType :: Utf8View => zip_apply( json_array. as_string_view( ) . iter( ) , path_array, to_array, jiter_find) ,
184
+ other => if let Some ( string_array) = nested_json_array( json_array, is_object_lookup_array( path_array. data_type( ) ) ) {
185
+ zip_apply( string_array. iter( ) , path_array, to_array, jiter_find)
186
+ } else {
187
+ exec_err!( "unexpected json array type {:?}" , other)
188
+ }
171
189
)
172
190
}
173
191
174
- fn zip_apply < ' a , P : Into < JsonPath < ' a > > , C : FromIterator < Option < I > > + ' static , I > (
192
+ fn invoke_array_scalars < C : FromIterator < Option < I > > , I > (
175
193
json_array : & ArrayRef ,
176
- path_array : impl ArrayAccessor < Item = P > ,
194
+ path : & [ JsonPath ] ,
177
195
to_array : impl Fn ( C ) -> DataFusionResult < ArrayRef > ,
178
196
jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
179
- object_lookup : bool ,
180
197
return_dict : bool ,
181
198
) -> DataFusionResult < ArrayRef > {
199
+ fn inner < ' j , C : FromIterator < Option < I > > , I > (
200
+ json_iter : impl IntoIterator < Item = Option < & ' j str > > ,
201
+ path : & [ JsonPath ] ,
202
+ jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
203
+ ) -> C {
204
+ json_iter
205
+ . into_iter ( )
206
+ . map ( |opt_json| jiter_find ( opt_json, path) . ok ( ) )
207
+ . collect :: < C > ( )
208
+ }
209
+
182
210
let c = downcast_dictionary_array ! (
183
211
json_array => {
184
- let values = zip_apply ( json_array. values( ) , path_array , to_array, jiter_find, object_lookup , false ) ?;
212
+ let values = invoke_array_scalars ( json_array. values( ) , path , to_array, jiter_find, false ) ?;
185
213
return post_process_dict( json_array, values, return_dict) ;
186
214
}
187
- DataType :: Utf8 => zip_apply_iter ( json_array. as_string:: <i32 >( ) . iter ( ) , path_array , jiter_find) ,
188
- DataType :: LargeUtf8 => zip_apply_iter ( json_array. as_string:: <i64 >( ) . iter ( ) , path_array , jiter_find) ,
189
- DataType :: Utf8View => zip_apply_iter ( json_array. as_string_view( ) . iter ( ) , path_array , jiter_find) ,
190
- other => if let Some ( string_array) = nested_json_array( json_array, object_lookup ) {
191
- zip_apply_iter ( string_array. iter ( ) , path_array , jiter_find)
215
+ DataType :: Utf8 => inner ( json_array. as_string:: <i32 >( ) , path , jiter_find) ,
216
+ DataType :: LargeUtf8 => inner ( json_array. as_string:: <i64 >( ) , path , jiter_find) ,
217
+ DataType :: Utf8View => inner ( json_array. as_string_view( ) , path , jiter_find) ,
218
+ other => if let Some ( string_array) = nested_json_array( json_array, is_object_lookup ( path ) ) {
219
+ inner ( string_array, path , jiter_find)
192
220
} else {
193
221
return exec_err!( "unexpected json array type {:?}" , other) ;
194
222
}
195
223
) ;
196
-
197
224
to_array ( c)
198
225
}
199
226
200
- #[ allow( clippy:: needless_pass_by_value) ] // ArrayAccessor is implemented on references
201
- fn zip_apply_iter < ' a , ' j , P : Into < JsonPath < ' a > > , C : FromIterator < Option < I > > + ' static , I > (
202
- json_iter : impl Iterator < Item = Option < & ' j str > > ,
203
- path_array : impl ArrayAccessor < Item = P > ,
227
+ fn invoke_scalar_array < C : FromIterator < Option < I > > + ' static , I > (
228
+ scalar : & ScalarValue ,
229
+ path_array : & ArrayRef ,
204
230
jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
205
- ) -> C {
206
- json_iter
207
- . enumerate ( )
208
- . map ( | ( i , opt_json ) | {
209
- if path_array . is_null ( i ) {
210
- None
211
- } else {
212
- let path = path_array. value ( i ) . into ( ) ;
213
- jiter_find ( opt_json , & [ path ] ) . ok ( )
214
- }
215
- } )
216
- . collect :: < C > ( )
231
+ to_array : impl Fn ( C ) -> DataFusionResult < ArrayRef > ,
232
+ ) -> DataFusionResult < ColumnarValue > {
233
+ let s = extract_json_scalar ( scalar ) ? ;
234
+ // TODO: possible optimization here if path_array is a dictionary; can apply against the
235
+ // dictionary values directly for less work
236
+ zip_apply (
237
+ std :: iter :: repeat ( s ) . take ( path_array . len ( ) ) ,
238
+ path_array,
239
+ to_array ,
240
+ jiter_find ,
241
+ )
242
+ . map ( ColumnarValue :: Array )
217
243
}
218
244
219
- fn invoke_scalar < I > (
245
+ fn invoke_scalar_scalars < I > (
220
246
scalar : & ScalarValue ,
221
- args : & [ ColumnarValue ] ,
247
+ path : & [ JsonPath ] ,
222
248
jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
223
249
to_scalar : impl Fn ( Option < I > ) -> ScalarValue ,
224
250
) -> DataFusionResult < ColumnarValue > {
225
- match scalar {
226
- ScalarValue :: Dictionary ( _, b) => invoke_scalar ( b. as_ref ( ) , args, jiter_find, to_scalar) ,
227
- ScalarValue :: Utf8 ( s) | ScalarValue :: Utf8View ( s) | ScalarValue :: LargeUtf8 ( s) => {
228
- let path = JsonPath :: extract_path ( args) ;
229
- let v = jiter_find ( s. as_ref ( ) . map ( String :: as_str) , & path) . ok ( ) ;
230
- Ok ( ColumnarValue :: Scalar ( to_scalar ( v) ) )
231
- }
232
- ScalarValue :: Union ( type_id_value, union_fields, _) => {
233
- let opt_json = json_from_union_scalar ( type_id_value. as_ref ( ) , union_fields) ;
234
- let v = jiter_find ( opt_json, & JsonPath :: extract_path ( args) ) . ok ( ) ;
235
- Ok ( ColumnarValue :: Scalar ( to_scalar ( v) ) )
236
- }
237
- _ => {
238
- exec_err ! ( "unexpected first argument type, expected string or JSON union" )
239
- }
240
- }
251
+ let s = extract_json_scalar ( scalar) ?;
252
+ let v = jiter_find ( s, path) . ok ( ) ;
253
+ Ok ( ColumnarValue :: Scalar ( to_scalar ( v) ) )
241
254
}
242
255
243
- fn scalar_apply < C : FromIterator < Option < I > > , I > (
244
- json_array : & ArrayRef ,
245
- path : & [ JsonPath ] ,
256
+ fn zip_apply < ' a , C : FromIterator < Option < I > > + ' static , I > (
257
+ json_array : impl IntoIterator < Item = Option < & ' a str > > ,
258
+ path_array : & ArrayRef ,
246
259
to_array : impl Fn ( C ) -> DataFusionResult < ArrayRef > ,
247
260
jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
248
- return_dict : bool ,
249
261
) -> DataFusionResult < ArrayRef > {
262
+ #[ allow( clippy:: needless_pass_by_value) ] // ArrayAccessor is implemented on references
263
+ fn inner < ' a , ' j , P : Into < JsonPath < ' a > > , C : FromIterator < Option < I > > + ' static , I > (
264
+ json_iter : impl IntoIterator < Item = Option < & ' j str > > ,
265
+ path_array : impl ArrayAccessor < Item = P > ,
266
+ jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
267
+ ) -> C {
268
+ json_iter
269
+ . into_iter ( )
270
+ . enumerate ( )
271
+ . map ( |( i, opt_json) | {
272
+ if path_array. is_null ( i) {
273
+ None
274
+ } else {
275
+ let path = path_array. value ( i) . into ( ) ;
276
+ jiter_find ( opt_json, & [ path] ) . ok ( )
277
+ }
278
+ } )
279
+ . collect :: < C > ( )
280
+ }
281
+
250
282
let c = downcast_dictionary_array ! (
251
- json_array => {
252
- let values = scalar_apply( json_array. values( ) , path, to_array, jiter_find, false ) ?;
253
- return post_process_dict( json_array, values, return_dict) ;
254
- }
255
- DataType :: Utf8 => scalar_apply_iter( json_array. as_string:: <i32 >( ) . iter( ) , path, jiter_find) ,
256
- DataType :: LargeUtf8 => scalar_apply_iter( json_array. as_string:: <i64 >( ) . iter( ) , path, jiter_find) ,
257
- DataType :: Utf8View => scalar_apply_iter( json_array. as_string_view( ) . iter( ) , path, jiter_find) ,
258
- other => if let Some ( string_array) = nested_json_array( json_array, is_object_lookup( path) ) {
259
- scalar_apply_iter( string_array. iter( ) , path, jiter_find)
260
- } else {
261
- return exec_err!( "unexpected json array type {:?}" , other) ;
262
- }
283
+ path_array => match path_array. values( ) . data_type( ) {
284
+ DataType :: Utf8 => inner( json_array, path_array. downcast_dict:: <StringArray >( ) . unwrap( ) , jiter_find) ,
285
+ DataType :: LargeUtf8 => inner( json_array, path_array. downcast_dict:: <LargeStringArray >( ) . unwrap( ) , jiter_find) ,
286
+ DataType :: Utf8View => inner( json_array, path_array. downcast_dict:: <StringViewArray >( ) . unwrap( ) , jiter_find) ,
287
+ DataType :: Int64 => inner( json_array, path_array. downcast_dict:: <Int64Array >( ) . unwrap( ) , jiter_find) ,
288
+ DataType :: UInt64 => inner( json_array, path_array. downcast_dict:: <UInt64Array >( ) . unwrap( ) , jiter_find) ,
289
+ other => return exec_err!( "unexpected second argument type, expected string or int array, got {:?}" , other) ,
290
+ } ,
291
+ DataType :: Utf8 => inner( json_array, path_array. as_string:: <i32 >( ) , jiter_find) ,
292
+ DataType :: LargeUtf8 => inner( json_array, path_array. as_string:: <i64 >( ) , jiter_find) ,
293
+ DataType :: Utf8View => inner( json_array, path_array. as_string_view( ) , jiter_find) ,
294
+ DataType :: Int64 => inner( json_array, path_array. as_primitive:: <Int64Type >( ) , jiter_find) ,
295
+ DataType :: UInt64 => inner( json_array, path_array. as_primitive:: <UInt64Type >( ) , jiter_find) ,
296
+ other => return exec_err!( "unexpected second argument type, expected string or int array, got {:?}" , other)
263
297
) ;
298
+
264
299
to_array ( c)
265
300
}
266
301
302
+ fn extract_json_scalar ( scalar : & ScalarValue ) -> DataFusionResult < Option < & str > > {
303
+ match scalar {
304
+ ScalarValue :: Dictionary ( _, b) => extract_json_scalar ( b. as_ref ( ) ) ,
305
+ ScalarValue :: Utf8 ( s) | ScalarValue :: Utf8View ( s) | ScalarValue :: LargeUtf8 ( s) => Ok ( s. as_deref ( ) ) ,
306
+ ScalarValue :: Union ( type_id_value, union_fields, _) => {
307
+ Ok ( json_from_union_scalar ( type_id_value. as_ref ( ) , union_fields) )
308
+ }
309
+ _ => {
310
+ exec_err ! ( "unexpected first argument type, expected string or JSON union" )
311
+ }
312
+ }
313
+ }
314
+
267
315
/// Take a dictionary array of JSON data and an array of result values and combine them.
268
316
fn post_process_dict < T : ArrowDictionaryKeyType > (
269
317
dict_array : & DictionaryArray < T > ,
@@ -295,12 +343,12 @@ fn is_object_lookup(path: &[JsonPath]) -> bool {
295
343
}
296
344
}
297
345
298
- fn scalar_apply_iter < ' j , C : FromIterator < Option < I > > , I > (
299
- json_iter : impl Iterator < Item = Option < & ' j str > > ,
300
- path : & [ JsonPath ] ,
301
- jiter_find : impl Fn ( Option < & str > , & [ JsonPath ] ) -> Result < I , GetError > ,
302
- ) -> C {
303
- json_iter . map ( |opt_json| jiter_find ( opt_json , path ) . ok ( ) ) . collect :: < C > ( )
346
+ fn is_object_lookup_array ( data_type : & DataType ) -> bool {
347
+ match data_type {
348
+ DataType :: Dictionary ( _ , value_type ) => is_object_lookup_array ( value_type ) ,
349
+ DataType :: Utf8 | DataType :: LargeUtf8 | DataType :: Utf8View => true ,
350
+ _ => false ,
351
+ }
304
352
}
305
353
306
354
pub fn jiter_json_find < ' j > ( opt_json : Option < & ' j str > , path : & [ JsonPath ] ) -> Option < ( Jiter < ' j > , Peek ) > {
0 commit comments