@@ -84,6 +84,7 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
84
84
/** Schema currently associated with this writer, or `null` when none is set. */
protected _schema: Schema | null = null;
/** One `FileBlock` entry per dictionary batch emitted so far. */
protected _dictionaryBlocks: Array<FileBlock> = [];
/** `FileBlock` entries for record batches (presumably one per batch written; confirm against `_writeRecordBatch`). */
protected _recordBatchBlocks: Array<FileBlock> = [];
/** Dictionary id -> the dictionary Vector most recently handled by `_writeDictionaries` for that id. */
protected _seenDictionaries: Map<number, Vector> = new Map();
/** Dictionary id -> number of dictionary chunks already accounted for by `_writeDictionaries`. */
protected _dictionaryDeltaOffsets: Map<number, number> = new Map();
88
89
89
90
public toString ( sync : true ) : string ;
@@ -144,6 +145,7 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
144
145
this . _started = false ;
145
146
this . _dictionaryBlocks = [ ] ;
146
147
this . _recordBatchBlocks = [ ] ;
148
+ this . _seenDictionaries = new Map ( ) ;
147
149
this . _dictionaryDeltaOffsets = new Map ( ) ;
148
150
149
151
if ( ! schema || ! ( compareSchemas ( schema , this . _schema ) ) ) {
@@ -259,7 +261,6 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
259
261
}
260
262
261
263
protected _writeDictionaryBatch ( dictionary : Data , id : number , isDelta = false ) {
262
- this . _dictionaryDeltaOffsets . set ( id , dictionary . length + ( this . _dictionaryDeltaOffsets . get ( id ) || 0 ) ) ;
263
264
const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler . assemble ( new Vector ( [ dictionary ] ) ) ;
264
265
const recordBatch = new metadata . RecordBatch ( dictionary . length , nodes , bufferRegions ) ;
265
266
const dictionaryBatch = new metadata . DictionaryBatch ( recordBatch , id , isDelta ) ;
@@ -284,14 +285,21 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
284
285
}
285
286
286
287
/**
 * Emit the dictionary messages that `batch` requires, one dictionary id at a time.
 *
 * For every `[id, dictionary]` entry of `batch.dictionaries`:
 *  - If no dictionary has been recorded for `id`, or the first chunk of the
 *    current dictionary is not the first chunk of the recorded one, every chunk
 *    is written: the first as a non-delta (initial or replacement) message and
 *    the remaining chunks as deltas.
 *  - Otherwise only the chunks beyond the recorded chunk count are written,
 *    all as deltas.
 *
 * Afterwards the dictionary and its chunk count are recorded for the next call.
 *
 * @param batch The record batch whose dictionaries should be written.
 * @returns This writer, to allow chaining.
 */
protected _writeDictionaries(batch: RecordBatch<T>) {
    for (const [id, dictionary] of batch.dictionaries) {
        // A missing dictionary contributes no messages. Leave the bookkeeping
        // untouched: recording it would make `_seenDictionaries.has(id)` true
        // even though nothing was written, so subclasses that reject
        // replacements would refuse the real initial dictionary later on, and
        // resetting the offset would force a redundant rewrite of a dictionary
        // that had already been emitted.
        if (!dictionary) {
            continue;
        }
        const chunks = dictionary.data ?? [];
        const prevDictionary = this._seenDictionaries.get(id);
        const offset = this._dictionaryDeltaOffsets.get(id) ?? 0;
        // * If no previous dictionary was written, write an initial DictionaryMessage.
        // * If the current dictionary does not share chunks with the previous dictionary, write a replacement DictionaryMessage.
        if (!prevDictionary || prevDictionary.data[0] !== chunks[0]) {
            // * If `index > 0`, then `isDelta` is true.
            // * If `index = 0`, then `isDelta` is false, because this is either the initial or a replacement DictionaryMessage.
            for (const [index, chunk] of chunks.entries()) {
                this._writeDictionaryBatch(chunk, id, index > 0);
            }
        } else if (offset < chunks.length) {
            // Same dictionary as before, but it has grown: only the new chunks go out, as deltas.
            for (const chunk of chunks.slice(offset)) {
                this._writeDictionaryBatch(chunk, id, true);
            }
        }
        this._seenDictionaries.set(id, dictionary);
        this._dictionaryDeltaOffsets.set(id, chunks.length);
    }
    return this;
}
@@ -342,6 +350,13 @@ export class RecordBatchFileWriter<T extends TypeMap = any> extends RecordBatchW
342
350
return this . _writeMagic ( ) . _writePadding ( 2 ) ;
343
351
}
344
352
353
/**
 * Write one dictionary batch, rejecting replacement dictionaries.
 *
 * A non-delta batch for an id that is already present in `_seenDictionaries`
 * is treated as a replacement and refused; everything else is delegated to
 * the parent implementation.
 *
 * @param dictionary The dictionary chunk to write.
 * @param id The dictionary id the chunk belongs to.
 * @param isDelta Whether the chunk extends a previously written dictionary.
 * @throws Error if a non-delta batch is requested for an already-seen id.
 */
protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) {
    const isReplacement = !isDelta && this._seenDictionaries.has(id);
    if (isReplacement) {
        throw new Error('The Arrow File format does not support replacement dictionaries. ');
    }
    return super._writeDictionaryBatch(dictionary, id, isDelta);
}
359
+
345
360
protected _writeFooter ( schema : Schema < T > ) {
346
361
const buffer = Footer . encode ( new Footer (
347
362
schema , MetadataVersion . V5 ,
@@ -369,13 +384,13 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
369
384
}
370
385
371
386
/** Record batches held by this writer; emptied again in `close()`. */
private _recordBatches: Array<RecordBatch>;
/** Batches that carry at least one dictionary; their dictionaries are written out in `close()`. */
private _recordBatchesWithDictionaries: Array<RecordBatch>;
373
388
374
389
/**
 * Create a JSON writer with auto-destroy enabled and both batch queues empty.
 */
constructor() {
    super();
    this._autoDestroy = true;
    // Both queues start out empty.
    this._recordBatchesWithDictionaries = [];
    this._recordBatches = [];
}
380
395
381
396
/** Does no work in the JSON writer; returns this writer for chaining. */
protected _writeMessage() {
    return this;
}
@@ -386,12 +401,11 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
386
401
}
387
402
/**
 * Remember `batch` for later instead of writing its dictionaries right away.
 *
 * Only batches that carry at least one dictionary are queued; `close()`
 * hands the queued batches to the parent `_writeDictionaries`.
 *
 * @param batch The record batch whose dictionaries may need to be written.
 * @returns This writer, to allow chaining.
 */
protected _writeDictionaries(batch: RecordBatch<T>) {
    const hasDictionaries = batch.dictionaries.size > 0;
    if (hasDictionaries) {
        this._recordBatchesWithDictionaries.push(batch);
    }
    return this;
}
393
408
protected _writeDictionaryBatch ( dictionary : Data , id : number , isDelta = false ) {
394
- this . _dictionaryDeltaOffsets . set ( id , dictionary . length + ( this . _dictionaryDeltaOffsets . get ( id ) || 0 ) ) ;
395
409
this . _write ( this . _dictionaryBlocks . length === 0 ? ` ` : `,\n ` ) ;
396
410
this . _write ( dictionaryBatchToJSON ( dictionary , id , isDelta ) ) ;
397
411
this . _dictionaryBlocks . push ( new FileBlock ( 0 , 0 , 0 ) ) ;
@@ -403,9 +417,9 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
403
417
return this ;
404
418
}
405
419
public close ( ) {
406
- if ( this . _dictionaries . length > 0 ) {
420
+ if ( this . _recordBatchesWithDictionaries . length > 0 ) {
407
421
this . _write ( `,\n "dictionaries": [\n` ) ;
408
- for ( const batch of this . _dictionaries ) {
422
+ for ( const batch of this . _recordBatchesWithDictionaries ) {
409
423
super . _writeDictionaries ( batch ) ;
410
424
}
411
425
this . _write ( `\n ]` ) ;
@@ -424,7 +438,7 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
424
438
this . _write ( `\n}` ) ;
425
439
}
426
440
427
- this . _dictionaries = [ ] ;
441
+ this . _recordBatchesWithDictionaries = [ ] ;
428
442
this . _recordBatches = [ ] ;
429
443
430
444
return super . close ( ) ;
0 commit comments