
Commit fd11b7a

GH-23576: [JS] Support DictionaryMessage replacement (apache#41965)

This PR makes the stream reader/writer support replacement dictionary messages.

Closes apache#23576
Closes apache#41683

* GitHub Issue: apache#23576
1 parent 6597467 commit fd11b7a
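
As a rough illustration of what this enables (a hedged sketch, not code from this PR): two batches for the same dictionary-encoded field whose dictionaries share no chunks can now round-trip through the stream writer and reader, with the second dictionary sent as a replacement DictionaryMessage instead of being ignored. The field name, the values, and the use of `vectorFromArray` with an explicit `Dictionary` type are illustrative assumptions, not part of this diff.

import {
    Dictionary, Int32, RecordBatchReader, RecordBatchStreamWriter, Table, Utf8, vectorFromArray
} from 'apache-arrow';

// One dictionary id (0) reused by both batches, but with unrelated dictionary values.
const type = new Dictionary<Utf8, Int32>(new Utf8(), new Int32(), 0);

// Assumed: vectorFromArray builds a dictionary-encoded vector when given a Dictionary type.
const first = new Table({ city: vectorFromArray(['osaka', 'kyoto', 'osaka'], type) });
const second = new Table({ city: vectorFromArray(['lima', 'cusco', 'lima'], type) }); // not a superset of the first dictionary

const writer = RecordBatchStreamWriter.writeAll((function* () {
    yield* first.batches;   // schema message + initial DictionaryMessage + record batch
    yield* second.batches;  // replacement DictionaryMessage (isDelta = false) + record batch
})());

// The stream reader now applies the replacement, so the second batch decodes with its own dictionary.
const roundTrip = new Table(RecordBatchReader.from<{ city: Dictionary<Utf8, Int32> }>(await writer.toUint8Array()));
console.log(roundTrip.getChild('city')!.toArray());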

7 files changed (+149, -46)


js/gulp/argv.js (+1)

@@ -22,6 +22,7 @@ export const argv = args([
     { name: `target`, type: String, defaultValue: `` },
     { name: `module`, type: String, defaultValue: `` },
     { name: `coverage`, type: Boolean, defaultValue: false },
+    { name: `tests`, type: String, multiple: true, defaultValue: [`test/unit/`] },
     { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] },
     { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] },
 ], { partial: true });

js/gulp/test-task.js (+3, -14)

@@ -36,19 +36,6 @@ import { createRequire } from 'node:module';
 const require = createRequire(import.meta.url);

 const jestArgv = [];
-const testFiles = [
-    `test/unit/`,
-    // `test/unit/bit-tests.ts`,
-    // `test/unit/int-tests.ts`,
-    // `test/unit/bn-tests.ts`,
-    // `test/unit/math-tests.ts`,
-    // `test/unit/table-tests.ts`,
-    // `test/unit/generated-data-tests.ts`,
-    // `test/unit/builders/`,
-    // `test/unit/recordbatch/`,
-    // `test/unit/table/`,
-    // `test/unit/ipc/`,
-];

 if (argv.verbose) {
     jestArgv.push(`--verbose`);
@@ -80,8 +67,10 @@ export const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, fu
         args.push(`-c`, `jestconfigs/jest.coverage.config.js`);
     } else {
         const cfgname = [target, format].filter(Boolean).join('.');
-        args.push(`-c`, `jestconfigs/jest.${cfgname}.config.js`, ...testFiles);
+        args.push(`-c`, `jestconfigs/jest.${cfgname}.config.js`);
     }
+    args.push(...(argv._unknown || []).filter((x) => x !== 'test'));
+    args.push(...argv.tests);
     opts.env = {
         ...opts.env,
         TEST_TARGET: target,
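
With the new `tests` option, a test run can presumably be scoped from the command line, e.g. `gulp test -t apache-arrow --tests test/unit/ipc/` (a hypothetical invocation; the actual entry point is unchanged by this diff). The task then forwards those paths, plus any unrecognized positional arguments, straight to jest via `args.push(...argv.tests)` instead of the old hard-coded `testFiles` list.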

js/src/ipc/reader.ts (+5, -8)

@@ -363,14 +363,11 @@ abstract class RecordBatchReaderImpl<T extends TypeMap = any> implements RecordB
         const { id, isDelta } = header;
         const { dictionaries, schema } = this;
         const dictionary = dictionaries.get(id);
-        if (isDelta || !dictionary) {
-            const type = schema.dictionaries.get(id)!;
-            const data = this._loadVectors(header.data, body, [type]);
-            return (dictionary && isDelta ? dictionary.concat(
-                new Vector(data)) :
-                new Vector(data)).memoize() as Vector;
-        }
-        return dictionary.memoize();
+        const type = schema.dictionaries.get(id)!;
+        const data = this._loadVectors(header.data, body, [type]);
+        return (dictionary && isDelta ? dictionary.concat(
+            new Vector(data)) :
+            new Vector(data)).memoize() as Vector;
     }
     protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) {
         return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types);
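
The behavioral change in the reader: previously, a non-delta DictionaryBatch for an id that had already been seen short-circuited to the cached dictionary (`return dictionary.memoize()`), so a replacement message was effectively ignored. Now every non-delta message rebuilds the dictionary from the message body and replaces the cached entry for that id, while delta messages still concatenate onto the previously read dictionary.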

js/src/ipc/writer.ts (+29, -15)

@@ -84,6 +84,7 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
     protected _schema: Schema | null = null;
     protected _dictionaryBlocks: FileBlock[] = [];
     protected _recordBatchBlocks: FileBlock[] = [];
+    protected _seenDictionaries = new Map<number, Vector>();
     protected _dictionaryDeltaOffsets = new Map<number, number>();

     public toString(sync: true): string;
@@ -144,6 +145,7 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
         this._started = false;
         this._dictionaryBlocks = [];
         this._recordBatchBlocks = [];
+        this._seenDictionaries = new Map();
         this._dictionaryDeltaOffsets = new Map();

         if (!schema || !(compareSchemas(schema, this._schema))) {
@@ -259,7 +261,6 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
     }

     protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) {
-        this._dictionaryDeltaOffsets.set(id, dictionary.length + (this._dictionaryDeltaOffsets.get(id) || 0));
         const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(new Vector([dictionary]));
         const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions);
         const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta);
@@ -284,14 +285,21 @@ export class RecordBatchWriter<T extends TypeMap = any> extends ReadableInterop<
     }

     protected _writeDictionaries(batch: RecordBatch<T>) {
-        for (let [id, dictionary] of batch.dictionaries) {
-            let offset = this._dictionaryDeltaOffsets.get(id) || 0;
-            if (offset === 0 || (dictionary = dictionary?.slice(offset)).length > 0) {
-                for (const data of dictionary.data) {
-                    this._writeDictionaryBatch(data, id, offset > 0);
-                    offset += data.length;
-                }
+        for (const [id, dictionary] of batch.dictionaries) {
+            const chunks = dictionary?.data ?? [];
+            const prevDictionary = this._seenDictionaries.get(id);
+            const offset = this._dictionaryDeltaOffsets.get(id) ?? 0;
+            // * If no previous dictionary was written, write an initial DictionaryMessage.
+            // * If the current dictionary does not share chunks with the previous dictionary, write a replacement DictionaryMessage.
+            if (!prevDictionary || prevDictionary.data[0] !== chunks[0]) {
+                // * If `index > 0`, then `isDelta` is true.
+                // * If `index = 0`, then `isDelta` is false, because this is either the initial or a replacement DictionaryMessage.
+                for (const [index, chunk] of chunks.entries()) this._writeDictionaryBatch(chunk, id, index > 0);
+            } else if (offset < chunks.length) {
+                for (const chunk of chunks.slice(offset)) this._writeDictionaryBatch(chunk, id, true);
             }
+            this._seenDictionaries.set(id, dictionary);
+            this._dictionaryDeltaOffsets.set(id, chunks.length);
         }
         return this;
     }
@@ -342,6 +350,13 @@ export class RecordBatchFileWriter<T extends TypeMap = any> extends RecordBatchW
         return this._writeMagic()._writePadding(2);
     }

+    protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) {
+        if (!isDelta && this._seenDictionaries.has(id)) {
+            throw new Error('The Arrow File format does not support replacement dictionaries. ');
+        }
+        return super._writeDictionaryBatch(dictionary, id, isDelta);
+    }
+
     protected _writeFooter(schema: Schema<T>) {
         const buffer = Footer.encode(new Footer(
             schema, MetadataVersion.V5,
@@ -369,13 +384,13 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
     }

     private _recordBatches: RecordBatch[];
-    private _dictionaries: RecordBatch[];
+    private _recordBatchesWithDictionaries: RecordBatch[];

     constructor() {
         super();
         this._autoDestroy = true;
         this._recordBatches = [];
-        this._dictionaries = [];
+        this._recordBatchesWithDictionaries = [];
     }

     protected _writeMessage() { return this; }
@@ -386,12 +401,11 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
     }
     protected _writeDictionaries(batch: RecordBatch<T>) {
         if (batch.dictionaries.size > 0) {
-            this._dictionaries.push(batch);
+            this._recordBatchesWithDictionaries.push(batch);
         }
         return this;
     }
     protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) {
-        this._dictionaryDeltaOffsets.set(id, dictionary.length + (this._dictionaryDeltaOffsets.get(id) || 0));
         this._write(this._dictionaryBlocks.length === 0 ? ` ` : `,\n `);
         this._write(dictionaryBatchToJSON(dictionary, id, isDelta));
         this._dictionaryBlocks.push(new FileBlock(0, 0, 0));
@@ -403,9 +417,9 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
         return this;
     }
     public close() {
-        if (this._dictionaries.length > 0) {
+        if (this._recordBatchesWithDictionaries.length > 0) {
             this._write(`,\n "dictionaries": [\n`);
-            for (const batch of this._dictionaries) {
+            for (const batch of this._recordBatchesWithDictionaries) {
                 super._writeDictionaries(batch);
             }
             this._write(`\n ]`);
@@ -424,7 +438,7 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
             this._write(`\n}`);
         }

-        this._dictionaries = [];
+        this._recordBatchesWithDictionaries = [];
         this._recordBatches = [];

         return super.close();
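
In short, the rewritten `_writeDictionaries` distinguishes three cases per dictionary id: if no dictionary has been written yet, or the incoming dictionary no longer shares its first data chunk with the one previously written, all of its chunks are written, the first as an initial or replacement message (`isDelta = false`) and the rest as deltas; if it still shares those chunks but has grown past the recorded offset, only the new chunks are written as deltas; otherwise nothing is written. The `RecordBatchFileWriter` override then rejects the replacement case outright, since the Arrow file format does not support replacement dictionaries.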

js/test/unit/ipc/message-reader-tests.ts (+10, -5)

@@ -24,11 +24,16 @@ for (const table of generateRandomTables([10, 20, 30])) {

     const io = ArrowIOTestHelper.stream(table);
     const name = `[\n ${table.schema.fields.join(',\n ')}\n]`;
-    const numMessages = table.batches.reduce((numMessages, batch) => {
-        return numMessages +
-            /* recordBatch message */ 1 +
-            /* dictionary messages */ batch.dictionaries.size;
-    }, /* schema message */ 1);
+
+    const numDictionaries = table.batches.reduce((dictionaries, batch) => {
+        return [...batch.dictionaries.values()]
+            .flatMap((dictionary) => dictionary.data)
+            .reduce((dictionaries, data) => dictionaries.add(data), dictionaries);
+    }, new Set()).size;
+
+    const numMessages = /* schema message */ 1 +
+        /* recordBatch messages */ table.batches.length +
+        /* dictionary messages */ numDictionaries;

     const validate = validateMessageReader.bind(0, numMessages);
     const validateAsync = validateAsyncMessageReader.bind(0, numMessages);
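
As an illustrative count: a table with three record batches sharing one dictionary-encoded column whose dictionary data consists of two chunks (one initial, one delta) now produces 1 schema message + 3 record-batch messages + 2 dictionary messages = 6 messages. The old per-batch `batch.dictionaries.size` formula no longer matches, because dictionary messages are now tied to unique dictionary data chunks rather than to batches.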

js/test/unit/ipc/writer/file-writer-tests.ts (+61, -1)

@@ -18,9 +18,20 @@
 import {
     generateDictionaryTables, generateRandomTables
 } from '../../../data/tables.js';
+import * as generate from '../../../generate-test-data.js';
 import { validateRecordBatchIterator } from '../validate.js';

-import { RecordBatchFileWriter, RecordBatchReader, Table } from 'apache-arrow';
+import {
+    builderThroughIterable,
+    Dictionary,
+    Int32,
+    RecordBatch,
+    RecordBatchFileWriter,
+    RecordBatchReader,
+    Table,
+    Uint32,
+    Vector
+} from 'apache-arrow';

 describe('RecordBatchFileWriter', () => {
     for (const table of generateRandomTables([10, 20, 30])) {
@@ -29,6 +40,55 @@ describe('RecordBatchFileWriter', () => {
     for (const table of generateDictionaryTables([10, 20, 30])) {
         testFileWriter(table, `${table.schema.fields[0]}`);
     }
+
+    it('should throw if attempting to write replacement dictionary batches', async () => {
+        const type = new Dictionary<Uint32, Int32>(new Uint32, new Int32, 0);
+        const writer = new RecordBatchFileWriter();
+        writer.write(new RecordBatch({
+            // Clone the data with the original Dictionary type so the cloned chunk has id 0
+            dictionary_encoded_uint32: generate.dictionary(50, 20, new Uint32, new Int32).vector.data[0].clone(type)
+        }));
+        expect(() => {
+            writer.write(new RecordBatch({
+                // Clone the data with the original Dictionary type so the cloned chunk has id 0
+                dictionary_encoded_uint32: generate.dictionary(50, 20, new Uint32, new Int32).vector.data[0].clone(type)
+            }));
+        }).toThrow();
+    });
+
+    it('should write delta dictionary batches', async () => {
+
+        const name = 'dictionary_encoded_uint32';
+        const resultChunks: Vector<Dictionary<Uint32, Int32>>[] = [];
+        const {
+            vector: sourceVector, values: sourceValues,
+        } = generate.dictionary(1000, 20, new Uint32(), new Int32());
+
+        const writer = RecordBatchFileWriter.writeAll((function* () {
+            const transform = builderThroughIterable({
+                type: sourceVector.type, nullValues: [null],
+                queueingStrategy: 'count', highWaterMark: 50,
+            });
+            for (const chunk of transform(sourceValues())) {
+                resultChunks.push(chunk);
+                yield new RecordBatch({ [name]: chunk.data[0] });
+            }
+        })());
+
+        expect(new Vector(resultChunks)).toEqualVector(sourceVector);
+
+        type T = { [name]: Dictionary<Uint32, Int32> };
+        const sourceTable = new Table({ [name]: sourceVector });
+        const resultTable = new Table(RecordBatchReader.from<T>(await writer.toUint8Array()));
+
+        const child = resultTable.getChild(name)!;
+        const dicts = child.data.map(({ dictionary }) => dictionary!);
+        const dictionary = dicts[child.data.length - 1];
+
+        expect(resultTable).toEqualTable(sourceTable);
+        expect(dictionary).toBeInstanceOf(Vector);
+        expect(dictionary.data).toHaveLength(20);
+    });
 });

 function testFileWriter(table: Table, name: string) {

js/test/unit/ipc/writer/stream-writer-tests.ts (+40, -3)

@@ -25,6 +25,7 @@ import { validateRecordBatchIterator } from '../validate.js';
 import type { RecordBatchStreamWriterOptions } from 'apache-arrow/ipc/writer';
 import {
     builderThroughIterable,
+    Data,
     Dictionary,
     Field,
     Int32,
@@ -81,10 +82,46 @@ describe('RecordBatchStreamWriter', () => {
         await validate;
     });

+    it('should write replacement dictionary batches', async () => {
+
+        const name = 'dictionary_encoded_uint32';
+        const type = new Dictionary<Uint32, Int32>(new Uint32, new Int32, 0);
+        const sourceChunks: Data<Dictionary<Uint32, Int32>>[] = [];
+        const resultChunks: Data<Dictionary<Uint32, Int32>>[] = [];
+
+        const writer = RecordBatchStreamWriter.writeAll((function* () {
+            for (let i = 0; i < 1000; i += 50) {
+                const { vector: { data: [chunk] } } = generate.dictionary(50, 20, type.dictionary, type.indices);
+                sourceChunks.push(chunk);
+                // Clone the data with the original Dictionary type so the cloned chunk has id 0
+                resultChunks.push(chunk.clone(type));
+                yield new RecordBatch({ [name]: resultChunks.at(-1)! });
+            }
+        })());
+
+        expect(new Vector(resultChunks)).toEqualVector(new Vector(sourceChunks));
+
+        type T = { [name]: Dictionary<Uint32, Int32> };
+        const sourceTable = new Table({ [name]: new Vector(sourceChunks) });
+        const resultTable = new Table(RecordBatchReader.from<T>(await writer.toUint8Array()));
+
+        // 1000 / 50 = 20
+        expect(sourceTable.batches).toHaveLength(20);
+        expect(resultTable.batches).toHaveLength(20);
+        expect(resultTable).toEqualTable(sourceTable);
+
+        for (const batch of resultTable.batches) {
+            for (const [_, dictionary] of batch.dictionaries) {
+                expect(dictionary).toBeInstanceOf(Vector);
+                expect(dictionary.data).toHaveLength(1);
+            }
+        }
+    });
+
     it('should write delta dictionary batches', async () => {

         const name = 'dictionary_encoded_uint32';
-        const chunks: Vector<Dictionary<Uint32, Int32>>[] = [];
+        const resultChunks: Vector<Dictionary<Uint32, Int32>>[] = [];
         const {
             vector: sourceVector, values: sourceValues,
         } = generate.dictionary(1000, 20, new Uint32(), new Int32());
@@ -95,12 +132,12 @@ describe('RecordBatchStreamWriter', () => {
                 queueingStrategy: 'count', highWaterMark: 50,
             });
             for (const chunk of transform(sourceValues())) {
-                chunks.push(chunk);
+                resultChunks.push(chunk);
                 yield new RecordBatch({ [name]: chunk.data[0] });
             }
         })());

-        expect(new Vector(chunks)).toEqualVector(sourceVector);
+        expect(new Vector(resultChunks)).toEqualVector(sourceVector);

         type T = { [name]: Dictionary<Uint32, Int32> };
         const sourceTable = new Table({ [name]: sourceVector });
