Skip to content

Commit 1abea1a

Browse files
committed
Port fix: Fix JSPB binary utf8 decoding to be spec compliant.
Our prior behavior was undefined when confronted with errors: it would read out of bounds, accept overlong encodings, skip over out-of-range bytes, and compose out-of-range code points. The new implementation always detects and handles errors consistently, by either throwing or using replacement characters (� aka \uFFFD). This also adds support to the code generator for aligning with the proto3 spec, which requires that parsing fail for proto3 messages with invalid UTF-8 payloads in string fields. For now, actual failing is disabled via the goog.define jspb.binary.ENFORCE_UTF8, which is set to NEVER. A future change will flip this to DEFAULT. (Note: the binary/reader.js diff shown below defines jspb.binary.ENFORCE_UTF8 with the default 'ALWAYS' and accepts only 'ALWAYS' or 'DEPRECATED_PROTO3_ONLY'.)
1 parent 5aee743 commit 1abea1a

File tree

6 files changed

+517
-76
lines changed

6 files changed

+517
-76
lines changed

Diff for: binary/decoder.js

+36-65
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
goog.provide('jspb.BinaryDecoder');
4848

4949
goog.require('jspb.asserts');
50-
goog.require('goog.crypt');
50+
goog.require('jspb.binary.utf8');
5151
goog.require('jspb.utils');
5252

5353

@@ -256,7 +256,7 @@ jspb.BinaryDecoder.prototype.setCursor = function(cursor) {
256256
*/
257257
jspb.BinaryDecoder.prototype.advance = function(count) {
258258
this.cursor_ += count;
259-
jspb.asserts.assert(this.cursor_ <= this.end_);
259+
this.checkCursor();
260260
};
261261

262262

@@ -397,6 +397,17 @@ jspb.BinaryDecoder.prototype.readSplitFixed64 = function(convert) {
397397
return convert(lowBits, highBits);
398398
};
399399

/**
 * Asserts that our cursor is in bounds.
 *
 * A cursor equal to the end position is still in bounds; only a cursor that
 * has moved strictly past the end is reported, through jspb.asserts.fail,
 * with a message containing both positions.
 *
 * @private
 * @return {void}
 */
jspb.BinaryDecoder.prototype.checkCursor = function() {
  if (this.cursor_ > this.end_) {
    // Use the fully qualified namespace: this file only does
    // goog.require('jspb.asserts'), so there is no local `asserts` binding.
    jspb.asserts.fail(
        'Read past the end ' + this.cursor_ + ' > ' + this.end_);
  }
};
400411

401412
/**
402413
* Skips over a varint in the block without decoding it.
@@ -452,31 +463,31 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
452463
var x = (temp & 0x7F);
453464
if (temp < 128) {
454465
this.cursor_ += 1;
455-
jspb.asserts.assert(this.cursor_ <= this.end_);
466+
this.checkCursor();
456467
return x;
457468
}
458469

459470
temp = bytes[this.cursor_ + 1];
460471
x |= (temp & 0x7F) << 7;
461472
if (temp < 128) {
462473
this.cursor_ += 2;
463-
jspb.asserts.assert(this.cursor_ <= this.end_);
474+
this.checkCursor();
464475
return x;
465476
}
466477

467478
temp = bytes[this.cursor_ + 2];
468479
x |= (temp & 0x7F) << 14;
469480
if (temp < 128) {
470481
this.cursor_ += 3;
471-
jspb.asserts.assert(this.cursor_ <= this.end_);
482+
this.checkCursor();
472483
return x;
473484
}
474485

475486
temp = bytes[this.cursor_ + 3];
476487
x |= (temp & 0x7F) << 21;
477488
if (temp < 128) {
478489
this.cursor_ += 4;
479-
jspb.asserts.assert(this.cursor_ <= this.end_);
490+
this.checkCursor();
480491
return x;
481492
}
482493

@@ -486,7 +497,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
486497
// We're reading the high bits of an unsigned varint. The byte we just read
487498
// also contains bits 33 through 35, which we're going to discard.
488499
this.cursor_ += 5;
489-
jspb.asserts.assert(this.cursor_ <= this.end_);
500+
this.checkCursor();
490501
return x >>> 0;
491502
}
492503

@@ -500,7 +511,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
500511
jspb.asserts.assert(false);
501512
}
502513

503-
jspb.asserts.assert(this.cursor_ <= this.end_);
514+
this.checkCursor();
504515
return x;
505516
};
506517

@@ -679,7 +690,7 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() {
679690
jspb.BinaryDecoder.prototype.readUint8 = function() {
680691
var a = this.bytes_[this.cursor_ + 0];
681692
this.cursor_ += 1;
682-
jspb.asserts.assert(this.cursor_ <= this.end_);
693+
this.checkCursor();
683694
return a;
684695
};
685696

@@ -694,7 +705,7 @@ jspb.BinaryDecoder.prototype.readUint16 = function() {
694705
var a = this.bytes_[this.cursor_ + 0];
695706
var b = this.bytes_[this.cursor_ + 1];
696707
this.cursor_ += 2;
697-
jspb.asserts.assert(this.cursor_ <= this.end_);
708+
this.checkCursor();
698709
return (a << 0) | (b << 8);
699710
};
700711

@@ -711,7 +722,7 @@ jspb.BinaryDecoder.prototype.readUint32 = function() {
711722
var c = this.bytes_[this.cursor_ + 2];
712723
var d = this.bytes_[this.cursor_ + 3];
713724
this.cursor_ += 4;
714-
jspb.asserts.assert(this.cursor_ <= this.end_);
725+
this.checkCursor();
715726
return ((a << 0) | (b << 8) | (c << 16) | (d << 24)) >>> 0;
716727
};
717728

@@ -756,7 +767,7 @@ jspb.BinaryDecoder.prototype.readUint64String = function() {
756767
jspb.BinaryDecoder.prototype.readInt8 = function() {
757768
var a = this.bytes_[this.cursor_ + 0];
758769
this.cursor_ += 1;
759-
jspb.asserts.assert(this.cursor_ <= this.end_);
770+
this.checkCursor();
760771
return (a << 24) >> 24;
761772
};
762773

@@ -771,7 +782,7 @@ jspb.BinaryDecoder.prototype.readInt16 = function() {
771782
var a = this.bytes_[this.cursor_ + 0];
772783
var b = this.bytes_[this.cursor_ + 1];
773784
this.cursor_ += 2;
774-
jspb.asserts.assert(this.cursor_ <= this.end_);
785+
this.checkCursor();
775786
return (((a << 0) | (b << 8)) << 16) >> 16;
776787
};
777788

@@ -788,7 +799,7 @@ jspb.BinaryDecoder.prototype.readInt32 = function() {
788799
var c = this.bytes_[this.cursor_ + 2];
789800
var d = this.bytes_[this.cursor_ + 3];
790801
this.cursor_ += 4;
791-
jspb.asserts.assert(this.cursor_ <= this.end_);
802+
this.checkCursor();
792803
return (a << 0) | (b << 8) | (c << 16) | (d << 24);
793804
};
794805

@@ -858,7 +869,9 @@ jspb.BinaryDecoder.prototype.readDouble = function() {
858869
* @export
859870
*/
860871
jspb.BinaryDecoder.prototype.readBool = function() {
861-
return !!this.bytes_[this.cursor_++];
872+
const b = !!this.bytes_[this.cursor_++];
873+
this.checkCursor();
874+
return b;
862875
};
863876

864877

@@ -879,59 +892,17 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
879892
* Supports codepoints from U+0000 up to U+10FFFF.
880893
* (http://en.wikipedia.org/wiki/UTF-8).
881894
* @param {number} length The length of the string to read.
895+
* @param {boolean} requireUtf8 Whether to throw when invalid utf8 is found.
882896
* @return {string} The decoded string.
883897
* @export
884898
*/
885-
jspb.BinaryDecoder.prototype.readString = function(length) {
886-
var bytes = this.bytes_;
887-
var cursor = this.cursor_;
888-
var end = cursor + length;
889-
var codeUnits = [];
890-
891-
var result = '';
892-
while (cursor < end) {
893-
var c = bytes[cursor++];
894-
if (c < 128) { // Regular 7-bit ASCII.
895-
codeUnits.push(c);
896-
} else if (c < 192) {
897-
// UTF-8 continuation mark. We are out of sync. This
898-
// might happen if we attempted to read a character
899-
// with more than four bytes.
900-
continue;
901-
} else if (c < 224) { // UTF-8 with two bytes.
902-
var c2 = bytes[cursor++];
903-
codeUnits.push(((c & 31) << 6) | (c2 & 63));
904-
} else if (c < 240) { // UTF-8 with three bytes.
905-
var c2 = bytes[cursor++];
906-
var c3 = bytes[cursor++];
907-
codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
908-
} else if (c < 248) { // UTF-8 with 4 bytes.
909-
var c2 = bytes[cursor++];
910-
var c3 = bytes[cursor++];
911-
var c4 = bytes[cursor++];
912-
// Characters written on 4 bytes have 21 bits for a codepoint.
913-
// We can't fit that on 16bit characters, so we use surrogates.
914-
var codepoint =
915-
((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
916-
// Surrogates formula from wikipedia.
917-
// 1. Subtract 0x10000 from codepoint
918-
codepoint -= 0x10000;
919-
// 2. Split this into the high 10-bit value and the low 10-bit value
920-
// 3. Add 0xD800 to the high value to form the high surrogate
921-
// 4. Add 0xDC00 to the low value to form the low surrogate:
922-
var low = (codepoint & 1023) + 0xDC00;
923-
var high = ((codepoint >> 10) & 1023) + 0xD800;
924-
codeUnits.push(high, low);
925-
}
926899

927-
// Avoid exceeding the maximum stack size when calling `apply`.
928-
if (codeUnits.length >= 8192) {
929-
result += String.fromCharCode.apply(null, codeUnits);
930-
codeUnits.length = 0;
931-
}
932-
}
933-
result += goog.crypt.byteArrayToString(codeUnits);
934-
this.cursor_ = cursor;
900+
jspb.BinaryDecoder.prototype.readString = function (length, requireUtf8) {
901+
const cursor = this.cursor_;
902+
this.cursor_ += length;
903+
this.checkCursor();
904+
const result =
905+
jspb.binary.utf8.decodeUtf8(jspb.asserts.assert(this.bytes_), cursor, length, requireUtf8);
935906
return result;
936907
};
937908

@@ -966,7 +937,7 @@ jspb.BinaryDecoder.prototype.readBytes = function(length) {
966937
var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length);
967938

968939
this.cursor_ += length;
969-
jspb.asserts.assert(this.cursor_ <= this.end_);
940+
this.checkCursor();
970941
return result;
971942
};
972943

Diff for: binary/decoder_test.js

+5-5
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ describe('binaryDecoderTest', () => {
354354

355355
const decoder = jspb.BinaryDecoder.alloc(encoder.end());
356356

357-
expect(decoder.readString(len)).toEqual(long_string);
357+
expect(decoder.readString(len, true)).toEqual(long_string);
358358
});
359359

360360
/**
@@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => {
375375

376376
const decoder = jspb.BinaryDecoder.alloc(encoder.end());
377377

378-
expect(decoder.readString(ascii.length)).toEqual(ascii);
379-
expect(utf8_two_bytes).toEqual(decoder.readString(utf8_two_bytes.length));
378+
expect(decoder.readString(ascii.length, true)).toEqual(ascii);
379+
expect(utf8_two_bytes).toEqual(decoder.readString(2, true));
380380
expect(utf8_three_bytes)
381-
.toEqual(decoder.readString(utf8_three_bytes.length));
382-
expect(utf8_four_bytes).toEqual(decoder.readString(utf8_four_bytes.length));
381+
.toEqual(decoder.readString(3, true));
382+
expect(utf8_four_bytes).toEqual(decoder.readString(4, true));
383383
});
384384

385385
/**

Diff for: binary/reader.js

+40-1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,26 @@ goog.require('jspb.BinaryConstants');
5252
goog.require('jspb.BinaryDecoder');
5353
goog.require('jspb.utils');
5454

/**
 * Policy flag controlling utf8 validation of string fields.
 *
 * <p>Currently set to `ALWAYS`. It can be set to `DEPRECATED_PROTO3_ONLY` to
 * only enforce utf8 for proto3 string fields; for proto2 string fields,
 * replacement characters are then used when encoding errors are found.
 *
 * <p>TODO: Remove the flag, simplify BinaryReader to remove
 * readStringRequireUtf8 and related support in the code generator et al.
 *
 * @define {string}
 */
const ENFORCE_UTF8 = goog.define('jspb.binary.ENFORCE_UTF8', 'ALWAYS');

// Reject any configured value other than the two supported modes.
jspb.asserts.assert(
    ENFORCE_UTF8 === 'ALWAYS' || ENFORCE_UTF8 === 'DEPRECATED_PROTO3_ONLY');

// True only in `ALWAYS` mode.
const /** boolean */ UTF8_PARSING_ERRORS_ARE_FATAL = ENFORCE_UTF8 === 'ALWAYS';
5575

5676

5777
/**
@@ -996,10 +1016,29 @@ jspb.BinaryReader.prototype.readEnum = function() {
9961016
* @export
9971017
*/
9981018
jspb.BinaryReader.prototype.readString = function() {
1019+
// delegate to the other reader so that inlining can eliminate this method
1020+
// in the common case.
1021+
if (UTF8_PARSING_ERRORS_ARE_FATAL) {
1022+
return this.readStringRequireUtf8();
1023+
}
1024+
9991025
jspb.asserts.assert(
10001026
this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
10011027
var length = this.decoder_.readUnsignedVarint32();
1002-
return this.decoder_.readString(length);
1028+
return this.decoder_.readString(length, /*requireUtf8=*/ false);
1029+
};
1030+
/**
 * Reads a string field from the binary stream, or throws an error if the next
 * field in the stream is not of the correct wire type, or if the string is
 * not valid utf8.
 *
 * Reads the varint length prefix from the decoder, then decodes that many
 * bytes with requireUtf8 set to true.
 *
 * @return {string} The value of the string field.
 * @export
 */
jspb.BinaryReader.prototype.readStringRequireUtf8 = function() {
  jspb.asserts.assert(
      this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
  const length = this.decoder_.readUnsignedVarint32();
  return this.decoder_.readString(length, /*requireUtf8=*/ true);
};
10041043

10051044

0 commit comments

Comments
 (0)