Skip to content

Commit f65b0bc

Browse files
authored
src: implement Windows-1252 encoding support and update related tests
PR-URL: #60893
Fixes: #60888
Fixes: #59515
Fixes: #56542
Reviewed-By: Matteo Collina <[email protected]>
Reviewed-By: Rafael Gonzaga <[email protected]>
1 parent 025ade6 commit f65b0bc

File tree

6 files changed, with 121 additions and 44 deletions.

lib/internal/encoding.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ const kEncoding = Symbol('encoding');
2828
const kDecoder = Symbol('decoder');
2929
const kFatal = Symbol('kFatal');
3030
const kUTF8FastPath = Symbol('kUTF8FastPath');
31-
const kLatin1FastPath = Symbol('kLatin1FastPath');
31+
const kWindows1252FastPath = Symbol('kWindows1252FastPath');
3232
const kIgnoreBOM = Symbol('kIgnoreBOM');
3333

3434
const {
@@ -55,7 +55,7 @@ const {
5555
encodeIntoResults,
5656
encodeUtf8String,
5757
decodeUTF8,
58-
decodeLatin1,
58+
decodeWindows1252,
5959
} = binding;
6060

6161
const { Buffer } = require('buffer');
@@ -420,10 +420,10 @@ function makeTextDecoderICU() {
420420
this[kFatal] = Boolean(options?.fatal);
421421
// Only support fast paths for UTF-8 and windows-1252.
422422
this[kUTF8FastPath] = enc === 'utf-8';
423-
this[kLatin1FastPath] = enc === 'windows-1252';
423+
this[kWindows1252FastPath] = enc === 'windows-1252';
424424
this[kHandle] = undefined;
425425

426-
if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
426+
if (!this[kUTF8FastPath] && !this[kWindows1252FastPath]) {
427427
this.#prepareConverter();
428428
}
429429
}
@@ -440,14 +440,14 @@ function makeTextDecoderICU() {
440440
validateDecoder(this);
441441

442442
this[kUTF8FastPath] &&= !(options?.stream);
443-
this[kLatin1FastPath] &&= !(options?.stream);
443+
this[kWindows1252FastPath] &&= !(options?.stream);
444444

445445
if (this[kUTF8FastPath]) {
446446
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
447447
}
448448

449-
if (this[kLatin1FastPath]) {
450-
return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]);
449+
if (this[kWindows1252FastPath]) {
450+
return decodeWindows1252(input, this[kIgnoreBOM], this[kFatal]);
451451
}
452452

453453
this.#prepareConverter();

src/encoding_binding.cc

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,8 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
414414
SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
415415
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
416416
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
417-
SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
417+
SetMethodNoSideEffect(
418+
isolate, target, "decodeWindows1252", DecodeWindows1252);
418419
}
419420

420421
void BindingData::CreatePerContextProperties(Local<Object> target,
@@ -432,10 +433,10 @@ void BindingData::RegisterTimerExternalReferences(
432433
registry->Register(DecodeUTF8);
433434
registry->Register(ToASCII);
434435
registry->Register(ToUnicode);
435-
registry->Register(DecodeLatin1);
436+
registry->Register(DecodeWindows1252);
436437
}
437438

438-
void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
439+
void BindingData::DecodeWindows1252(const FunctionCallbackInfo<Value>& args) {
439440
Environment* env = Environment::GetCurrent(args);
440441

441442
CHECK_GE(args.Length(), 1);
@@ -448,7 +449,6 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
448449
}
449450

450451
bool ignore_bom = args[1]->IsTrue();
451-
bool has_fatal = args[2]->IsTrue();
452452

453453
ArrayBufferViewContents<uint8_t> buffer(args[0]);
454454
const uint8_t* data = buffer.data();
@@ -463,20 +463,45 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
463463
return args.GetReturnValue().SetEmptyString();
464464
}
465465

466-
std::string result(length * 2, '\0');
467-
468-
size_t written = simdutf::convert_latin1_to_utf8(
469-
reinterpret_cast<const char*>(data), length, result.data());
466+
// Windows-1252 specific mapping for bytes 128-159
467+
// These differ from Latin-1/ISO-8859-1
468+
static const uint16_t windows1252_mapping[32] = {
469+
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
470+
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
471+
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
472+
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
473+
};
474+
475+
std::string result;
476+
result.reserve(length * 3); // Reserve space for UTF-8 output
477+
478+
for (size_t i = 0; i < length; i++) {
479+
uint8_t byte = data[i];
480+
uint32_t codepoint;
481+
482+
// Check if byte is in the special Windows-1252 range (128-159)
483+
if (byte >= 0x80 && byte <= 0x9F) {
484+
codepoint = windows1252_mapping[byte - 0x80];
485+
} else {
486+
// For all other bytes, Windows-1252 is identical to Latin-1
487+
codepoint = byte;
488+
}
470489

471-
if (has_fatal && written == 0) {
472-
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
473-
env->isolate(), "The encoded data was not valid for encoding latin1");
490+
// Convert codepoint to UTF-8
491+
if (codepoint < 0x80) {
492+
result.push_back(static_cast<char>(codepoint));
493+
} else if (codepoint < 0x800) {
494+
result.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
495+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
496+
} else {
497+
result.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
498+
result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
499+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
500+
}
474501
}
475502

476-
std::string_view view(result.c_str(), written);
477-
478503
Local<Value> ret;
479-
if (ToV8Value(env->context(), view, env->isolate()).ToLocal(&ret)) {
504+
if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) {
480505
args.GetReturnValue().Set(ret);
481506
}
482507
}

src/encoding_binding.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ class BindingData : public SnapshotableObject {
3131
static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
3232
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
3333
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
34-
static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);
34+
static void DecodeWindows1252(
35+
const v8::FunctionCallbackInfo<v8::Value>& args);
3536

3637
static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
3738
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);

test/parallel/test-internal-encoding-binding.js

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,41 +8,46 @@ const assert = require('node:assert');
88
const { internalBinding } = require('internal/test/binding');
99
const binding = internalBinding('encoding_binding');
1010

11+
// Windows-1252 specific tests
1112
{
12-
// Valid input
13-
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
14-
assert.strictEqual(binding.decodeLatin1(buf, false, false), 'Áéó');
13+
// Test Windows-1252 special characters in 128-159 range
14+
// These differ from Latin-1
15+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€');
16+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚');
17+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ');
18+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ');
1519
}
1620

1721
{
18-
// Empty input
19-
const buf = Uint8Array.from([]);
20-
assert.strictEqual(binding.decodeLatin1(buf, false, false), '');
22+
// Test Windows-1252 characters outside 128-159 range (same as Latin-1)
23+
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
24+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó');
2125
}
2226

2327
{
24-
// Invalid input, but Latin1 has no invalid chars and should never throw.
25-
const buf = new TextEncoder().encode('Invalid Latin1 🧑‍🧑‍🧒‍🧒');
26-
assert.strictEqual(
27-
binding.decodeLatin1(buf, false, false),
28-
'Invalid Latin1 ð\x9F§\x91â\x80\x8Dð\x9F§\x91â\x80\x8Dð\x9F§\x92â\x80\x8Dð\x9F§\x92'
29-
);
28+
// Empty input
29+
const buf = Uint8Array.from([]);
30+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), '');
3031
}
3132

33+
// Windows-1252 specific tests
3234
{
33-
// IgnoreBOM with BOM
34-
const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]);
35-
assert.strictEqual(binding.decodeLatin1(buf, true, false), 'þÿÁéó');
35+
// Test Windows-1252 special characters in 128-159 range
36+
// These differ from Latin-1
37+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€');
38+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚');
39+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ');
40+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ');
3641
}
3742

3843
{
39-
// Fatal and InvalidInput, but Latin1 has no invalid chars and should never throw.
40-
const buf = Uint8Array.from([0xFF, 0xFF, 0xFF]);
41-
assert.strictEqual(binding.decodeLatin1(buf, false, true), 'ÿÿÿ');
44+
// Test Windows-1252 characters outside 128-159 range (same as Latin-1)
45+
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
46+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó');
4247
}
4348

4449
{
45-
// IgnoreBOM and Fatal, but Latin1 has no invalid chars and should never throw.
46-
const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]);
47-
assert.strictEqual(binding.decodeLatin1(buf, true, true), 'þÿÁéó');
50+
// Empty input
51+
const buf = Uint8Array.from([]);
52+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), '');
4853
}

test/parallel/test-util-text-decoder.js

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,49 @@ test('TextDecoder correctly decodes windows-1252 encoded data', { skip: !common.
1515

1616
assert.strictEqual(decodedString, expectedString);
1717
});
18+
19+
// Test for the difference between Latin1 and Windows-1252 in the 128-159
20+
// range
21+
// Ref: https://github.com/nodejs/node/issues/60888
22+
test('TextDecoder correctly decodes windows-1252 special characters in ' +
23+
'128-159 range', { skip: !common.hasIntl }, () => {
24+
const decoder = new TextDecoder('windows-1252');
25+
26+
// Test specific characters that differ between Latin1 and Windows-1252.
27+
// € Euro sign
28+
assert.strictEqual(decoder.decode(Uint8Array.of(128)).codePointAt(0),
29+
8364);
30+
// ‚ Single low-9 quotation mark
31+
assert.strictEqual(decoder.decode(Uint8Array.of(130)).codePointAt(0),
32+
8218);
33+
// Latin small letter f with hook (ƒ)
34+
assert.strictEqual(decoder.decode(Uint8Array.of(131)).codePointAt(0),
35+
402);
36+
// Ÿ Latin capital letter Y with diaeresis
37+
assert.strictEqual(decoder.decode(Uint8Array.of(159)).codePointAt(0),
38+
376);
39+
40+
// Test the full range to ensure no character is treated as Latin1
41+
// directly.
42+
const expectedMappings = [
43+
[128, 8364], [129, 129], [130, 8218], [131, 402], [132, 8222],
44+
[133, 8230], [134, 8224], [135, 8225], [136, 710], [137, 8240],
45+
[138, 352], [139, 8249], [140, 338], [141, 141], [142, 381],
46+
[143, 143], [144, 144], [145, 8216], [146, 8217], [147, 8220],
47+
[148, 8221], [149, 8226], [150, 8211], [151, 8212], [152, 732],
48+
[153, 8482], [154, 353], [155, 8250], [156, 339], [157, 157],
49+
[158, 382], [159, 376],
50+
];
51+
52+
for (const [byte, expectedCodePoint] of expectedMappings) {
53+
const result = decoder.decode(Uint8Array.of(byte));
54+
const actualCodePoint = result.codePointAt(0);
55+
assert.strictEqual(
56+
actualCodePoint,
57+
expectedCodePoint,
58+
`Byte 0x${byte.toString(16)} should decode to ` +
59+
`U+${expectedCodePoint.toString(16)} but got ` +
60+
`U+${actualCodePoint.toString(16)}`
61+
);
62+
}
63+
});

typings/internalBinding/encoding_binding.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ export interface EncodingBinding {
44
decodeUTF8(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
55
toASCII(input: string): string;
66
toUnicode(input: string): string;
7-
decodeLatin1(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
7+
decodeWindows1252(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
88
}

0 commit comments

Comments
 (0)