Skip to content

Commit 7b7f19c

Browse files
committed
WIP parsing numeric HTML entities
1 parent 666d70a commit 7b7f19c

File tree

7 files changed

+338
-53
lines changed

7 files changed

+338
-53
lines changed

scripts/generate-char-utils.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ const { srcFileContents, specFileContents } = generateCharUtils([
2323
['isAsciiLetterChar', /[A-Za-z]/],
2424
['isAsciiAlphaNumericChar', /[A-Za-z0-9]/], // Used for parsing named HTML entities like '&amp;'
2525
['isAsciiDigitChar', /[0-9]/], // Used for parsing decimal HTML entities like '&#60;'
26+
['isAsciiUpperHexDigitChar', /[A-F]/],
27+
//['isAsciiLowerHexDigitChar', /[a-f]/], -- not actually needed at the moment
2628
['isHexChar', /[A-Fa-f0-9]/], // Used for parsing hexadecimal HTML entities like '&#x3C;'
2729
['isQuoteChar', /['"]/],
2830
['isWhitespaceChar', /\s/],
@@ -32,6 +34,7 @@ const { srcFileContents, specFileContents } = generateCharUtils([
3234
['isUrlSuffixNotAllowedAsFinalChar', /[?!:,.;^]/], // URL suffix characters (i.e. path, query, and hash part of the URL) that are not allowed as the *last character* in the URL suffix as they would normally form the end of a sentence. The isUrlSuffixAllowedSpecialChar() function contains additional allowed URL suffix characters which are allowed as the last character.
3335
['isOpenBraceChar', /[({[]/],
3436
['isCloseBraceChar', /[)}\]]/],
37+
['isSurrogateChar', /[\uD800-\uDBFF\uDC00-\uDFFF]/], // Leading surrogate chars are in the range U+D800 to U+DBFF. Trailing surrogate chars are in the range U+DC00 to U+DFFF. Essentially, all surrogate chars are in the range U+D800 to U+DFFF. See: https://infra.spec.whatwg.org/#surrogate
3538
]);
3639

3740
// console.log(srcFileContents);

src/char-utils.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,21 @@ export function isAsciiDigitChar(c: number): boolean {
6262
return (c >= 48 && c <= 57);
6363
}
6464

65+
/**
66+
* Determines if the given character `c` matches the regular expression /[A-F]/
67+
* by checking it via character code in a binary search fashion.
68+
*
69+
* This technique speeds this function up by a factor of ~10x vs. running RegExp.prototype.test()
70+
* on the character itself.
71+
*
72+
* NOTE: This function is generated. Do not edit manually. To regenerate, run:
73+
*
74+
* npm run generate-char-utils
75+
*/
76+
export function isAsciiUpperHexDigitChar(c: number): boolean {
77+
return (c >= 65 && c <= 70);
78+
}
79+
6580
/**
6681
* Determines if the given character `c` matches the regular expression /[A-Fa-f0-9]/
6782
* by checking it via character code in a binary search fashion.
@@ -196,3 +211,18 @@ export function isOpenBraceChar(c: number): boolean {
196211
export function isCloseBraceChar(c: number): boolean {
197212
return (c < 93 ? c == 41 : (c == 93 || c == 125));
198213
}
214+
215+
/**
216+
* Determines if the given character `c` matches the regular expression /[\uD800-\uDBFF\uDC00-\uDFFF]/
217+
* by checking it via character code in a binary search fashion.
218+
*
219+
* This technique speeds this function up by a factor of ~10x vs. running RegExp.prototype.test()
220+
* on the character itself.
221+
*
222+
* NOTE: This function is generated. Do not edit manually. To regenerate, run:
223+
*
224+
* npm run generate-char-utils
225+
*/
226+
export function isSurrogateChar(c: number): boolean {
227+
return (c >= 55296 && c <= 57343);
228+
}

src/char.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
export const enum Char {
99
// Letter chars (usually used for scheme testing)
1010
A = 65,
11+
X = 88,
1112
Z = 90,
1213
a = 97,
14+
x = 120,
1315
z = 122,
1416

1517
// Quote chars (used for HTML parsing)

src/htmlParser/c1-control-chars.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/**
2+
* This mapping is for decoding numeric character references in the range 0x80 to
3+
* 0x9F, excluding 0x81, 0x8D, 0x8F, 0x90 and 0x9D (which have no replacement).
4+
*
5+
* For example, if we encounter the HTML character reference '&#x80;', we should
6+
* replace it with the Euro sign (€).
7+
*
8+
* This replacement process is described by the table here:
9+
* https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
10+
*
11+
* And here:
12+
* https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-reference
13+
*/
14+
export const c1ControlCharReplacements: ReadonlyMap<number, number> = new Map([
15+
[0x80, 0x20ac], // EURO SIGN (€)
16+
[0x82, 0x201a], // SINGLE LOW-9 QUOTATION MARK (‚)
17+
[0x83, 0x0192], // LATIN SMALL LETTER F WITH HOOK (ƒ)
18+
[0x84, 0x201e], // DOUBLE LOW-9 QUOTATION MARK („)
19+
[0x85, 0x2026], // HORIZONTAL ELLIPSIS (…)
20+
[0x86, 0x2020], // DAGGER (†)
21+
[0x87, 0x2021], // DOUBLE DAGGER (‡)
22+
[0x88, 0x02c6], // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
23+
[0x89, 0x2030], // PER MILLE SIGN (‰)
24+
[0x8a, 0x0160], // LATIN CAPITAL LETTER S WITH CARON (Š)
25+
[0x8b, 0x2039], // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
26+
[0x8c, 0x0152], // LATIN CAPITAL LIGATURE OE (Œ)
27+
[0x8e, 0x017d], // LATIN CAPITAL LETTER Z WITH CARON (Ž)
28+
[0x91, 0x2018], // LEFT SINGLE QUOTATION MARK (‘)
29+
[0x92, 0x2019], // RIGHT SINGLE QUOTATION MARK (’)
30+
[0x93, 0x201c], // LEFT DOUBLE QUOTATION MARK (“)
31+
[0x94, 0x201d], // RIGHT DOUBLE QUOTATION MARK (”)
32+
[0x95, 0x2022], // BULLET (•)
33+
[0x96, 0x2013], // EN DASH (–)
34+
[0x97, 0x2014], // EM DASH (—)
35+
[0x98, 0x02dc], // SMALL TILDE (˜)
36+
[0x99, 0x2122], // TRADE MARK SIGN (™)
37+
[0x9a, 0x0161], // LATIN SMALL LETTER S WITH CARON (š)
38+
[0x9b, 0x203a], // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
39+
[0x9c, 0x0153], // LATIN SMALL LIGATURE OE (œ)
40+
[0x9e, 0x017e], // LATIN SMALL LETTER Z WITH CARON (ž)
41+
[0x9f, 0x0178], // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
42+
]);

0 commit comments

Comments
 (0)