gregjacobs
diff --git a/‎scripts/generate-char-utils.ts
Lines changed: 3 additions & 0 deletions b/‎scripts/generate-char-utils.ts
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/char-utils.ts
Lines changed: 30 additions & 0 deletions b/‎src/char-utils.ts
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/char.ts
Lines changed: 2 additions & 0 deletions b/‎src/char.ts
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/htmlParser/c1-control-chars.ts
Lines changed: 42 additions & 0 deletions b/‎src/htmlParser/c1-control-chars.ts
Lines changed: 42 additions & 0 deletions
@@ -23,6 +23,8 @@ const { srcFileContents, specFileContents } = generateCharUtils([
     ['isAsciiLetterChar', /[A-Za-z]/],
     ['isAsciiAlphaNumericChar', /[A-Za-z0-9]/], // Used for parsing named HTML entities like '&amp;'
     ['isAsciiDigitChar', /[0-9]/], // Used for parsing decimal HTML entities like '&#60;'
+    ['isAsciiUpperHexDigitChar', /[A-F]/],
+    //['isAsciiLowerHexDigitChar', /[a-f]/], -- not actually needed at the moment
     ['isHexChar', /[A-Fa-f0-9]/], // Used for parsing hexadecimal HTML entities like '&#x3C;'
     ['isQuoteChar', /['"]/],
     ['isWhitespaceChar', /\s/],
@@ -32,6 +34,7 @@ const { srcFileContents, specFileContents } = generateCharUtils([
     ['isUrlSuffixNotAllowedAsFinalChar', /[?!:,.;^]/], // URL suffix characters (i.e. path, query, and hash part of the URL) that are not allowed as the *last character* in the URL suffix as they would normally form the end of a sentence. The isUrlSuffixAllowedSpecialChar() function contains additional allowed URL suffix characters which are allowed as the last character.
     ['isOpenBraceChar', /[({[]/],
     ['isCloseBraceChar', /[)}\]]/],
+    ['isSurrogateChar', /[\uD800-\uDBFF\uDC00-\uDFFF]/], // Leading surrogate chars are in the range U+D800 to U+DBFF. Trailing surrogate chars are in the range U+DC00 to U+DFFF. Essentially, all surrogate chars are in the range U+D800 to U+DFFF. See: https://infra.spec.whatwg.org/#surrogate
 ]);
 
 // console.log(srcFileContents);
 
@@ -62,6 +62,21 @@ export function isAsciiDigitChar(c: number): boolean {
     return (c >= 48 && c <= 57);
 }
 
+/**
+ * Determines if the given character code `c` matches the regular expression
+ * /[A-F]/, i.e. whether it lies in the range 65 ('A') to 70 ('F') inclusive.
+ *
+ * Comparing the character code directly speeds this function up by a factor
+ * of ~10x vs. running RegExp.prototype.test() on the character itself.
+ *
+ * NOTE: This function is generated. Do not edit manually. To regenerate, run:
+ *
+ *     npm run generate-char-utils
+ */
+export function isAsciiUpperHexDigitChar(c: number): boolean {
+    return (c >= 65 && c <= 70);
+}
+
 /**
  * Determines if the given character `c` matches the regular expression /[A-Fa-f0-9]/ 
  * by checking it via character code in a binary search fashion.
@@ -196,3 +211,18 @@ export function isOpenBraceChar(c: number): boolean {
 export function isCloseBraceChar(c: number): boolean {
     return (c < 93 ? c == 41 : (c == 93 || c == 125));
 }
+
+/**
+ * Determines if the given character code `c` matches the regular expression
+ * /[\uD800-\uDBFF\uDC00-\uDFFF]/, i.e. lies in 55296 (0xD800) to 57343 (0xDFFF).
+ *
+ * Comparing the character code directly speeds this function up by a factor
+ * of ~10x vs. running RegExp.prototype.test() on the character itself.
+ *
+ * NOTE: This function is generated. Do not edit manually. To regenerate, run:
+ *
+ *     npm run generate-char-utils
+ */
+export function isSurrogateChar(c: number): boolean {
+    return (c >= 55296 && c <= 57343);
+}
@@ -8,8 +8,10 @@
 export const enum Char {
     // Letter chars (usually used for scheme testing)
     A = 65,
+    X = 88,
     Z = 90,
     a = 97,
+    x = 120,
     z = 122,
 
     // Quote chars (used for HTML parsing)
 
@@ -0,0 +1,42 @@
+/**
+ * Maps numeric character reference code points in the range 0x80 to 0x9F to
+ * their replacements. 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no entry here.
+ *
+ * For example, if we encounter the HTML character reference '&#x80;', we should
+ * replace it with the Euro sign (€), i.e. code point 0x20AC.
+ *
+ * This replacement process is described by the table here:
+ *     https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+ *
+ * And here:
+ *     https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-reference
+ */
+export const c1ControlCharReplacements: ReadonlyMap<number, number> = new Map([
+    [0x80, 0x20ac], // EURO SIGN (€)
+    [0x82, 0x201a], // SINGLE LOW-9 QUOTATION MARK (‚)
+    [0x83, 0x0192], // LATIN SMALL LETTER F WITH HOOK (ƒ)
+    [0x84, 0x201e], // DOUBLE LOW-9 QUOTATION MARK („)
+    [0x85, 0x2026], // HORIZONTAL ELLIPSIS (…)
+    [0x86, 0x2020], // DAGGER (†)
+    [0x87, 0x2021], // DOUBLE DAGGER (‡)
+    [0x88, 0x02c6], // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+    [0x89, 0x2030], // PER MILLE SIGN (‰)
+    [0x8a, 0x0160], // LATIN CAPITAL LETTER S WITH CARON (Š)
+    [0x8b, 0x2039], // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+    [0x8c, 0x0152], // LATIN CAPITAL LIGATURE OE (Œ)
+    [0x8e, 0x017d], // LATIN CAPITAL LETTER Z WITH CARON (Ž)
+    [0x91, 0x2018], // LEFT SINGLE QUOTATION MARK (‘)
+    [0x92, 0x2019], // RIGHT SINGLE QUOTATION MARK (’)
+    [0x93, 0x201c], // LEFT DOUBLE QUOTATION MARK (“)
+    [0x94, 0x201d], // RIGHT DOUBLE QUOTATION MARK (”)
+    [0x95, 0x2022], // BULLET (•)
+    [0x96, 0x2013], // EN DASH (–)
+    [0x97, 0x2014], // EM DASH (—)
+    [0x98, 0x02dc], // SMALL TILDE (˜)
+    [0x99, 0x2122], // TRADE MARK SIGN (™)
+    [0x9a, 0x0161], // LATIN SMALL LETTER S WITH CARON (š)
+    [0x9b, 0x203a], // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+    [0x9c, 0x0153], // LATIN SMALL LIGATURE OE (œ)
+    [0x9e, 0x017e], // LATIN SMALL LETTER Z WITH CARON (ž)
+    [0x9f, 0x0178], // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+]);