Skip to content

Commit 666d70a

Browse files
committed
WIP HTML entity handling
1 parent 9128100 commit 666d70a

File tree

10 files changed

+265
-59
lines changed

10 files changed

+265
-59
lines changed

scripts/generate-char-utils.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ const generateScriptName = 'generate-char-utils';
2121
const { srcFileContents, specFileContents } = generateCharUtils([
2222
['isControlChar', /[\x00-\x1F\x7F]/], // ASCII control characters (0-31), and the delete (DEL) char (127). Used to check for invalid characters in the HTML parser.
2323
['isAsciiLetterChar', /[A-Za-z]/],
24-
['isDigitChar', /\d/],
24+
['isAsciiAlphaNumericChar', /[A-Za-z0-9]/], // Used for parsing named HTML entities like '&amp;'
25+
['isAsciiDigitChar', /[0-9]/], // Used for parsing decimal HTML entities like '&#60;'
26+
['isHexChar', /[A-Fa-f0-9]/], // Used for parsing hexadecimal HTML entities like '&#x3C;'
2527
['isQuoteChar', /['"]/],
2628
['isWhitespaceChar', /\s/],
2729
['isAlphaNumericOrMarkChar', alphaNumericAndMarksRe /*/[\p{Letter}\p{Mark}\p{Emoji}\p{Nd}]/u*/], // sadly the unicode regexp is not working, probably because the char codes are outside the range of 0-65535 for multi-char emojis and such, but not 100% sure. Need to investigate. Using the old regexp for now instead

src/char-utils.ts

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export function isAsciiLetterChar(c: number): boolean {
3333
}
3434

3535
/**
36-
* Determines if the given character `c` matches the regular expression /\d/
36+
* Determines if the given character `c` matches the regular expression /[A-Za-z0-9]/
3737
* by checking it via character code in a binary search fashion.
3838
*
3939
* This technique speeds this function up by a factor of ~10x vs. running RegExp.prototype.test()
@@ -43,10 +43,40 @@ export function isAsciiLetterChar(c: number): boolean {
4343
*
4444
* npm run generate-char-utils
4545
*/
46-
export function isDigitChar(c: number): boolean {
46+
export function isAsciiAlphaNumericChar(c: number): boolean {
47+
return (c < 65 ? (c >= 48 && c <= 57) : ((c >= 65 && c <= 90) || (c >= 97 && c <= 122)));
48+
}
49+
50+
/**
51+
* Determines if the given character `c` matches the regular expression /[0-9]/
52+
* by checking it via character code in a binary search fashion.
53+
*
54+
* This technique speeds this function up by a factor of ~10x vs. running RegExp.prototype.test()
55+
* on the character itself.
56+
*
57+
* NOTE: This function is generated. Do not edit manually. To regenerate, run:
58+
*
59+
* npm run generate-char-utils
60+
*/
61+
export function isAsciiDigitChar(c: number): boolean {
4762
return (c >= 48 && c <= 57);
4863
}
4964

65+
/**
66+
* Determines if the given character `c` matches the regular expression /[A-Fa-f0-9]/
67+
* by checking it via character code in a binary search fashion.
68+
*
69+
* This technique speeds this function up by a factor of ~10x vs. running RegExp.prototype.test()
70+
* on the character itself.
71+
*
72+
* NOTE: This function is generated. Do not edit manually. To regenerate, run:
73+
*
74+
* npm run generate-char-utils
75+
*/
76+
export function isHexChar(c: number): boolean {
77+
return (c < 65 ? (c >= 48 && c <= 57) : ((c >= 65 && c <= 70) || (c >= 97 && c <= 102)));
78+
}
79+
5080
/**
5181
* Determines if the given character `c` matches the regular expression /['"]/
5282
* by checking it via character code in a binary search fashion.

src/htmlParser/parse-html.ts

Lines changed: 119 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
import { Char } from '../char';
12
import {
2-
isDigitChar,
3+
isAsciiDigitChar,
34
isAsciiLetterChar,
45
isQuoteChar,
56
isWhitespaceChar,
67
isControlChar,
8+
isAsciiAlphaNumericChar,
79
} from '../char-utils';
810
import { assertNever } from '../utils';
911

@@ -12,39 +14,47 @@ import { assertNever } from '../utils';
1214

1315
class CurrentTag {
1416
public idx: number; // the index of the '<' in the html string
15-
public type: CurrentTagType;
17+
public type: HtmlTagType;
1618
public name: string;
1719
public isOpening: boolean; // true if it's an opening tag, OR a self-closing open tag
1820
public isClosing: boolean; // true if it's a closing tag, OR a self-closing open tag
1921

2022
constructor(cfg: Partial<CurrentTag> = {}) {
2123
this.idx = cfg.idx !== undefined ? cfg.idx : -1;
22-
this.type = cfg.type || CurrentTagType.Tag;
24+
this.type = cfg.type || HtmlTagType.Tag;
2325
this.name = cfg.name || '';
2426
this.isOpening = !!cfg.isOpening;
2527
this.isClosing = !!cfg.isClosing;
2628
}
2729
}
2830

2931
// For debugging: temporarily remove 'const'
30-
const enum CurrentTagType {
31-
Tag = 0,
32-
Comment,
33-
Doctype,
32+
const enum HtmlTagType {
33+
Tag = 0, // normal html tag, like <div>
34+
Comment, // <!-- html comment tag -->
35+
Doctype, // <!DOCTYPE> tag
3436
}
3537

36-
// // Represents the current HTML entity (ex: '&amp;') being read
37-
// class CurrentEntity {
38-
// readonly idx: number; // the index of the '&' in the html string
39-
// readonly type: 'decimal' | 'hex' | 'named' | undefined;
40-
// readonly content: string;
38+
// Represents the current HTML entity (ex: '&amp;') being read
39+
class CurrentEntity {
40+
public readonly idx: number; // the index of the '&' in the html string
41+
public type: HtmlEntityType;
42+
public content: string;
4143

42-
// constructor(cfg: Partial<CurrentEntity> = {}) {
43-
// this.idx = cfg.idx !== undefined ? cfg.idx : -1;
44-
// this.type = cfg.type;
45-
// this.content = cfg.content || '';
46-
// }
47-
// }
44+
constructor(cfg: Partial<CurrentEntity> = {}) {
45+
this.idx = cfg.idx !== undefined ? cfg.idx : -1;
46+
this.type = cfg.type || HtmlEntityType.Unknown;
47+
this.content = cfg.content || '';
48+
}
49+
}
50+
51+
// For debugging: temporarily remove 'const'
52+
const enum HtmlEntityType {
53+
Unknown = 0, // not yet known (we need to read more characters)
54+
Named, // ex: '&amp;'
55+
DecimalNumeric, // ex: '&#60;'
56+
HexNumeric, // ex: '&#x3C;'
57+
}
4858

4959
/**
5060
* Context object containing all the state needed by the HTML parsing state
@@ -65,7 +75,7 @@ class ParseHtmlContext {
6575
public state: State = State.Data; // begin in the Data state
6676
public currentDataIdx = 0; // where the current data start index is
6777
public currentTag: CurrentTag | null = null; // describes the current tag that is being read
68-
// public currentEntity: CurrentEntity = new CurrentEntity(); // describes the current HTML entity (ex: '&amp;') that is being read
78+
public currentEntity: CurrentEntity | null = null; // describes the current HTML entity (ex: '&amp;') that is being read
6979

7080
constructor(html: string, callbacks: ParseHtmlCallbacks) {
7181
this.html = html;
@@ -223,6 +233,24 @@ export function parseHtml(html: string, callbacks: ParseHtmlCallbacks) {
223233
case State.Doctype:
224234
stateDoctype(context, char);
225235
break;
236+
case State.CharacterReference:
237+
stateCharacterReference(context, charCode);
238+
break;
239+
case State.CharacterReferenceNamed:
240+
stateCharacterReferenceNamed(context, charCode);
241+
break;
242+
case State.CharacterReferenceNumeric:
243+
stateCharacterReferenceNumeric(context, charCode);
244+
break;
245+
case State.CharacterReferenceHexadecimal:
246+
stateCharacterReferenceHexadecimal(context, charCode);
247+
break;
248+
case State.CharacterReferenceDecimal:
249+
stateCharacterReferenceDecimal(context, charCode);
250+
break;
251+
case State.CharacterReferenceNumericEnd:
252+
stateCharacterReferenceNumericEnd(context, charCode);
253+
break;
226254

227255
/* istanbul ignore next */
228256
default:
@@ -256,9 +284,9 @@ export function parseHtml(html: string, callbacks: ParseHtmlCallbacks) {
256284
function stateData(context: ParseHtmlContext, char: string) {
257285
if (char === '<') {
258286
startNewTag(context);
259-
} /*else if (char === '&') {
287+
} else if (char === '&') {
260288
startNewEntity(context);
261-
}*/
289+
}
262290
}
263291

264292
// Called after a '<' is read from the Data state
@@ -299,7 +327,7 @@ function stateTagName(context: ParseHtmlContext, char: string, charCode: number)
299327
} else if (char === '>') {
300328
context.currentTag!.name = captureTagName(context);
301329
emitTagAndPreviousTextNode(context); // resets to Data state as well
302-
} else if (!isAsciiLetterChar(charCode) && !isDigitChar(charCode) && char !== ':') {
330+
} else if (!isAsciiLetterChar(charCode) && !isAsciiDigitChar(charCode) && char !== ':') {
303331
// Anything else that does not form an html tag. Note: the colon
304332
// character is accepted for XML namespaced tags
305333
resetToDataState(context);
@@ -495,11 +523,11 @@ function stateMarkupDeclarationOpen(context: ParseHtmlContext) {
495523
if (html.slice(charIdx, charIdx + 2) === '--') {
496524
// html comment
497525
context.charIdx++; // "consume" the second '-' character. Next loop iteration will consume the character after the '<!--' sequence
498-
context.currentTag!.type = CurrentTagType.Comment;
526+
context.currentTag!.type = HtmlTagType.Comment;
499527
context.state = State.CommentStart;
500528
} else if (html.slice(charIdx, charIdx + 7).toUpperCase() === 'DOCTYPE') {
501529
context.charIdx += 6; // "consume" the characters "OCTYPE" (the current loop iteration consumed the 'D'). Next loop iteration will consume the character after the '<!DOCTYPE' sequence
502-
context.currentTag!.type = CurrentTagType.Doctype;
530+
context.currentTag!.type = HtmlTagType.Doctype;
503531
context.state = State.Doctype;
504532
} else {
505533
// At this point, the spec specifies that the state machine should
@@ -622,6 +650,52 @@ function stateDoctype(context: ParseHtmlContext, char: string) {
622650
}
623651
}
624652

653+
// We've read a '&' character
654+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
655+
function stateCharacterReference(context: ParseHtmlContext, charCode: number) {
656+
if (charCode === Char.NumberSign /* '#' */) {
657+
context.state = State.CharacterReferenceNumeric;
658+
} else if (isAsciiAlphaNumericChar(charCode)) {
659+
context.currentEntity!.type = HtmlEntityType.Named;
660+
context.state = State.CharacterReferenceNamed;
661+
} else {
662+
// TODO: Can we be inside a tag when we get here? If so, don't reset the
663+
// currentTag
664+
resetToDataState(context);
665+
}
666+
}
667+
668+
// We've read an ASCII alpha-numeric character after a '&' char, such as reading
669+
// the 'a' character in '&amp;'
670+
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
671+
function stateCharacterReferenceNamed(context: ParseHtmlContext, charCode: number) {
672+
if (charCode === Char.SemiColon /* ';' */) {
673+
const currentEntity = context.currentEntity!;
674+
currentEntity.content = context.html.slice(currentEntity.idx + 1, context.charIdx);
675+
} else if (isAsciiAlphaNumericChar(charCode)) {
676+
// stay in the CharacterReferenceNamed state
677+
} else {
678+
}
679+
}
680+
681+
// We've read a '#' char after '&' which begins a numeric character reference.
682+
// For example, we could be reading the sequence '&#60;' or '&#x3C;'
683+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
684+
function stateCharacterReferenceNumeric(context: ParseHtmlContext, charCode: number) {}
685+
686+
// We've read an 'x' or 'X' char after a '&#' sequence, which begins a hexadecimal
687+
// character reference. For example, we could be reading the sequence '&#x3C;'
688+
// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
689+
function stateCharacterReferenceHexadecimal(context: ParseHtmlContext, charCode: number) {}
690+
691+
// We've read an ASCII digit after a '&#' sequence, which begins a decimal (base 10)
692+
// character reference. For example, we could be reading the sequence '&#60;'
693+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
694+
function stateCharacterReferenceDecimal(context: ParseHtmlContext, charCode: number) {}
695+
696+
// We've read a ';' character to end a numeric character reference such as '&#60;'
697+
function stateCharacterReferenceNumericEnd(context: ParseHtmlContext, charCode: number) {}
698+
625699
/**
626700
* Resets the state back to the Data state, and removes the current tag.
627701
*
@@ -632,6 +706,7 @@ function stateDoctype(context: ParseHtmlContext, char: string) {
632706
function resetToDataState(context: ParseHtmlContext) {
633707
context.state = State.Data;
634708
context.currentTag = null;
709+
context.currentEntity = null;
635710
}
636711

637712
/**
@@ -647,6 +722,17 @@ function startNewTag(context: ParseHtmlContext) {
647722
context.currentTag = new CurrentTag({ idx: context.charIdx });
648723
}
649724

725+
/**
726+
* Starts a new HTML entity at the current index, ignoring any previous HTML
727+
* entity that was being read.
728+
*
729+
* We'll generally run this function whenever we read a new '&' character.
730+
*/
731+
function startNewEntity(context: ParseHtmlContext) {
732+
context.state = State.CharacterReference;
733+
context.currentEntity = new CurrentEntity({ idx: context.charIdx });
734+
}
735+
650736
/**
651737
* Once we've decided to emit an open tag, that means we can also emit the
652738
* text node before it.
@@ -664,15 +750,15 @@ function emitTagAndPreviousTextNode(context: ParseHtmlContext) {
664750
}
665751

666752
switch (currentTagType) {
667-
case CurrentTagType.Comment:
753+
case HtmlTagType.Comment:
668754
context.callbacks.onComment(currentTagIdx);
669755
break;
670756

671-
case CurrentTagType.Doctype:
757+
case HtmlTagType.Doctype:
672758
context.callbacks.onDoctype(currentTagIdx);
673759
break;
674760

675-
case CurrentTagType.Tag: {
761+
case HtmlTagType.Tag: {
676762
const { isOpening, isClosing } = currentTag;
677763

678764
if (isOpening) {
@@ -747,8 +833,10 @@ export const enum State {
747833
CommentEnd,
748834
CommentEndBang,
749835
Doctype,
750-
// CharacterReference, // beginning with a '&' char
751-
// CharacterReferenceNamed, // example: '&amp;'
752-
// CharacterReferenceNumeric, // example: '&#60;'
753-
// CharacterReferenceHexadecimal, // example: '&#x3C;'
836+
CharacterReference, // beginning with a '&' char
837+
CharacterReferenceNamed, // example: '&amp;'
838+
CharacterReferenceNumeric, // when we've read the '#' in '&#60;' or '&#x3C;'
839+
CharacterReferenceHexadecimal, // example: '&#x3C;'
840+
CharacterReferenceDecimal, // example: '&#60;'
841+
CharacterReferenceNumericEnd, // when we read the ';' char in a numeric character reference
754842
}

src/parser/mention-utils.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { Char } from '../char';
2-
import { isDigitChar, isAsciiLetterChar } from '../char-utils';
2+
import { isAsciiDigitChar, isAsciiLetterChar } from '../char-utils';
33

44
const mentionRegexes: { [serviceName in MentionService]: RegExp } = {
55
twitter: /^@\w{1,15}$/,
@@ -31,7 +31,7 @@ export function isMentionTextChar(charCode: number): boolean {
3131
charCode === Char.Dot || // '.'
3232
charCode === Char.Underscore || // '_'
3333
isAsciiLetterChar(charCode) ||
34-
isDigitChar(charCode)
34+
isAsciiDigitChar(charCode)
3535
);
3636
}
3737

0 commit comments

Comments
 (0)