1
+ import { Char } from '../char' ;
1
2
import {
2
- isDigitChar ,
3
+ isAsciiDigitChar ,
3
4
isAsciiLetterChar ,
4
5
isQuoteChar ,
5
6
isWhitespaceChar ,
6
7
isControlChar ,
8
+ isAsciiAlphaNumericChar ,
7
9
} from '../char-utils' ;
8
10
import { assertNever } from '../utils' ;
9
11
@@ -12,39 +14,47 @@ import { assertNever } from '../utils';
12
14
13
15
// Describes the HTML tag that is currently being read by the state machine
class CurrentTag {
    public idx: number; // the index of the '<' in the html string
    public type: HtmlTagType;
    public name: string;
    public isOpening: boolean; // true if it's an opening tag, OR a self-closing open tag
    public isClosing: boolean; // true if it's a closing tag, OR a self-closing open tag

    /**
     * @param cfg Optional initial values. Any property that is omitted falls
     *   back to its "empty" default: idx -1, type HtmlTagType.Tag, name '',
     *   and both flags false.
     */
    constructor(cfg: Partial<CurrentTag> = {}) {
        // The destructuring default only kicks in for `undefined`, so an
        // explicit idx of 0 is preserved
        const { idx = -1, type, name, isOpening, isClosing } = cfg;

        this.idx = idx;
        this.type = type || HtmlTagType.Tag;
        this.name = name || '';
        this.isOpening = Boolean(isOpening);
        this.isClosing = Boolean(isClosing);
    }
}
28
30
29
31
// The kinds of "tags" that the parser distinguishes between.
// For debugging: temporarily remove 'const'
const enum HtmlTagType {
    Tag = 0, // normal html tag, like <div>
    Comment = 1, // <!-- html comment tag -->
    Doctype = 2, // <!DOCTYPE> tag
}
35
37
36
// Represents the current HTML entity (ex: '&amp;') being read
class CurrentEntity {
    public readonly idx: number; // the index of the '&' in the html string
    public type: HtmlEntityType;
    public content: string;

    /**
     * @param cfg Optional initial values. Any property that is omitted falls
     *   back to its "empty" default: idx -1, type HtmlEntityType.Unknown, and
     *   content ''.
     */
    constructor(cfg: Partial<CurrentEntity> = {}) {
        // The destructuring default only kicks in for `undefined`, so an
        // explicit idx of 0 is preserved
        const { idx = -1, type, content } = cfg;

        this.idx = idx;
        this.type = type || HtmlEntityType.Unknown;
        this.content = content || '';
    }
}
50
+
51
// The kinds of HTML entities (character references) that the parser
// distinguishes between.
// For debugging: temporarily remove 'const'
const enum HtmlEntityType {
    Unknown = 0, // not yet known (we need to read more characters)
    Named = 1, // ex: '&amp;'
    DecimalNumeric = 2, // ex: '&#60;'
    HexNumeric = 3, // ex: '&#x3C;'
}
48
58
49
59
/**
50
60
* Context object containing all the state needed by the HTML parsing state
@@ -65,7 +75,7 @@ class ParseHtmlContext {
65
75
public state : State = State . Data ; // begin in the Data state
66
76
public currentDataIdx = 0 ; // where the current data start index is
67
77
public currentTag : CurrentTag | null = null ; // describes the current tag that is being read
68
- // public currentEntity: CurrentEntity = new CurrentEntity() ; // describes the current HTML entity (ex: '&') that is being read
78
+ public currentEntity : CurrentEntity | null = null ; // describes the current HTML entity (ex: '&') that is being read
69
79
70
80
constructor ( html : string , callbacks : ParseHtmlCallbacks ) {
71
81
this . html = html ;
@@ -223,6 +233,24 @@ export function parseHtml(html: string, callbacks: ParseHtmlCallbacks) {
223
233
case State . Doctype :
224
234
stateDoctype ( context , char ) ;
225
235
break ;
236
+ case State . CharacterReference :
237
+ stateCharacterReference ( context , charCode ) ;
238
+ break ;
239
+ case State . CharacterReferenceNamed :
240
+ stateCharacterReferenceNamed ( context , charCode ) ;
241
+ break ;
242
+ case State . CharacterReferenceNumeric :
243
+ stateCharacterReferenceNumeric ( context , charCode ) ;
244
+ break ;
245
+ case State . CharacterReferenceHexadecimal :
246
+ stateCharacterReferenceHexadecimal ( context , charCode ) ;
247
+ break ;
248
+ case State . CharacterReferenceDecimal :
249
+ stateCharacterReferenceDecimal ( context , charCode ) ;
250
+ break ;
251
+ case State . CharacterReferenceNumericEnd :
252
+ stateCharacterReferenceNumericEnd ( context , charCode ) ;
253
+ break ;
226
254
227
255
/* istanbul ignore next */
228
256
default :
@@ -256,9 +284,9 @@ export function parseHtml(html: string, callbacks: ParseHtmlCallbacks) {
256
284
// Handles a character read while in the Data state. A '<' begins a new tag
// and a '&' begins a new HTML entity; every other character is left alone.
function stateData(context: ParseHtmlContext, char: string) {
    switch (char) {
        case '<':
            startNewTag(context);
            break;

        case '&':
            startNewEntity(context);
            break;
    }
}
263
291
264
292
// Called after a '<' is read from the Data state
@@ -299,7 +327,7 @@ function stateTagName(context: ParseHtmlContext, char: string, charCode: number)
299
327
} else if ( char === '>' ) {
300
328
context . currentTag ! . name = captureTagName ( context ) ;
301
329
emitTagAndPreviousTextNode ( context ) ; // resets to Data state as well
302
- } else if ( ! isAsciiLetterChar ( charCode ) && ! isDigitChar ( charCode ) && char !== ':' ) {
330
+ } else if ( ! isAsciiLetterChar ( charCode ) && ! isAsciiDigitChar ( charCode ) && char !== ':' ) {
303
331
// Anything else that does not form an html tag. Note: the colon
304
332
// character is accepted for XML namespaced tags
305
333
resetToDataState ( context ) ;
@@ -495,11 +523,11 @@ function stateMarkupDeclarationOpen(context: ParseHtmlContext) {
495
523
if ( html . slice ( charIdx , charIdx + 2 ) === '--' ) {
496
524
// html comment
497
525
context . charIdx ++ ; // "consume" the second '-' character. Next loop iteration will consume the character after the '<!--' sequence
498
- context . currentTag ! . type = CurrentTagType . Comment ;
526
+ context . currentTag ! . type = HtmlTagType . Comment ;
499
527
context . state = State . CommentStart ;
500
528
} else if ( html . slice ( charIdx , charIdx + 7 ) . toUpperCase ( ) === 'DOCTYPE' ) {
501
529
context . charIdx += 6 ; // "consume" the characters "OCTYPE" (the current loop iteraction consumed the 'D'). Next loop iteration will consume the character after the '<!DOCTYPE' sequence
502
- context . currentTag ! . type = CurrentTagType . Doctype ;
530
+ context . currentTag ! . type = HtmlTagType . Doctype ;
503
531
context . state = State . Doctype ;
504
532
} else {
505
533
// At this point, the spec specifies that the state machine should
@@ -622,6 +650,52 @@ function stateDoctype(context: ParseHtmlContext, char: string) {
622
650
}
623
651
}
624
652
653
// We've read a '&' character, and are now looking at the character after it:
//
// - '#' begins a numeric character reference
// - an ASCII alpha-numeric character begins a named character reference
// - anything else means that the '&' did not begin a character reference
//   (ex: the '&' in 'R & D')
//
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
function stateCharacterReference(context: ParseHtmlContext, charCode: number) {
    if (charCode === Char.NumberSign /* '#' */) {
        context.state = State.CharacterReferenceNumeric;
    } else if (isAsciiAlphaNumericChar(charCode)) {
        context.currentEntity!.type = HtmlEntityType.Named;
        context.state = State.CharacterReferenceNamed;
    } else {
        // TODO: Can we be inside a tag when we get here? If so, don't reset the
        // currentTag
        resetToDataState(context);

        // "Reconsume" the current character in the Data state. Without this,
        // the character that ended the would-be entity is dropped, and a '<'
        // or '&' directly after the '&' (ex: '&<b>') would never start a new
        // tag/entity.
        // NOTE(review): assumes `charCode` is the UTF-16 code unit of the
        // character currently being processed - confirm against parseHtml()
        stateData(context, String.fromCharCode(charCode));
    }
}
667
+
668
// We've read an ASCII alpha-numeric character after a '&' char, such as reading
// the 'a' character in '&amp;'
//
// - an ASCII alpha-numeric character continues the entity's name
// - ';' completes the entity
// - anything else means that the text was not a named character reference
//
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
function stateCharacterReferenceNamed(context: ParseHtmlContext, charCode: number) {
    if (isAsciiAlphaNumericChar(charCode)) {
        // stay in the CharacterReferenceNamed state
        return;
    }

    if (charCode === Char.SemiColon /* ';' */) {
        // Capture the text between the '&' and the ';' (ex: 'amp' for '&amp;')
        const currentEntity = context.currentEntity!;
        currentEntity.content = context.html.slice(currentEntity.idx + 1, context.charIdx);

        // The entity is complete, so go back to reading data. Previously the
        // state was never changed here, which left the state machine stuck in
        // this state for the remainder of the input.
        // NOTE(review): the completed entity is deliberately left on the
        // context (rather than cleared via resetToDataState()) so that the
        // captured `content` is not thrown away - nothing emits it yet.
        context.state = State.Data;
        return;
    }

    // Any other character: this was not a named character reference after
    // all. Return to the Data state and "reconsume" the current character
    // there, so that a '<' or '&' at this position still starts a new
    // tag/entity.
    // NOTE(review): assumes `charCode` is the UTF-16 code unit of the character
    // currently being processed - confirm against parseHtml()
    resetToDataState(context);
    stateData(context, String.fromCharCode(charCode));
}
680
+
681
// We've read a '#' char after '&' which begins a numeric character reference.
// For example, we could be reading the sequence '&#60;' or '&#x3C;'
//
// - 'x' or 'X' begins a hexadecimal character reference
// - an ASCII digit begins a decimal character reference
// - anything else means that the '&#' did not begin a character reference
//
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
function stateCharacterReferenceNumeric(context: ParseHtmlContext, charCode: number) {
    if (charCode === 0x78 /* 'x' */ || charCode === 0x58 /* 'X' */) {
        context.currentEntity!.type = HtmlEntityType.HexNumeric;
        context.state = State.CharacterReferenceHexadecimal;
    } else if (isAsciiDigitChar(charCode)) {
        context.currentEntity!.type = HtmlEntityType.DecimalNumeric;
        context.state = State.CharacterReferenceDecimal;
    } else {
        // Not a numeric character reference after all (ex: '&#;' or '&# ').
        // Return to the Data state and "reconsume" the current character
        // there, so that a '<' or '&' at this position still starts a new
        // tag/entity.
        // NOTE(review): assumes `charCode` is the UTF-16 code unit of the
        // character currently being processed - confirm against parseHtml()
        resetToDataState(context);
        stateData(context, String.fromCharCode(charCode));
    }
}
685
+
686
// We've read an 'x' or 'X' char after a '&#' sequence, which begins a hexadecimal
// character reference. For example, we could be reading the sequence '&#x3C;'
//
// - an ASCII hex digit continues the character reference
// - ';' completes it, provided that at least one hex digit was read
// - anything else means that the text was not a character reference
//
// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
function stateCharacterReferenceHexadecimal(context: ParseHtmlContext, charCode: number) {
    const isAsciiHexDigit =
        (charCode >= 0x30 && charCode <= 0x39) /* '0'-'9' */ ||
        (charCode >= 0x41 && charCode <= 0x46) /* 'A'-'F' */ ||
        (charCode >= 0x61 && charCode <= 0x66); /* 'a'-'f' */

    if (isAsciiHexDigit) {
        // stay in the CharacterReferenceHexadecimal state
        return;
    }

    const currentEntity = context.currentEntity!;

    // The '&', '#' and 'x' occupy the first three characters of the entity,
    // so the ';' must be more than 3 characters past the '&' for there to be
    // at least one hex digit (rejects '&#x;')
    if (charCode === Char.SemiColon /* ';' */ && context.charIdx - currentEntity.idx > 3) {
        // Capture the text between the '&' and the ';' (ex: '#x3C' for '&#x3C;')
        currentEntity.content = context.html.slice(currentEntity.idx + 1, context.charIdx);
        context.state = State.CharacterReferenceNumericEnd;
        return;
    }

    // Not a hexadecimal character reference after all. Return to the Data
    // state and "reconsume" the current character there, so that a '<' or '&'
    // at this position still starts a new tag/entity.
    // NOTE(review): assumes `charCode` is the UTF-16 code unit of the character
    // currently being processed - confirm against parseHtml()
    resetToDataState(context);
    stateData(context, String.fromCharCode(charCode));
}
690
+
691
// We've read an ASCII digit after a '&#' sequence, which begins a decimal
// (base 10) character reference. For example, we could be reading the
// sequence '&#60;'
//
// - an ASCII digit continues the character reference
// - ';' completes it, provided that at least one digit was read
// - anything else means that the text was not a character reference
//
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
function stateCharacterReferenceDecimal(context: ParseHtmlContext, charCode: number) {
    if (isAsciiDigitChar(charCode)) {
        // stay in the CharacterReferenceDecimal state
        return;
    }

    const currentEntity = context.currentEntity!;

    // The '&' and '#' occupy the first two characters of the entity, so the
    // ';' must be more than 2 characters past the '&' for there to be at
    // least one digit
    if (charCode === Char.SemiColon /* ';' */ && context.charIdx - currentEntity.idx > 2) {
        // Capture the text between the '&' and the ';' (ex: '#60' for '&#60;')
        currentEntity.content = context.html.slice(currentEntity.idx + 1, context.charIdx);
        context.state = State.CharacterReferenceNumericEnd;
        return;
    }

    // Not a decimal character reference after all. Return to the Data state
    // and "reconsume" the current character there, so that a '<' or '&' at
    // this position still starts a new tag/entity.
    // NOTE(review): assumes `charCode` is the UTF-16 code unit of the character
    // currently being processed - confirm against parseHtml()
    resetToDataState(context);
    stateData(context, String.fromCharCode(charCode));
}
695
+
696
// We've read a ';' character to end a numeric character reference such as
// '&#60;', and are now looking at the character after it. That character
// belongs to whatever follows the entity, so we return to the Data state and
// handle it there.
function stateCharacterReferenceNumericEnd(context: ParseHtmlContext, charCode: number) {
    // NOTE(review): the completed entity is deliberately left on the context
    // (rather than cleared via resetToDataState()) - nothing emits it yet.
    context.state = State.Data;

    // "Reconsume" the current character in the Data state, so that a '<' or
    // '&' directly after the entity (ex: '&#60;<b>') still starts a new
    // tag/entity.
    // NOTE(review): assumes `charCode` is the UTF-16 code unit of the character
    // currently being processed - confirm against parseHtml()
    stateData(context, String.fromCharCode(charCode));
}
698
+
625
699
/**
626
700
* Resets the state back to the Data state, and removes the current tag.
627
701
*
@@ -632,6 +706,7 @@ function stateDoctype(context: ParseHtmlContext, char: string) {
632
706
function resetToDataState(context: ParseHtmlContext) {
    // Discard whatever tag and/or entity was in the middle of being read...
    context.currentEntity = null;
    context.currentTag = null;

    // ...and go back to reading plain data
    context.state = State.Data;
}
636
711
637
712
/**
@@ -647,6 +722,17 @@ function startNewTag(context: ParseHtmlContext) {
647
722
context . currentTag = new CurrentTag ( { idx : context . charIdx } ) ;
648
723
}
649
724
725
/**
 * Begins reading a new HTML entity at the current character index. Any HTML
 * entity that was previously being read is discarded.
 *
 * This is generally called whenever a new '&' character is read.
 */
function startNewEntity(context: ParseHtmlContext) {
    const ampersandIdx = context.charIdx; // the '&' is the character currently being read

    context.currentEntity = new CurrentEntity({ idx: ampersandIdx });
    context.state = State.CharacterReference;
}
735
+
650
736
/**
651
737
* Once we've decided to emit an open tag, that means we can also emit the
652
738
* text node before it.
@@ -664,15 +750,15 @@ function emitTagAndPreviousTextNode(context: ParseHtmlContext) {
664
750
}
665
751
666
752
switch ( currentTagType ) {
667
- case CurrentTagType . Comment :
753
+ case HtmlTagType . Comment :
668
754
context . callbacks . onComment ( currentTagIdx ) ;
669
755
break ;
670
756
671
- case CurrentTagType . Doctype :
757
+ case HtmlTagType . Doctype :
672
758
context . callbacks . onDoctype ( currentTagIdx ) ;
673
759
break ;
674
760
675
- case CurrentTagType . Tag : {
761
+ case HtmlTagType . Tag : {
676
762
const { isOpening, isClosing } = currentTag ;
677
763
678
764
if ( isOpening ) {
@@ -747,8 +833,10 @@ export const enum State {
747
833
CommentEnd ,
748
834
CommentEndBang ,
749
835
Doctype ,
750
- // CharacterReference, // beginning with a '&' char
751
- // CharacterReferenceNamed, // example: '&'
752
- // CharacterReferenceNumeric, // example: '<'
753
- // CharacterReferenceHexadecimal, // example: '<'
836
+ CharacterReference , // beginning with a '&' char
837
+ CharacterReferenceNamed , // example: '&'
838
+ CharacterReferenceNumeric , // when we've read the '#' in '<' or '<'
839
+ CharacterReferenceHexadecimal , // example: '<'
840
+ CharacterReferenceDecimal , // example: '<'
841
+ CharacterReferenceNumericEnd , // when we read the ';' char in a numeric character reference
754
842
}
0 commit comments