Skip to content

Commit 324513b

Browse files
committed
html: move the HTML parser to an exp/html package. The parser is a
work in progress, and we are not ready to freeze its API for Go 1. Package html still exists, containing just two functions: EscapeString and UnescapeString. Both the packages at exp/html and html are "package html". The former is a superset of the latter. At some point in the future, the exp/html code will move back into html, once we have finalized the parser API. R=rsc, dsymonds CC=golang-dev https://golang.org/cl/5571059
1 parent 66599c4 commit 324513b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+2560
-8
lines changed

src/pkg/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ DIRS=\
8282
exp/ebnf\
8383
exp/ebnflint\
8484
exp/gotype\
85+
exp/html\
8586
exp/norm\
8687
exp/spdy\
8788
exp/ssh\

src/pkg/exp/html/Makefile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright 2010 The Go Authors. All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
include ../../../Make.inc
6+
7+
TARG=html
8+
GOFILES=\
9+
const.go\
10+
doc.go\
11+
doctype.go\
12+
entity.go\
13+
escape.go\
14+
foreign.go\
15+
node.go\
16+
parse.go\
17+
render.go\
18+
token.go\
19+
20+
include ../../../Make.pkg
File renamed without changes.
File renamed without changes.
File renamed without changes.

src/pkg/exp/html/entity.go

Lines changed: 2253 additions & 0 deletions
Large diffs are not rendered by default.

src/pkg/exp/html/entity_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright 2010 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package html
6+
7+
import (
8+
"testing"
9+
"unicode/utf8"
10+
)
11+
12+
func TestEntityLength(t *testing.T) {
13+
// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
14+
// The +1 comes from the leading "&". This property implies that the length of
15+
// unescaped text is <= the length of escaped text.
16+
for k, v := range entity {
17+
if 1+len(k) < utf8.RuneLen(v) {
18+
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
19+
}
20+
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
21+
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
22+
}
23+
}
24+
for k, v := range entity2 {
25+
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
26+
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
27+
}
28+
}
29+
}

src/pkg/exp/html/escape.go

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
// Copyright 2010 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package html
6+
7+
import (
8+
"bytes"
9+
"strings"
10+
"unicode/utf8"
11+
)
12+
13+
// replacementTable holds the runes substituted for numeric character
// references in the range 0x80 through 0x9F, indexed by (value - 0x80).
// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
var replacementTable = [...]rune{
	0x20AC, // 0x80
	0x0081, // 0x81
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D
	0x017D, // 0x8E
	0x008F, // 0x8F
	0x0090, // 0x90
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D
	0x017E, // 0x9E
	0x0178, // 0x9F
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
52+
53+
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
54+
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55+
// Precondition: b[src] == '&' && dst <= src.
56+
// attribute should be true if parsing an attribute value.
57+
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
58+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
59+
60+
// i starts at 1 because we already know that s[0] == '&'.
61+
i, s := 1, b[src:]
62+
63+
if len(s) <= 1 {
64+
b[dst] = b[src]
65+
return dst + 1, src + 1
66+
}
67+
68+
if s[i] == '#' {
69+
if len(s) <= 3 { // We need to have at least "&#.".
70+
b[dst] = b[src]
71+
return dst + 1, src + 1
72+
}
73+
i++
74+
c := s[i]
75+
hex := false
76+
if c == 'x' || c == 'X' {
77+
hex = true
78+
i++
79+
}
80+
81+
x := '\x00'
82+
for i < len(s) {
83+
c = s[i]
84+
i++
85+
if hex {
86+
if '0' <= c && c <= '9' {
87+
x = 16*x + rune(c) - '0'
88+
continue
89+
} else if 'a' <= c && c <= 'f' {
90+
x = 16*x + rune(c) - 'a' + 10
91+
continue
92+
} else if 'A' <= c && c <= 'F' {
93+
x = 16*x + rune(c) - 'A' + 10
94+
continue
95+
}
96+
} else if '0' <= c && c <= '9' {
97+
x = 10*x + rune(c) - '0'
98+
continue
99+
}
100+
if c != ';' {
101+
i--
102+
}
103+
break
104+
}
105+
106+
if i <= 3 { // No characters matched.
107+
b[dst] = b[src]
108+
return dst + 1, src + 1
109+
}
110+
111+
if 0x80 <= x && x <= 0x9F {
112+
// Replace characters from Windows-1252 with UTF-8 equivalents.
113+
x = replacementTable[x-0x80]
114+
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
115+
// Replace invalid characters with the replacement character.
116+
x = '\uFFFD'
117+
}
118+
119+
return dst + utf8.EncodeRune(b[dst:], x), src + i
120+
}
121+
122+
// Consume the maximum number of characters possible, with the
123+
// consumed characters matching one of the named references.
124+
125+
for i < len(s) {
126+
c := s[i]
127+
i++
128+
// Lower-cased characters are more common in entities, so we check for them first.
129+
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
130+
continue
131+
}
132+
if c != ';' {
133+
i--
134+
}
135+
break
136+
}
137+
138+
entityName := string(s[1:i])
139+
if entityName == "" {
140+
// No-op.
141+
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
142+
// No-op.
143+
} else if x := entity[entityName]; x != 0 {
144+
return dst + utf8.EncodeRune(b[dst:], x), src + i
145+
} else if x := entity2[entityName]; x[0] != 0 {
146+
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
147+
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
148+
} else if !attribute {
149+
maxLen := len(entityName) - 1
150+
if maxLen > longestEntityWithoutSemicolon {
151+
maxLen = longestEntityWithoutSemicolon
152+
}
153+
for j := maxLen; j > 1; j-- {
154+
if x := entity[entityName[:j]]; x != 0 {
155+
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
156+
}
157+
}
158+
}
159+
160+
dst1, src1 = dst+i, src+i
161+
copy(b[dst:dst1], b[src:src1])
162+
return dst1, src1
163+
}
164+
165+
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
166+
func unescape(b []byte) []byte {
167+
for i, c := range b {
168+
if c == '&' {
169+
dst, src := unescapeEntity(b, i, i, false)
170+
for src < len(b) {
171+
c := b[src]
172+
if c == '&' {
173+
dst, src = unescapeEntity(b, dst, src, false)
174+
} else {
175+
b[dst] = c
176+
dst, src = dst+1, src+1
177+
}
178+
}
179+
return b[0:dst]
180+
}
181+
}
182+
return b
183+
}
184+
185+
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
// All other bytes are left as they are. The same slice is returned.
func lower(b []byte) []byte {
	// Distance between an upper-case ASCII letter and its lower-case form.
	const caseDelta = 'a' - 'A'
	for i := range b {
		if ch := b[i]; ch >= 'A' && ch <= 'Z' {
			b[i] = ch + caseDelta
		}
	}
	return b
}
194+
195+
const escapedChars = `&'<>"`
196+
197+
func escape(w writer, s string) error {
198+
i := strings.IndexAny(s, escapedChars)
199+
for i != -1 {
200+
if _, err := w.WriteString(s[:i]); err != nil {
201+
return err
202+
}
203+
var esc string
204+
switch s[i] {
205+
case '&':
206+
esc = "&amp;"
207+
case '\'':
208+
esc = "&apos;"
209+
case '<':
210+
esc = "&lt;"
211+
case '>':
212+
esc = "&gt;"
213+
case '"':
214+
esc = "&quot;"
215+
default:
216+
panic("unrecognized escape character")
217+
}
218+
s = s[i+1:]
219+
if _, err := w.WriteString(esc); err != nil {
220+
return err
221+
}
222+
i = strings.IndexAny(s, escapedChars)
223+
}
224+
_, err := w.WriteString(s)
225+
return err
226+
}
227+
228+
// EscapeString escapes special characters like "<" to become "&lt;". It
229+
// escapes only five such characters: amp, apos, lt, gt and quot.
230+
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
231+
// always true.
232+
func EscapeString(s string) string {
233+
if strings.IndexAny(s, escapedChars) == -1 {
234+
return s
235+
}
236+
buf := bytes.NewBuffer(nil)
237+
escape(buf, s)
238+
return buf.String()
239+
}
240+
241+
// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
242+
// larger range of entities than EscapeString escapes. For example, "&aacute;"
243+
// unescapes to "á", as does "&#225;" and "&xE1;".
244+
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
245+
// always true.
246+
func UnescapeString(s string) string {
247+
for _, c := range s {
248+
if c == '&' {
249+
return string(unescape([]byte(s)))
250+
}
251+
}
252+
return s
253+
}
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)