Skip to content

Commit 5b92028

Browse files
nightlyone bradfitz
authored and committed Aug 22, 2015
html: speed up UnescapeString
Add benchmarks for sparsely escaped and densely escaped strings. Then speed up the sparse unescaping part heavily by using IndexByte and copy to skip the parts containing no escaping very fast.

Unescaping densely escaped strings is slower because of the new function call overhead. But sparsely encoded strings are seen more often in the UTF-8-enabled web.

We win part of the speed back by looking up entityName differently.

benchmark                  old ns/op    new ns/op    delta
BenchmarkEscape            31680        31396        -0.90%
BenchmarkEscapeNone        6507         6872         +5.61%
BenchmarkUnescape          36481        48298        +32.39%
BenchmarkUnescapeNone      332          325          -2.11%
BenchmarkUnescapeSparse    8836         3221         -63.55%
BenchmarkUnescapeDense     30639        32224        +5.17%

Change-Id: If606cb01897a40eefe35ba98f2ff23bb25251606
Reviewed-on: https://go-review.googlesource.com/10172
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
1 parent 5f859ba commit 5b92028

File tree

2 files changed

+49
-31
lines changed

2 files changed

+49
-31
lines changed
 

‎src/html/escape.go

+31-29
Original file line number | Diff line number | Diff line change
@@ -57,8 +57,9 @@ var replacementTable = [...]rune{
5757
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
5858
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
5959
// Precondition: b[src] == '&' && dst <= src.
60-
// attribute should be true if parsing an attribute value.
61-
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
60+
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
61+
const attribute = false
62+
6263
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
6364

6465
// i starts at 1 because we already know that s[0] == '&'.
@@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
139140
break
140141
}
141142

142-
entityName := string(s[1:i])
143-
if entityName == "" {
143+
entityName := s[1:i]
144+
if len(entityName) == 0 {
144145
// No-op.
145146
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
146147
// No-op.
147-
} else if x := entity[entityName]; x != 0 {
148+
} else if x := entity[string(entityName)]; x != 0 {
148149
return dst + utf8.EncodeRune(b[dst:], x), src + i
149-
} else if x := entity2[entityName]; x[0] != 0 {
150+
} else if x := entity2[string(entityName)]; x[0] != 0 {
150151
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
151152
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
152153
} else if !attribute {
@@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
155156
maxLen = longestEntityWithoutSemicolon
156157
}
157158
for j := maxLen; j > 1; j-- {
158-
if x := entity[entityName[:j]]; x != 0 {
159+
if x := entity[string(entityName[:j])]; x != 0 {
159160
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
160161
}
161162
}
@@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
166167
return dst1, src1
167168
}
168169

169-
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
170-
func unescape(b []byte) []byte {
171-
for i, c := range b {
172-
if c == '&' {
173-
dst, src := unescapeEntity(b, i, i, false)
174-
for src < len(b) {
175-
c := b[src]
176-
if c == '&' {
177-
dst, src = unescapeEntity(b, dst, src, false)
178-
} else {
179-
b[dst] = c
180-
dst, src = dst+1, src+1
181-
}
182-
}
183-
return b[0:dst]
184-
}
185-
}
186-
return b
187-
}
188-
189170
var htmlEscaper = strings.NewReplacer(
190171
`&`, "&amp;",
191172
`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
@@ -208,8 +189,29 @@ func EscapeString(s string) string {
208189
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
209190
// always true.
210191
func UnescapeString(s string) string {
211-
if !strings.Contains(s, "&") {
192+
i := strings.IndexByte(s, '&')
193+
194+
if i < 0 {
212195
return s
213196
}
214-
return string(unescape([]byte(s)))
197+
198+
b := []byte(s)
199+
dst, src := unescapeEntity(b, i, i)
200+
for len(s[src:]) > 0 {
201+
if s[src] == '&' {
202+
i = 0
203+
} else {
204+
i = strings.IndexByte(s[src:], '&')
205+
}
206+
if i < 0 {
207+
dst += copy(b[dst:], s[src:])
208+
break
209+
}
210+
211+
if i > 0 {
212+
copy(b[dst:], s[src:src+i])
213+
}
214+
dst, src = unescapeEntity(b, dst+i, src+i)
215+
}
216+
return string(b[:dst])
215217
}

‎src/html/escape_test.go

+18-2
Original file line number | Diff line number | Diff line change
@@ -118,8 +118,10 @@ func TestUnescapeEscape(t *testing.T) {
118118
}
119119

120120
var (
121-
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
122-
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
121+
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
122+
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
123+
benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)
124+
benchUnescapeDense = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
123125
)
124126

125127
func BenchmarkEscape(b *testing.B) {
@@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) {
151153
n += len(UnescapeString(s))
152154
}
153155
}
156+
157+
func BenchmarkUnescapeSparse(b *testing.B) {
158+
n := 0
159+
for i := 0; i < b.N; i++ {
160+
n += len(UnescapeString(benchUnescapeSparse))
161+
}
162+
}
163+
164+
func BenchmarkUnescapeDense(b *testing.B) {
165+
n := 0
166+
for i := 0; i < b.N; i++ {
167+
n += len(UnescapeString(benchUnescapeDense))
168+
}
169+
}

0 commit comments

Comments
 (0)
Please sign in to comment.