Skip to content

Commit 5d6fbc0

Browse files
authored
Fix matching empty strings next to multibyte characters (#109)
1 parent 686c823 commit 5d6fbc0

File tree

2 files changed

+42
-34
lines changed

2 files changed

+42
-34
lines changed

find_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ var findTests = []FindTest{
103103
{`(?i)\W`, "k", nil},
104104
{`(?i)\W`, "s", nil},
105105

106+
// Multibyte characters -- verify that we don't try to match in the middle
107+
// of a character.
108+
{"[a-c]*", "\u65e5", build(2, 0, 0, 3, 3)},
109+
{"[^\u65e5]", "abc\u65e5def", build(6, 0, 1, 1, 2, 2, 3, 6, 7, 7, 8, 8, 9)},
110+
106111
// can backslash-escape any punctuation
107112
{
108113
`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,

internal/re2.go

Lines changed: 37 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ func (re *Regexp) FindAll(b []byte, n int) [][]byte {
286286

287287
var matches [][]byte
288288

289-
re.findAll(&alloc, cs, n, func(match []int) {
289+
re.findAll(&alloc, b, "", cs, n, func(match []int) {
290290
matches = append(matches, matchedBytes(b, match))
291291
})
292292

@@ -305,7 +305,7 @@ func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
305305

306306
var matches [][]int
307307

308-
re.findAll(&alloc, cs, n, func(match []int) {
308+
re.findAll(&alloc, b, "", cs, n, func(match []int) {
309309
matches = append(matches, append([]int(nil), match...))
310310
})
311311

@@ -326,7 +326,7 @@ func (re *Regexp) FindAllString(s string, n int) []string {
326326

327327
var matches []string
328328

329-
re.findAll(&alloc, cs, n, func(match []int) {
329+
re.findAll(&alloc, nil, s, cs, n, func(match []int) {
330330
matches = append(matches, matchedString(s, match))
331331
})
332332

@@ -345,7 +345,7 @@ func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
345345

346346
var matches [][]int
347347

348-
re.findAll(&alloc, cs, n, func(match []int) {
348+
re.findAll(&alloc, nil, s, cs, n, func(match []int) {
349349
matches = append(matches, append([]int(nil), match...))
350350
})
351351

@@ -354,7 +354,7 @@ func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
354354
return res
355355
}
356356

357-
func (re *Regexp) findAll(alloc *allocation, cs cString, n int, deliver func(match []int)) {
357+
func (re *Regexp) findAll(alloc *allocation, bsrc []byte, src string, cs cString, n int, deliver func(match []int)) {
358358
var dstCap [2]int
359359

360360
if n < 0 {
@@ -372,24 +372,22 @@ func (re *Regexp) findAll(alloc *allocation, cs cString, n int, deliver func(mat
372372
break
373373
}
374374

375-
matches := readMatch(alloc, cs, matchArr.ptr, dstCap[:0])
375+
match := readMatch(alloc, cs, matchArr.ptr, dstCap[:0])
376376
accept := true
377-
if matches[0] == matches[1] {
378-
// We've found an empty match.
379-
if matches[0] == prevMatchEnd {
380-
// We don't allow an empty match right
381-
// after a previous match, so ignore it.
382-
accept = false
383-
}
384-
pos++
385-
} else {
386-
pos = matches[1]
377+
// Check if it's an empty match following a match, which we ignore.
378+
if match[0] == match[1] && match[0] == prevMatchEnd {
379+
// We don't allow an empty match right
380+
// after a previous match, so ignore it.
381+
accept = false
387382
}
383+
384+
pos = nextPos(bsrc, src, pos, match[1])
385+
388386
if accept {
389-
deliver(matches)
387+
deliver(match)
390388
count++
391389
}
392-
prevMatchEnd = matches[1]
390+
prevMatchEnd = match[1]
393391

394392
if count == n {
395393
break
@@ -514,23 +512,8 @@ func (re *Regexp) findAllSubmatch(alloc *allocation, bsrc []byte, src string, cs
514512
if match[0] == match[1] && match[0] == prevMatchEnd {
515513
accept = false
516514
}
517-
// Advance past this match; always advance at least one character.
518-
var width int
519-
if bsrc != nil {
520-
_, width = utf8.DecodeRune(bsrc[pos:])
521-
} else {
522-
_, width = utf8.DecodeRuneInString(src[pos:])
523-
}
524515

525-
if pos+width > match[1] {
526-
pos += width
527-
} else if pos+1 > match[1] {
528-
// This clause is only needed at the end of the input
529-
// string. In that case, DecodeRuneInString returns width=0.
530-
pos++
531-
} else {
532-
pos = match[1]
533-
}
516+
pos = nextPos(bsrc, src, pos, match[1])
534517
prevMatchEnd = match[1]
535518
}
536519
if accept {
@@ -1032,6 +1015,26 @@ func matchedString(s string, match []int) string {
10321015
return s[match[0]:match[1]]
10331016
}
10341017

1018+
// nextPos returns the offset at which the next search should start, given
// that the search which started at pos produced a match ending at matchEnd.
//
// The input is read from bsrc when bsrc is non-nil and from src otherwise.
// The result is matchEnd when the match extends at least to the end of the
// character found at pos. Otherwise it is the offset just past that
// character, so the next search never begins in the middle of a multibyte
// character. At the end of the input, where nothing is left to decode, the
// result is pos+1. In every case the result is strictly greater than pos.
func nextPos(bsrc []byte, src string, pos int, matchEnd int) int {
	var width int
	if bsrc != nil {
		_, width = utf8.DecodeRune(bsrc[pos:])
	} else {
		_, width = utf8.DecodeRuneInString(src[pos:])
	}

	// The decoders report width 0 only when handed empty input, i.e. when
	// pos is at the end of the source. Step a single byte in that case so
	// that the caller always makes forward progress.
	if width == 0 {
		width = 1
	}

	// Skip at least one whole character, or the whole match if it is longer.
	if next := pos + width; next > matchEnd {
		return next
	}
	return matchEnd
}
}
1037+
10351038
func QuoteForError(s string) string {
10361039
if strconv.CanBackquote(s) {
10371040
return "`" + s + "`"

0 commit comments

Comments
 (0)