Skip to content

Commit 5f255b2

Browse files
committed
add encoding diagnostics
1 parent b4682dc commit 5f255b2

16 files changed

+144
-48
lines changed

experimental/parser/lex.go

Lines changed: 72 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -51,45 +51,7 @@ func lex(ctx token.Context, errs *report.Report) {
5151
})
5252
defer l.Freeze()
5353

54-
// Check that the file isn't too big. We give up immediately if that's
55-
// the case.
56-
if len(l.Text()) > maxFileSize {
57-
l.Errorf("files larger than 2GB (%d bytes) are not supported", maxFileSize).Apply(
58-
report.InFile(l.Path()),
59-
)
60-
return
61-
}
62-
63-
// Also check that the text of the file is actually UTF-8.
64-
// We go rune by rune to find the first invalid offset.
65-
var idx int
66-
var count int
67-
stringsx.Runes(l.Text())(func(n int, r rune) bool {
68-
if r == -1 {
69-
if count == 0 {
70-
idx = n
71-
}
72-
count++
73-
}
74-
return true
75-
})
76-
switch {
77-
case count == 0:
78-
break
79-
case count < 32:
80-
// This diagnostic is for cases where there appear to be one or two
81-
// stray, non-UTF-8 values.
82-
l.Errorf("input contains non-UTF-8 byte").Apply(
83-
report.Snippet(l.Span(idx, idx+1)),
84-
report.Notef("invalid byte occurs at offset %d (%#x)", idx, idx),
85-
report.Notef("Protobuf files must be UTF-8 encoded"),
86-
)
87-
return
88-
default:
89-
l.Errorf("input appears to be a binary file").Apply(
90-
report.InFile(l.Path()),
91-
report.Notef("invalid byte occurs at offset %d (%#x)", idx, idx),
92-
)
54+
if !lexPrelude(l) {
9355
return
9456
}
9557

@@ -230,6 +192,77 @@ func lex(ctx token.Context, errs *report.Report) {
230192
fuseStrings(l)
231193
}
232194

195+
// lexPrelude performs various file-prelude checks, such as size and encoding
196+
// verification. Returns whether lexing should proceed.
197+
func lexPrelude(l *lexer) bool {
198+
// Check that the file isn't too big. We give up immediately if that's
199+
// the case.
200+
if len(l.Text()) > maxFileSize {
201+
l.Errorf("files larger than 2GB (%d bytes) are not supported", maxFileSize).Apply(
202+
report.InFile(l.Path()),
203+
)
204+
return false
205+
}
206+
207+
// Heuristically check for a UTF-16-encoded file. There are two good
208+
// heuristics:
209+
// 1. Presence of a UTF-16 BOM, which is either FE FF or FF FE, depending on
210+
// endianness.
211+
// 2. Exactly one of the first two bytes is a NUL. Valid Protobuf cannot
212+
// contain a NUL in the first two bytes, so this is probably a UTF-16-encoded
213+
// ASCII rune.
214+
bom16 := strings.HasPrefix(l.Text(), "\xfe\xff") || strings.HasPrefix(l.Text(), "\xff\xfe")
215+
ascii16 := len(l.Text()) >= 2 && (l.Text()[0] == 0 || l.Text()[1] == 0)
216+
if bom16 || ascii16 {
217+
l.Errorf("input appears to be encoded with UTF-16").Apply(
218+
report.InFile(l.Path()),
219+
report.Notef("Protobuf files must be UTF-8 encoded"),
220+
)
221+
return false
222+
}
223+
224+
// Check that the text of the file is actually UTF-8.
225+
var idx int
226+
var count int
227+
stringsx.Runes(l.Text())(func(n int, r rune) bool {
228+
if r == -1 {
229+
if count == 0 {
230+
idx = n
231+
}
232+
count++
233+
}
234+
return true
235+
})
236+
frac := float64(count) / float64(len(l.Text()))
237+
switch {
238+
case frac == 0:
239+
break
240+
case frac < 0.2:
241+
// This diagnostic is for cases where this file appears to be corrupt.
242+
// We pick 20% non-UTF-8 as the threshold to show this error.
243+
l.Errorf("input appears to be encoded with UTF-8, but found invalid byte").Apply(
244+
report.Snippet(l.Span(idx, idx+1)),
245+
report.Notef("non-UTF-8 byte occurs at offset %d (%#x)", idx, idx),
246+
report.Notef("Protobuf files must be UTF-8 encoded"),
247+
)
248+
return false
249+
default:
250+
l.Errorf("input appears to be a binary file").Apply(
251+
report.InFile(l.Path()),
252+
report.Notef("non-UTF-8 byte occurs at offset %d (%#x)", idx, idx),
253+
report.Notef("Protobuf files must be UTF-8 encoded"),
254+
)
255+
return false
256+
}
257+
258+
if l.Peek() == '\uFEFF' {
259+
l.Pop() // Peel off a leading UTF-8 BOM.
260+
l.Push(3, token.Unrecognized)
261+
}
262+
263+
return true
264+
}
265+
233266
// fuseBraces performs brace matching and token fusion, based on the contents of
234267
// l.braces.
235268
func fuseBraces(l *lexer) {
Binary file not shown.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
error: input appears to be a binary file
2+
--> testdata/lexer/encoding/random.proto
3+
= note: non-UTF-8 byte occurs at offset 5 (0x5)
4+
= note: Protobuf files must be UTF-8 encoded
5+
6+
encountered 1 error
Binary file not shown.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
error: input appears to be encoded with UTF-16
2+
--> testdata/lexer/encoding/utf16be-bom.proto
3+
= note: Protobuf files must be UTF-8 encoded
4+
5+
encountered 1 error
Binary file not shown.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
error: input appears to be encoded with UTF-16
2+
--> testdata/lexer/encoding/utf16be.proto
3+
= note: Protobuf files must be UTF-8 encoded
4+
5+
encountered 1 error
Binary file not shown.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
error: input appears to be encoded with UTF-16
2+
--> testdata/lexer/encoding/utf16le-bom.proto
3+
= note: Protobuf files must be UTF-8 encoded
4+
5+
encountered 1 error
Binary file not shown.

0 commit comments

Comments
 (0)