@@ -51,45 +51,7 @@ func lex(ctx token.Context, errs *report.Report) {
51
51
})
52
52
defer l .Freeze ()
53
53
54
- // Check that the file isn't too big. We give up immediately if that's
55
- // the case.
56
- if len (l .Text ()) > maxFileSize {
57
- l .Errorf ("files larger than 2GB (%d bytes) are not supported" , maxFileSize ).Apply (
58
- report .InFile (l .Path ()),
59
- )
60
- return
61
- }
62
-
63
- // Also check that the text of the file is actually UTF-8.
64
- // We go rune by rune to find the first invalid offset.
65
- var idx int
66
- var count int
67
- stringsx .Runes (l .Text ())(func (n int , r rune ) bool {
68
- if r == - 1 {
69
- if count == 0 {
70
- idx = n
71
- }
72
- count ++
73
- }
74
- return true
75
- })
76
- switch {
77
- case count == 0 :
78
- break
79
- case count < 32 :
80
- // This diagnostic is for cases where there appear to be one or two
81
- // stray, non-UTF-8 values.
82
- l .Errorf ("input contains non-UTF-8 byte" ).Apply (
83
- report .Snippet (l .Span (idx , idx + 1 )),
84
- report .Notef ("invalid byte occurs at offset %d (%#x)" , idx , idx ),
85
- report .Notef ("Protobuf files must be UTF-8 encoded" ),
86
- )
87
- return
88
- default :
89
- l .Errorf ("input appears to be a binary file" ).Apply (
90
- report .InFile (l .Path ()),
91
- report .Notef ("invalid byte occurs at offset %d (%#x)" , idx , idx ),
92
- )
54
+ if ! lexPrelude (l ) {
93
55
return
94
56
}
95
57
@@ -230,6 +192,77 @@ func lex(ctx token.Context, errs *report.Report) {
230
192
fuseStrings (l )
231
193
}
232
194
195
+ // lexPrelude performs various file-prelude checks, such as size and encoding
196
+ // verification. Returns whether lexing should proceed.
197
+ func lexPrelude (l * lexer ) bool {
198
+ // Check that the file isn't too big. We give up immediately if that's
199
+ // the case.
200
+ if len (l .Text ()) > maxFileSize {
201
+ l .Errorf ("files larger than 2GB (%d bytes) are not supported" , maxFileSize ).Apply (
202
+ report .InFile (l .Path ()),
203
+ )
204
+ return false
205
+ }
206
+
207
+ // Heuristically check for a UTF-16-encoded file. There are two good
208
+ // heuristics:
209
+ // 1. Presence of a UTF-16 BOM, which is either FE FF or FF FE, depending on
210
+ // endianness.
211
+ // 2. Exactly one of the first two bytes is a NUL. Valid Protobuf cannot
212
+ // contain a NUL in the first two bytes, so this is probably a UTF-16-encoded
213
+ // ASCII rune.
214
+ bom16 := strings .HasPrefix (l .Text (), "\xfe \xff " ) || strings .HasPrefix (l .Text (), "\xff \xfe " )
215
+ ascii16 := len (l .Text ()) >= 2 && (l .Text ()[0 ] == 0 || l .Text ()[1 ] == 0 )
216
+ if bom16 || ascii16 {
217
+ l .Errorf ("input appears to be encoded with UTF-16" ).Apply (
218
+ report .InFile (l .Path ()),
219
+ report .Notef ("Protobuf files must be UTF-8 encoded" ),
220
+ )
221
+ return false
222
+ }
223
+
224
+ // Check that the text of the file is actually UTF-8.
225
+ var idx int
226
+ var count int
227
+ stringsx .Runes (l .Text ())(func (n int , r rune ) bool {
228
+ if r == - 1 {
229
+ if count == 0 {
230
+ idx = n
231
+ }
232
+ count ++
233
+ }
234
+ return true
235
+ })
236
+ frac := float64 (count ) / float64 (len (l .Text ()))
237
+ switch {
238
+ case frac == 0 :
239
+ break
240
+ case frac < 0.2 :
241
+ // This diagnostic is for cases where this file appears to be corrupt.
242
+ // We pick 20% non-UTF-8 as the threshold to show this error.
243
+ l .Errorf ("input appears to be encoded with UTF-8, but found invalid byte" ).Apply (
244
+ report .Snippet (l .Span (idx , idx + 1 )),
245
+ report .Notef ("non-UTF-8 byte occurs at offset %d (%#x)" , idx , idx ),
246
+ report .Notef ("Protobuf files must be UTF-8 encoded" ),
247
+ )
248
+ return false
249
+ default :
250
+ l .Errorf ("input appears to be a binary file" ).Apply (
251
+ report .InFile (l .Path ()),
252
+ report .Notef ("non-UTF-8 byte occurs at offset %d (%#x)" , idx , idx ),
253
+ report .Notef ("Protobuf files must be UTF-8 encoded" ),
254
+ )
255
+ return false
256
+ }
257
+
258
+ if l .Peek () == '\uFEFF' {
259
+ l .Pop () // Peel off a leading UTF-8 BOM.
260
+ l .Push (3 , token .Unrecognized )
261
+ }
262
+
263
+ return true
264
+ }
265
+
233
266
// fuseBraces performs brace matching and token fusion, based on the contents of
234
267
// l.braces.
235
268
func fuseBraces (l * lexer ) {
0 commit comments