Skip to content

Commit 93d5302

Browse files
committed
fix(unicode): use surrogateescape in bytes.decode
That way, we try to decode using the default encoding (usually utf-8), but simply keep any bytes that cannot be decoded within the resulting unicode string. This allows for lossless decode/encode cycles while still ensuring that decoding never fails. NOTE: I was too lazy to create a test that would verify it, but manually executed https://github.com/petertodd/gitpython-unicode-error. fixes #532
1 parent ff389af commit 93d5302

File tree

3 files changed

+193
-7
lines changed

3 files changed

+193
-7
lines changed

Diff for: git/compat.py

+191-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import locale
1111
import os
1212
import sys
13+
import codecs
14+
1315

1416
from gitdb.utils.compat import (
1517
xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
6769
if isinstance(s, unicode):
6870
return s
6971
elif isinstance(s, bytes):
70-
return s.decode(defenc, 'replace')
72+
return s.decode(defenc, 'surrogateescape')
7173
elif s is not None:
7274
raise TypeError('Expected bytes or text, but got %r' % (s,))
7375

@@ -121,3 +123,191 @@ def __str__(self):
121123
else: # Python 2
122124
def __str__(self):
123125
return self.__unicode__().encode(defenc)
126+
127+
128+
"""
129+
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
130+
handler of Python 3.
131+
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
132+
"""
133+
134+
# This code is released under the Python license and the BSD 2-clause license
135+
136+
137+
# Name of the codec error handler that encodefilename()/decodefilename() pass
# to str.encode()/bytes.decode() and that register_surrogateescape() registers.
FS_ERRORS = 'surrogateescape'
138+
139+
# # -- Python 2/3 compatibility -------------------------------------
140+
# FS_ERRORS = 'my_surrogateescape'
141+
142+
def u(text):
    """Return ``text`` as a unicode string on both Python 2 and Python 3.

    On Python 3 the argument is returned unchanged; on Python 2 it is decoded
    with the ``unicode_escape`` codec so escapes such as ``\\udcff`` inside a
    native ``str`` literal become real code points.
    """
    return text if PY3 else text.decode('unicode_escape')
147+
148+
def b(data):
    """Return ``data`` as a byte string on both Python 2 and Python 3.

    On Python 3 the text is encoded as latin1; on Python 2 the native ``str``
    is returned unchanged.
    """
    return data.encode('latin1') if PY3 else data
153+
154+
# Version-specific helpers:
#   _unichr(code)   -> one-character unicode string for the code point
#   bytes_chr(code) -> one-byte byte string for the byte value
if PY3:
    _unichr = chr

    def bytes_chr(code):
        """Return a length-1 ``bytes`` object holding the byte value ``code``."""
        # bytes(int) would create ``code`` zero bytes, hence the 1-tuple.
        return bytes((code,))
else:
    _unichr = unichr
    bytes_chr = chr
160+
161+
def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383 "surrogateescape" error handler
    of Python 3.

    On decoding, undecodable bytes are replaced by a Unicode character U+DCxx;
    on encoding, these characters are translated back into the original bytes.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError passed in by the
        codec machinery
    :return: tuple ``(replacement, position)`` where ``position`` is
        ``exc.end``, i.e. where the codec should resume
    :raise: ``exc`` itself when it is neither of the two error types or when
        the offending input cannot be escaped
    """
    offending = exc.object[exc.start:exc.end]

    # Select the conversion for the direction in which the codec failed.
    if isinstance(exc, UnicodeDecodeError):
        # ``offending`` is a byte string in this case
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called, even though I think
        # it's doing what it should. It seems that the strict encoder is called
        # to encode the unicode string that this function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        # Nothing we can escape: surface the original codec error.
        raise exc
    return (replacement, exc.end)
186+
187+
188+
class NotASurrogateError(Exception):
    """Signal that the input cannot be handled by surrogate escaping.

    Raised by the ``replace_surrogate_*`` helpers; ``surrogateescape_handler``
    catches it and re-raises the original codec error instead.
    """


def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogates (U+DC00..U+DCFF) back to the characters
    U+0000..U+00FF they stand for.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: the unicode slice the encoder could not encode
    :return: unicode string with one character per input character
    :raise NotASurrogateError: if any character lies outside U+DC00..U+DCFF,
        so that the caller can fail with the original exception
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only the low-surrogate range produced by the decode side can be
        # mapped back. Everything else (including the high surrogates
        # U+D800..U+DBFF, for which ``code - 0xDC00`` would be negative) must
        # be reported as "not a surrogate" rather than crash with a NameError
        # or ValueError.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
219+
220+
221+
def replace_surrogate_decode(mybytes):
    """
    Returns a (unicode) string in which every byte 0x80..0xFF of ``mybytes``
    is replaced by the surrogate U+DC80..U+DCFF and every byte up to 0x7F is
    kept as the character with the same code.

    :raise NotASurrogateError: if an item has a value above 0xFF
    """
    pieces = []
    for item in mybytes:
        # Iterating yields ints for Python 3 bytes (or newbytes) and
        # one-character native strings on Python 2.
        code = item if isinstance(item, int) else ord(item)
        if code > 0xFF:
            raise NotASurrogateError
        if code >= 0x80:
            pieces.append(_unichr(0xDC00 + code))
        else:
            pieces.append(_unichr(code))
    return str().join(pieces)
244+
245+
246+
def encodefilename(fn):
    """
    Encode the unicode filename ``fn`` to bytes using FS_ENCODING, turning
    escaped surrogates U+DC80..U+DCFF back into the raw bytes 0x80..0xFF.

    The 'ascii' and 'utf-8' encodings are handled by hand; every other
    encoding is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    :raise UnicodeEncodeError: for characters that cannot be represented
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if codepoint < 128:
                chunks.append(bytes_chr(codepoint))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos+1,
                                         'ordinal not in range(128)')
        return bytes().join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        chunks = []
        for pos, char in enumerate(fn):
            codepoint = ord(char)
            if not 0xD800 <= codepoint <= 0xDFFF:
                # Ordinary character: let the real encoder handle it.
                chunks.append(char.encode('utf-8'))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos+1, 'surrogates not allowed')
        return bytes().join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)
284+
285+
def decodefilename(fn):
    """Decode the byte string ``fn`` with FS_ENCODING and the FS_ERRORS handler."""
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
287+
288+
# Active configuration: the encoding used by encodefilename()/decodefilename(),
# plus a sample raw filename and its escaped form.
# NOTE(review): `fn` and `encoded` look like leftover sample data from the
# upstream script; confirm nothing imports them before removing.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
296+
297+
298+
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only)

    Does nothing on Python 3, and nothing if a handler is already registered
    under the FS_ERRORS name.
    """
    if not PY3:
        try:
            # Raises LookupError when no handler is known under that name.
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
308+
309+
310+
# Make sure the "surrogateescape" error handler is available at import time.
#
# Probing with a decode of valid ASCII cannot detect a missing handler: codecs
# only look up the error handler once a decoding error actually occurs, so the
# probe succeeds on Python 2 and the handler would never get registered.
# register_surrogateescape() already returns immediately on Python 3 and when a
# handler is registered under that name, so it is safe to call unconditionally.
register_surrogateescape()

Diff for: git/objects/fun.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,7 @@ def tree_entries_from_data(data):
7676
# default encoding for strings in git is utf8
7777
# Only use the respective unicode object if the byte stream was encoded
7878
name = data[ns:i]
79-
try:
80-
name = name.decode(defenc)
81-
except UnicodeDecodeError:
82-
pass
83-
# END handle encoding
79+
name = name.decode(defenc, 'surrogateescape')
8480

8581
# byte is NULL, get next 20
8682
i += 1

Diff for: git/test/performance/test_commit.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_iteration(self):
5252
# END for each object
5353
# END for each commit
5454
elapsed_time = time() - st
55-
print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
55+
print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
5656
% (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr)
5757

5858
def test_commit_traversal(self):

0 commit comments

Comments
 (0)