Skip to content

Commit c1b3154

Browse files
authored Apr 11, 2024
Merge pull request #172 from kjd/optimize-contextj
More efficient resolution of joiner contexts
2 parents cd58a23 + 0394ec7 commit c1b3154

File tree

3 files changed

+2164
-62
lines changed

3 files changed

+2164
-62
lines changed
 

‎idna/core.py

+8-8
Original file line number | Diff line number | Diff line change
@@ -150,9 +150,11 @@ def valid_contextj(label: str, pos: int) -> bool:
150150
joining_type = idnadata.joining_types.get(ord(label[i]))
151151
if joining_type == ord('T'):
152152
continue
153-
if joining_type in [ord('L'), ord('D')]:
153+
elif joining_type in [ord('L'), ord('D')]:
154154
ok = True
155155
break
156+
else:
157+
break
156158

157159
if not ok:
158160
return False
@@ -162,9 +164,11 @@ def valid_contextj(label: str, pos: int) -> bool:
162164
joining_type = idnadata.joining_types.get(ord(label[i]))
163165
if joining_type == ord('T'):
164166
continue
165-
if joining_type in [ord('R'), ord('D')]:
167+
elif joining_type in [ord('R'), ord('D')]:
166168
ok = True
167169
break
170+
else:
171+
break
168172
return ok
169173

170174
if cp_value == 0x200d:
@@ -236,12 +240,8 @@ def check_label(label: Union[str, bytes, bytearray]) -> None:
236240
if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
237241
continue
238242
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
239-
try:
240-
if not valid_contextj(label, pos):
241-
raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
242-
_unot(cp_value), pos+1, repr(label)))
243-
except ValueError:
244-
raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
243+
if not valid_contextj(label, pos):
244+
raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
245245
_unot(cp_value), pos+1, repr(label)))
246246
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
247247
if not valid_contexto(label, pos):

‎idna/idnadata.py

+2,146-51
Large diffs are not rendered by default.

‎tools/idna-data

+10-3
Original file line number | Diff line number | Diff line change
@@ -246,11 +246,18 @@ class UnicodeData(object):
246246
def _load_arabicshaping(self):
247247

248248
self.ucd_as = {}
249-
f_as = self._ucdfile('ArabicShaping.txt')
249+
f_as = self._ucdfile('extracted/DerivedJoiningType.txt')
250250
for line in f_as.splitlines():
251-
result = re.match(r'^(?P<cp>[0-9A-F]{4,6})\s*;\s*.*?\s*;\s*(?P<jt>\S+)\s*;', line)
251+
result = re.match(
252+
r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<jt>\S+)\s*(|\#.*)$',
253+
line)
252254
if result:
253-
self.ucd_as[int(result.group('cp'), 16)] = result.group('jt')
255+
if result.group('end'):
256+
for i in hexrange(result.group('start'), result.group('end')):
257+
self.ucd_as[i] = result.group('jt')
258+
else:
259+
i = hexvalue(result.group('start'))
260+
self.ucd_as[i] = result.group('jt')
254261

255262
def _load_scripts(self):
256263

0 commit comments

Comments
 (0)
Please sign in to comment.