Skip to content

Commit 0d362b1

Browse files
committed
Report more accurate UCD version in unicodedata using icu4j, use icu4j for all operations in unicodedata module
PullRequest: graalpython/3559
2 parents edf0b97 + 7771462 commit 0d362b1

File tree

1 file changed

+43
-121
lines changed

1 file changed

+43
-121
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/UnicodeDataModuleBuiltins.java

Lines changed: 43 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -43,11 +43,12 @@
4343
import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError;
4444
import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
4545

46-
import java.text.Normalizer;
4746
import java.util.List;
4847

4948
import org.graalvm.shadowed.com.ibm.icu.lang.UCharacter;
5049
import org.graalvm.shadowed.com.ibm.icu.lang.UProperty;
50+
import org.graalvm.shadowed.com.ibm.icu.text.Normalizer2;
51+
import org.graalvm.shadowed.com.ibm.icu.util.VersionInfo;
5152

5253
import com.oracle.graal.python.annotations.ArgumentClinic;
5354
import com.oracle.graal.python.builtins.Builtin;
@@ -64,7 +65,9 @@
6465
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
6566
import com.oracle.truffle.api.dsl.Bind;
6667
import com.oracle.truffle.api.dsl.Cached;
68+
import com.oracle.truffle.api.dsl.Cached.Exclusive;
6769
import com.oracle.truffle.api.dsl.GenerateNodeFactory;
70+
import com.oracle.truffle.api.dsl.ImportStatic;
6871
import com.oracle.truffle.api.dsl.NodeFactory;
6972
import com.oracle.truffle.api.dsl.Specialization;
7073
import com.oracle.truffle.api.nodes.Node;
@@ -78,90 +81,10 @@ protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFa
7881
}
7982

8083
public static String getUnicodeVersion() {
81-
82-
// Preliminary Unicode 11 data obtained from
83-
// <https://www.unicode.org/Public/11.0.0/ucd/DerivedAge-11.0.0d13.txt>.
84-
if (Character.getType('\u0560') != Character.UNASSIGNED) {
85-
return "11.0.0"; // 11.0, June 2018.
86-
}
87-
88-
if (Character.getType('\u0860') != Character.UNASSIGNED) {
89-
return "10.0.0"; // 10.0, June 2017.
90-
}
91-
92-
if (Character.getType('\u08b6') != Character.UNASSIGNED) {
93-
return "9.0.0"; // 9.0, June 2016.
94-
}
95-
96-
if (Character.getType('\u08b3') != Character.UNASSIGNED) {
97-
return "8.0.0"; // 8.0, June 2015.
98-
}
99-
100-
if (Character.getType('\u037f') != Character.UNASSIGNED) {
101-
return "7.0.0"; // 7.0, June 2014.
102-
}
103-
104-
if (Character.getType('\u061c') != Character.UNASSIGNED) {
105-
return "6.3.0"; // 6.3, September 2013.
106-
}
107-
108-
if (Character.getType('\u20ba') != Character.UNASSIGNED) {
109-
return "6.2.0"; // 6.2, September 2012.
110-
}
111-
112-
if (Character.getType('\u058f') != Character.UNASSIGNED) {
113-
return "6.1.0"; // 6.1, January 2012.
114-
}
115-
116-
if (Character.getType('\u0526') != Character.UNASSIGNED) {
117-
return "6.0.0"; // 6.0, October 2010.
118-
}
119-
120-
if (Character.getType('\u0524') != Character.UNASSIGNED) {
121-
return "5.2.0"; // 5.2, October 2009.
122-
}
123-
124-
if (Character.getType('\u0370') != Character.UNASSIGNED) {
125-
return "5.1.0"; // 5.1, March 2008.
126-
}
127-
128-
if (Character.getType('\u0242') != Character.UNASSIGNED) {
129-
return "5.0.0"; // 5.0, July 2006.
130-
}
131-
132-
if (Character.getType('\u0237') != Character.UNASSIGNED) {
133-
return "4.1.0"; // 4.1, March 2005.
134-
}
135-
136-
if (Character.getType('\u0221') != Character.UNASSIGNED) {
137-
return "4.0.0"; // 4.0, April 2003.
138-
}
139-
140-
if (Character.getType('\u0220') != Character.UNASSIGNED) {
141-
return "3.2.0"; // 3.2, March 2002.
142-
}
143-
144-
if (Character.getType('\u03f4') != Character.UNASSIGNED) {
145-
return "3.1.0"; // 3.1, March 2001.
146-
}
147-
148-
if (Character.getType('\u01f6') != Character.UNASSIGNED) {
149-
return "3.0.0"; // 3.0, September 1999.
150-
}
151-
152-
if (Character.getType('\u20ac') != Character.UNASSIGNED) {
153-
return "2.1.0"; // 2.1, May 1998.
154-
}
155-
156-
if (Character.getType('\u0591') != Character.UNASSIGNED) {
157-
return "2.0.0"; // 2.0, July 1996.
158-
}
159-
160-
if (Character.getType('\u0000') != Character.UNASSIGNED) {
161-
return "1.1.0"; // 1.1, June 1993.
162-
}
163-
164-
return "1.0.0"; // 1.0
84+
VersionInfo version = UCharacter.getUnicodeVersion();
85+
return Integer.toString(version.getMajor()) + '.' +
86+
version.getMinor() + '.' +
87+
version.getMicro();
16588
}
16689

16790
/**
@@ -186,39 +109,44 @@ public void initialize(Python3Core core) {
186109
addBuiltinConstant("unidata_version", getUnicodeVersion());
187110
}
188111

112+
// Number of normalization forms that getNormalizer recognises (NFC, NFKC, NFD, NFKD).
// Referenced by name as the cache "limit" of the form-caching specializations below.
static final int NORMALIZER_FORM_COUNT = 4;
113+
114+
@TruffleBoundary
115+
static Normalizer2 getNormalizer(TruffleString form) {
116+
return switch (form.toJavaStringUncached()) {
117+
case "NFC" -> Normalizer2.getNFCInstance();
118+
case "NFKC" -> Normalizer2.getNFKCInstance();
119+
case "NFD" -> Normalizer2.getNFDInstance();
120+
case "NFKD" -> Normalizer2.getNFKDInstance();
121+
default -> null;
122+
};
123+
}
124+
189125
// unicodedata.normalize(form, unistr)
190126
@Builtin(name = "normalize", minNumOfPositionalArgs = 2, parameterNames = {"form", "unistr"})
191127
@ArgumentClinic(name = "form", conversion = ArgumentClinic.ClinicConversion.TString)
192128
@ArgumentClinic(name = "unistr", conversion = ArgumentClinic.ClinicConversion.TString)
193129
@GenerateNodeFactory
130+
@ImportStatic(UnicodeDataModuleBuiltins.class)
194131
public abstract static class NormalizeNode extends PythonBinaryClinicBuiltinNode {
195-
@TruffleBoundary
196-
protected Normalizer.Form getForm(TruffleString form) {
197-
try {
198-
return Normalizer.Form.valueOf(form.toJavaStringUncached());
199-
} catch (IllegalArgumentException e) {
200-
return null;
201-
}
202-
}
203-
204-
@Specialization(guards = {"stringEquals(form, cachedForm, equalNode)"}, limit = "4")
132+
// Cached specialization for unicodedata.normalize(form, unistr).
// In this diff view, rows following an "NNN-" gutter marker are the replaced
// java.text.Normalizer version; rows following an "NNN+" marker are the current code.
// Current behaviour: the form string seen on first execution is cached together with the
// Normalizer2 returned by getNormalizer(cachedForm). The specialization only applies when
// that normalizer is non-null and the incoming form equals the cached one; at most
// NORMALIZER_FORM_COUNT such cache entries are created.
// The input is converted to a Java String, normalized, and converted back to a
// TruffleString using TS_ENCODING.
@Specialization(guards = {"cachedNormalizer != null", "stringEquals(form, cachedForm, equalNode)"}, limit = "NORMALIZER_FORM_COUNT")
205133
static TruffleString normalize(@SuppressWarnings("unused") TruffleString form, TruffleString unistr,
206-
@Bind("this") Node inliningTarget,
207134
@SuppressWarnings("unused") @Cached("form") TruffleString cachedForm,
208-
@Cached("getForm(cachedForm)") Normalizer.Form cachedNormForm,
135+
@Cached("getNormalizer(cachedForm)") Normalizer2 cachedNormalizer,
209136
@SuppressWarnings("unused") @Cached TruffleString.EqualNode equalNode,
210137
@Cached TruffleString.ToJavaStringNode toJavaStringNode,
211-
@Cached TruffleString.FromJavaStringNode fromJavaStringNode,
212-
@Cached PRaiseNode.Lazy raiseNode) {
213-
if (cachedNormForm == null) {
214-
throw raiseNode.get(inliningTarget).raise(ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
215-
}
216-
return fromJavaStringNode.execute(normalize(toJavaStringNode.execute(unistr), cachedNormForm), TS_ENCODING);
138+
@Exclusive @Cached TruffleString.FromJavaStringNode fromJavaStringNode) {
139+
return fromJavaStringNode.execute(normalize(toJavaStringNode.execute(unistr), cachedNormalizer), TS_ENCODING);
140+
}
141+
142+
// Error path for unicodedata.normalize: selected when getNormalizer(form) returns null,
// i.e. the form is none of NFC, NFKC, NFD, NFKD. Always raises ValueError with
// INVALID_NORMALIZATION_FORM and never returns a value.
// The guard re-evaluates getNormalizer(form) on each execution of this specialization.
@Specialization(guards = "getNormalizer(form) == null")
143+
TruffleString invalidForm(@SuppressWarnings("unused") TruffleString form, @SuppressWarnings("unused") TruffleString unistr) {
144+
throw PRaiseNode.raiseUncached(this, ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
217145
}
218146

219147
@TruffleBoundary
220-
private static String normalize(String str, Normalizer.Form normForm) {
221-
return Normalizer.normalize(str, normForm);
148+
private static String normalize(String str, Normalizer2 normalizer) {
149+
return normalizer.normalize(str);
222150
}
223151

224152
@Override
@@ -232,26 +160,20 @@ protected ArgumentClinicProvider getArgumentClinic() {
232160
@ArgumentClinic(name = "form", conversion = ArgumentClinic.ClinicConversion.TString)
233161
@ArgumentClinic(name = "unistr", conversion = ArgumentClinic.ClinicConversion.TString)
234162
@GenerateNodeFactory
163+
@ImportStatic(UnicodeDataModuleBuiltins.class)
235164
public abstract static class IsNormalizedNode extends PythonBinaryClinicBuiltinNode {
236-
@TruffleBoundary
237-
protected Normalizer.Form getForm(TruffleString form) {
238-
try {
239-
return Normalizer.Form.valueOf(form.toJavaStringUncached());
240-
} catch (IllegalArgumentException e) {
241-
return null;
242-
}
243-
}
244-
245-
@Specialization(guards = {"stringEquals(form, cachedForm, equalNode)"}, limit = "4")
165+
// Cached specialization for the is-normalized check taking (form, unistr).
// In this diff view, rows following an "NNN-" gutter marker are the replaced
// java.text.Normalizer version; rows following an "NNN+" marker are the current code.
// Current behaviour: caches the form string and the Normalizer2 from
// getNormalizer(cachedForm); applies only when that normalizer is non-null and the
// incoming form equals the cached one, with at most NORMALIZER_FORM_COUNT cache entries.
// The whole method runs behind a TruffleBoundary and converts unistr with the uncached
// toJavaStringUncached() before asking the normalizer whether it is already normalized.
@Specialization(guards = {"cachedNormalizer != null", "stringEquals(form, cachedForm, equalNode)"}, limit = "NORMALIZER_FORM_COUNT")
246166
@TruffleBoundary
247167
boolean isNormalized(@SuppressWarnings("unused") TruffleString form, TruffleString unistr,
248168
@SuppressWarnings("unused") @Cached("form") TruffleString cachedForm,
249-
@Cached("getForm(cachedForm)") Normalizer.Form cachedNormForm,
169+
@Cached("getNormalizer(cachedForm)") Normalizer2 cachedNormalizer,
250170
@SuppressWarnings("unused") @Cached TruffleString.EqualNode equalNode) {
251-
if (cachedNormForm == null) {
252-
throw PRaiseNode.raiseUncached(this, ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
253-
}
254-
return Normalizer.isNormalized(unistr.toJavaStringUncached(), cachedNormForm);
171+
return cachedNormalizer.isNormalized(unistr.toJavaStringUncached());
172+
}
173+
174+
// Error path for the is-normalized check: selected when getNormalizer(form) returns null,
// i.e. the form is none of NFC, NFKC, NFD, NFKD. Always raises ValueError with
// INVALID_NORMALIZATION_FORM and never returns a value.
// NOTE(review): the declared return type is TruffleString while the sibling isNormalized
// specialization returns boolean; since this method only throws, the type is never
// observed, but aligning it with boolean may be clearer - confirm with the DSL output.
@Specialization(guards = "getNormalizer(form) == null")
175+
TruffleString invalidForm(@SuppressWarnings("unused") TruffleString form, @SuppressWarnings("unused") TruffleString unistr) {
176+
throw PRaiseNode.raiseUncached(this, ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
255177
}
256178

257179
@Override

0 commit comments

Comments
 (0)