diff --git a/src/main/java/com/ctc/wstx/io/WstxInputData.java b/src/main/java/com/ctc/wstx/io/WstxInputData.java
--- a/src/main/java/com/ctc/wstx/io/WstxInputData.java
+++ b/src/main/java/com/ctc/wstx/io/WstxInputData.java
@@ -50,6 +50,39 @@ public class WstxInputData
      */
     public final static int MAX_UNICODE_CHAR = 0x10FFFF;
 
+    /**
+     * Lookup table over the 7-bit ASCII range: an entry is true if that
+     * character may start an XML name (identical between xml 1.0 and 1.1).
+     * Colon is not included, same as the branch-based check this replaces.
+     */
+    private static final boolean[] asciiNameStartChars = new boolean[128];
+
+    /**
+     * Lookup table over the 7-bit ASCII range: an entry is true if that
+     * character may appear in an XML name after the first character.
+     */
+    private static final boolean[] asciiNameChars = new boolean[128];
+
+    static {
+        // Plain loops rather than IntStream + lambdas: these are trivial
+        // range fills, and this way no java.util.stream import is needed
+        for (char c = 'a'; c <= 'z'; ++c) {
+            asciiNameStartChars[c] = true;
+            asciiNameChars[c] = true;
+        }
+        for (char c = 'A'; c <= 'Z'; ++c) {
+            asciiNameStartChars[c] = true;
+            asciiNameChars[c] = true;
+        }
+        for (char c = '0'; c <= '9'; ++c) {
+            asciiNameChars[c] = true;
+        }
+        asciiNameStartChars['_'] = true;
+        asciiNameChars['_'] = true;
+        asciiNameChars['.'] = true;
+        asciiNameChars['-'] = true;
+    }
+
     /*
     ////////////////////////////////////////////////////
     // Configuration
@@ -153,14 +186,9 @@ protected final boolean isNameStartChar(char c)
         /* First, let's handle 7-bit ascii range (identical between xml
          * 1.0 and 1.1)
          */
-        if (c <= 0x7A) { // 'z' or earlier
-            if (c >= 0x61) { // 'a' - 'z' are ok
-                return true;
-            }
-            if (c < 0x41) { // before 'A' just white space
-                return false;
-            }
-            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
+        if (c < 128) {
+            // this is performance critical, so we use a lookup table instead of if-branches
+            return asciiNameStartChars[c];
         }
         /* Ok, otherwise need to use a big honking bit sets... which
          * differ between 1.0 and 1.1
@@ -178,18 +206,9 @@ protected final boolean isNameStartChar(char c)
     protected final boolean isNameChar(char c)
     {
         // First, let's handle 7-bit ascii range
-        if (c <= 0x7A) { // 'z' or earlier
-            if (c >= 0x61) { // 'a' - 'z' are ok
-                return true;
-            }
-            if (c <= 0x5A) {
-                if (c >= 0x41) { // 'A' - 'Z' ok too
-                    return true;
-                }
-                // As are 0-9, '.' and '-'
-                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
-            }
-            return (c == 0x5F); // '_' is ok too
+        if (c < 128) {
+            // this is performance critical, so we use a lookup table instead of if-branches
+            return asciiNameChars[c];
         }
         return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
     }
diff --git a/src/test/java/com/ctc/wstx/io/WstxInputDataTest.java b/src/test/java/com/ctc/wstx/io/WstxInputDataTest.java
new file mode 100644
--- /dev/null
+++ b/src/test/java/com/ctc/wstx/io/WstxInputDataTest.java
@@ -0,0 +1,78 @@
+package com.ctc.wstx.io;
+
+import com.ctc.wstx.util.XmlChars;
+import junit.framework.TestCase;
+
+/**
+ * Checks that the table-driven {@code isNameStartChar} / {@code isNameChar}
+ * of {@link WstxInputData} agree with the branch-based code they replaced.
+ */
+public class WstxInputDataTest extends TestCase {
+
+    public void testIsNameStartCharBehavesSameAsBranchyVersion() {
+        WstxInputData wstxInputDataXml10 = new WstxInputData();
+        WstxInputData wstxInputDataXml11 = new WstxInputData();
+        wstxInputDataXml11.mXml11 = true;
+
+        // a char has only 65536 possible values, so simply compare all of them
+        for (int i = Character.MIN_VALUE; i <= Character.MAX_VALUE; ++i) {
+            char c = (char) i;
+            String desc = "char 0x" + Integer.toHexString(i);
+            assertEquals(desc, isNameStartCharBranchy(c, false), wstxInputDataXml10.isNameStartChar(c));
+            assertEquals(desc, isNameStartCharBranchy(c, true), wstxInputDataXml11.isNameStartChar(c));
+        }
+    }
+
+    // previous implementation with branches
+    private static boolean isNameStartCharBranchy(char c, boolean mXml11) {
+        /* First, let's handle 7-bit ascii range (identical between xml
+         * 1.0 and 1.1)
+         */
+        if (c <= 0x7A) { // 'z' or earlier
+            if (c >= 0x61) { // 'a' - 'z' are ok
+                return true;
+            }
+            if (c < 0x41) { // before 'A' just white space
+                return false;
+            }
+            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
+        }
+        /* Ok, otherwise need to use a big honking bit sets... which
+         * differ between 1.0 and 1.1
+         */
+        return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
+    }
+
+    public void testIsNameCharBehavesSameAsBranchyVersion() {
+        WstxInputData wstxInputDataXml10 = new WstxInputData();
+        WstxInputData wstxInputDataXml11 = new WstxInputData();
+        wstxInputDataXml11.mXml11 = true;
+
+        // a char has only 65536 possible values, so simply compare all of them
+        for (int i = Character.MIN_VALUE; i <= Character.MAX_VALUE; ++i) {
+            char c = (char) i;
+            String desc = "char 0x" + Integer.toHexString(i);
+            assertEquals(desc, isNameCharBranchy(c, false), wstxInputDataXml10.isNameChar(c));
+            assertEquals(desc, isNameCharBranchy(c, true), wstxInputDataXml11.isNameChar(c));
+        }
+    }
+
+    // previous implementation with branches
+    private static boolean isNameCharBranchy(char c, boolean mXml11) {
+        // First, let's handle 7-bit ascii range
+        if (c <= 0x7A) { // 'z' or earlier
+            if (c >= 0x61) { // 'a' - 'z' are ok
+                return true;
+            }
+            if (c <= 0x5A) {
+                if (c >= 0x41) { // 'A' - 'Z' ok too
+                    return true;
+                }
+                // As are 0-9, '.' and '-'
+                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
+            }
+            return (c == 0x5F); // '_' is ok too
+        }
+        return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
+    }
+}