Skip to content

Commit

Permalink
(#220) Switch to lookup tables in hotspots isNameChar()/`isNameStar…
Browse files Browse the repository at this point in the history
…tChar()` (#221)
  • Loading branch information
winfriedgerlach authored Feb 4, 2025
1 parent 012a512 commit b0a292e
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 20 deletions.
2 changes: 2 additions & 0 deletions release-notes/VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Project: woodstox
#213: SAX: `Locator#getSystemId` and `Locator#getPublicId` are not
available during `startDocument` event
(fix contributed by Philipp N)
#221: Switch to lookup tables in hotspots `isNameChar()`/`isNameStartChar()`
(contributed by @winfriedgerlach)

7.1.0 (22-Oct-2024)

Expand Down
45 changes: 25 additions & 20 deletions src/main/java/com/ctc/wstx/io/WstxInputData.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

import com.ctc.wstx.util.XmlChars;

import java.util.stream.IntStream;

/**
* Base class used by readers (specifically, by
* {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
Expand Down Expand Up @@ -50,6 +52,23 @@ public class WstxInputData
*/
public final static int MAX_UNICODE_CHAR = 0x10FFFF;

/**
 * Lookup table indexed by 7-bit ASCII code (0-127); an entry is
 * {@code true} for the ASCII letters and underscore, and {@code false}
 * for every other index.
 */
private static final boolean[] asciiNameStartChars = new boolean[128];
static {
    // Walk lower- and upper-case alphabets in lock step; both have 26 letters
    for (char lower = 'a', upper = 'A'; lower <= 'z'; ++lower, ++upper) {
        asciiNameStartChars[lower] = true;
        asciiNameStartChars[upper] = true;
    }
    asciiNameStartChars['_'] = true;
}

/**
 * Lookup table indexed by 7-bit ASCII code (0-127); an entry is
 * {@code true} for the ASCII letters, the decimal digits, and the
 * characters '.', '-' and '_', and {@code false} for every other index.
 */
private static final boolean[] asciiNameChars = new boolean[128];
static {
    for (char letter = 'a'; letter <= 'z'; ++letter) {
        asciiNameChars[letter] = true;
    }
    for (char letter = 'A'; letter <= 'Z'; ++letter) {
        asciiNameChars[letter] = true;
    }
    for (char digit = '0'; digit <= '9'; ++digit) {
        asciiNameChars[digit] = true;
    }
    // Remaining accepted punctuation
    for (char punct : new char[] { '.', '-', '_' }) {
        asciiNameChars[punct] = true;
    }
}

/*
////////////////////////////////////////////////////
// Configuration
Expand Down Expand Up @@ -153,14 +172,9 @@ protected final boolean isNameStartChar(char c)
/* First, let's handle 7-bit ascii range (identical between xml
* 1.0 and 1.1)
*/
if (c <= 0x7A) { // 'z' or earlier
if (c >= 0x61) { // 'a' - 'z' are ok
return true;
}
if (c < 0x41) { // before 'A' just white space
return false;
}
return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
if (c < 128) {
// this is performance critical, so we use a lookup table instead of if-branches
return asciiNameStartChars[c];
}
/* Ok, otherwise need to use a big honking bit sets... which
* differ between 1.0 and 1.1
Expand All @@ -178,18 +192,9 @@ protected final boolean isNameStartChar(char c)
protected final boolean isNameChar(char c)
{
// First, let's handle 7-bit ascii range
if (c <= 0x7A) { // 'z' or earlier
if (c >= 0x61) { // 'a' - 'z' are ok
return true;
}
if (c <= 0x5A) {
if (c >= 0x41) { // 'A' - 'Z' ok too
return true;
}
// As are 0-9, '.' and '-'
return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
}
return (c == 0x5F); // '_' is ok too
if (c < 128) {
// this is performance critical, so we use a lookup table instead of if-branches
return asciiNameChars[c];
}
return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
}
Expand Down
77 changes: 77 additions & 0 deletions src/test/java/com/ctc/wstx/io/WstxInputDataTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package com.ctc.wstx.io;

import com.ctc.wstx.util.XmlChars;
import junit.framework.TestCase;
import org.junit.Test;

import java.util.stream.IntStream;

/**
 * Verifies that {@link WstxInputData#isNameStartChar(char)} and
 * {@link WstxInputData#isNameChar(char)} return the same results as the
 * branch-based reference implementations kept in this class, for both
 * XML 1.0 and XML 1.1 modes.
 *<p>
 * NOTE(review): the class extends JUnit 3's {@code TestCase} while also
 * using the JUnit 4 {@code @Test} annotation; consider settling on one
 * style.
 */
public class WstxInputDataTest extends TestCase {

    @Test
    public void testIsNameStartCharBehavesSameAsBranchyVersion() {
        WstxInputData wstxInputDataXml10 = new WstxInputData();
        WstxInputData wstxInputDataXml11 = new WstxInputData();
        wstxInputDataXml11.mXml11 = true;

        // Covers every 7-bit ASCII character (0-127) plus 128-137. The
        // negative values -10..-1 wrap around to 0xFFF6..0xFFFF when
        // narrowed to char, so they exercise the top of the char range.
        IntStream.range(-10, 138).forEach(i -> {
            char c = (char) i;
            assertEquals(describe("isNameStartChar", c, false),
                    isNameStartCharBranchy(c, false), wstxInputDataXml10.isNameStartChar(c));
            assertEquals(describe("isNameStartChar", c, true),
                    isNameStartCharBranchy(c, true), wstxInputDataXml11.isNameStartChar(c));
        });
    }

    /**
     * Reference implementation: the previous, branch-based version of
     * {@code isNameStartChar}.
     *
     * @param c character to classify
     * @param xml11 whether to apply XML 1.1 (true) or XML 1.0 (false)
     *   rules for characters above 'z'
     * @return result of the branch-based check
     */
    private static boolean isNameStartCharBranchy(char c, boolean xml11) {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c < 0x41) { // nothing before 'A' is accepted
                return false;
            }
            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
        }
        /* Ok, otherwise need to use a big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return xml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    @Test
    public void testIsNameCharBehavesSameAsBranchyVersion() {
        WstxInputData wstxInputDataXml10 = new WstxInputData();
        WstxInputData wstxInputDataXml11 = new WstxInputData();
        wstxInputDataXml11.mXml11 = true;

        // Same character set as in the name-start test: 0-137 plus
        // 0xFFF6..0xFFFF (the wrapped negative values)
        IntStream.range(-10, 138).forEach(i -> {
            char c = (char) i;
            assertEquals(describe("isNameChar", c, false),
                    isNameCharBranchy(c, false), wstxInputDataXml10.isNameChar(c));
            assertEquals(describe("isNameChar", c, true),
                    isNameCharBranchy(c, true), wstxInputDataXml11.isNameChar(c));
        });
    }

    /**
     * Reference implementation: the previous, branch-based version of
     * {@code isNameChar}.
     *
     * @param c character to classify
     * @param xml11 whether to apply XML 1.1 (true) or XML 1.0 (false)
     *   rules for characters above 'z'
     * @return result of the branch-based check
     */
    private static boolean isNameCharBranchy(char c, boolean xml11) {
        // First, let's handle 7-bit ascii range
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c <= 0x5A) {
                if (c >= 0x41) { // 'A' - 'Z' ok too
                    return true;
                }
                // As are 0-9, '.' and '-'
                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
            }
            return (c == 0x5F); // '_' is ok too
        }
        return xml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Builds the assertion message so that a failure identifies the
     * method, the XML version and the exact character that diverged.
     */
    private static String describe(String method, char c, boolean xml11) {
        return method + " mismatch (xml " + (xml11 ? "1.1" : "1.0")
                + ") for char 0x" + Integer.toHexString(c);
    }
}

0 comments on commit b0a292e

Please sign in to comment.