Skip to content

Commit

Permalink
OPENNLP-421 - Remove StringListWrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
rzo1 committed Dec 25, 2023
1 parent d455e33 commit 9fa715e
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 85 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor;
import opennlp.tools.dictionary.serializer.Entry;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;
import opennlp.tools.util.model.DictionarySerializer;
import opennlp.tools.util.model.SerializableArtifact;

Expand All @@ -43,56 +42,7 @@
* @see Iterable
*/
public class Dictionary implements Iterable<StringList>, SerializableArtifact {

/**
 * Wraps a {@link StringList} so that equality honours the
 * {@code isCaseSensitive} flag of the enclosing dictionary instance.
 * <p>
 * The hash code is always derived from the output of
 * {@code StringUtil.toLowerCase} applied to the list's string form,
 * independent of that flag.
 */
private class StringListWrapper {

  private final StringList stringList;

  private StringListWrapper(StringList stringList) {
    this.stringList = stringList;
  }

  /**
   * @return The wrapped {@link StringList}.
   */
  private StringList getStringList() {
    return stringList;
  }

  /**
   * Compares the wrapped lists: via {@code equals} when the enclosing
   * dictionary is case-sensitive, via {@code compareToIgnoreCase} otherwise.
   *
   * @param obj The object to compare with.
   * @return {@code true} if {@code obj} is the same wrapper or wraps a
   *         matching list, {@code false} otherwise.
   */
  @Override
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }
    if (!(obj instanceof StringListWrapper other)) {
      return false;
    }
    final StringList otherList = other.getStringList();
    return isCaseSensitive
        ? stringList.equals(otherList)
        : stringList.compareToIgnoreCase(otherList);
  }

  @Override
  public int hashCode() {
    // if lookup is too slow optimize this
    final String text = stringList.toString();
    return StringUtil.toLowerCase(text).hashCode();
  }

  @Override
  public String toString() {
    return stringList.toString();
  }
}

private final Set<StringListWrapper> entrySet = new HashSet<>();
private final Set<StringList> entrySet = new HashSet<>();
private final boolean isCaseSensitive;
private int minTokenCount = 99999;
private int maxTokenCount = 0;
Expand Down Expand Up @@ -131,7 +81,7 @@ public Dictionary(InputStream in) throws IOException {
* @param tokens the new entry
*/
public void put(StringList tokens) {
entrySet.add(new StringListWrapper(tokens));
entrySet.add(applyCaseSensitivity(tokens));
minTokenCount = StrictMath.min(minTokenCount, tokens.size());
maxTokenCount = StrictMath.max(maxTokenCount, tokens.size());
}
Expand All @@ -151,7 +101,7 @@ public int getMaxTokenCount() {
* @return {@code true} if it contains the entry, {@code false} otherwise.
*/
public boolean contains(StringList tokens) {
return entrySet.contains(new StringListWrapper(tokens));
return entrySet.contains(applyCaseSensitivity(tokens));
}

/**
Expand All @@ -160,15 +110,15 @@ public boolean contains(StringList tokens) {
* @param tokens The tokens to be filtered out (= removed).
*/
public void remove(StringList tokens) {
entrySet.remove(new StringListWrapper(tokens));
entrySet.remove(applyCaseSensitivity(tokens));
}

/**
* @return Retrieves a token-{@link Iterator} over all elements.
*/
@Override
public Iterator<StringList> iterator() {
final Iterator<StringListWrapper> entries = entrySet.iterator();
final Iterator<StringList> entries = entrySet.iterator();

return new Iterator<>() {

Expand All @@ -179,7 +129,7 @@ public boolean hasNext() {

@Override
public StringList next() {
return entries.next().getStringList();
return entries.next();
}

@Override
Expand Down Expand Up @@ -308,7 +258,7 @@ public Set<String> asStringSet() {

@Override
public Iterator<String> iterator() {
final Iterator<StringListWrapper> entries = entrySet.iterator();
final Iterator<StringList> entries = entrySet.iterator();

return new Iterator<>() {
@Override
Expand All @@ -317,7 +267,7 @@ public boolean hasNext() {
}
@Override
public String next() {
return entries.next().getStringList().getToken(0);
return entries.next().getToken(0);
}
@Override
public void remove() {
Expand All @@ -337,7 +287,7 @@ public boolean contains(Object obj) {

if (obj instanceof String str) {

result = entrySet.contains(new StringListWrapper(new StringList(str)));
result = entrySet.contains(new StringList(isCaseSensitive, str));

}
return result;
Expand All @@ -353,13 +303,13 @@ public boolean equals(Object o) {
return false;
}
Iterator<String> toCheckIter = toCheck.iterator();
for (StringListWrapper entry : entrySet) {
for (StringList entry : entrySet) {
if (isCaseSensitive) {
if (!entry.stringList.equals(new StringList(toCheckIter.next()))) {
if (!entry.equals(new StringList(true, toCheckIter.next()))) {
return false;
}
} else {
if (!entry.stringList.compareToIgnoreCase(new StringList(toCheckIter.next()))) {
if (!entry.compareToIgnoreCase(new StringList(false, toCheckIter.next()))) {
return false;
}
}
Expand All @@ -383,4 +333,19 @@ public int hashCode() {
public Class<?> getArtifactSerializerClass() {
return DictionarySerializer.class;
}

/**
 * Reports the case-sensitivity flag of this {@link Dictionary}.
 *
 * @return {@code true} if this {@link Dictionary} is case-sensitive,
 *         {@code false} otherwise.
 */
public boolean isCaseSensitive() {
  return this.isCaseSensitive;
}

/**
 * Converts the given list to the case mode of this dictionary.
 *
 * @param list The list to convert.
 * @return The result of {@code list.toCaseSensitive()} when this dictionary
 *         is case-sensitive, otherwise of {@code list.toCaseInsensitive()}.
 */
private StringList applyCaseSensitivity(StringList list) {
  return isCaseSensitive ? list.toCaseSensitive() : list.toCaseInsensitive();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public class DictionaryNameFinder implements TokenNameFinder {
* @param type the name type used for the produced spans. Must not be {@code null}.
*/
public DictionaryNameFinder(Dictionary dictionary, String type) {
mDictionary = Objects.requireNonNull(dictionary, "dictionary must not be null");
this.mDictionary = Objects.requireNonNull(dictionary, "dictionary must not be null");
this.type = Objects.requireNonNull(type, "type must not be null");
}

Expand All @@ -61,22 +61,23 @@ public DictionaryNameFinder(Dictionary dictionary) {
@Override
public Span[] find(String[] textTokenized) {
List<Span> namesFound = new LinkedList<>();

final boolean caseSensitive = mDictionary.isCaseSensitive();
final int maxTokenCount = mDictionary.getMaxTokenCount();
for (int offsetFrom = 0; offsetFrom < textTokenized.length; offsetFrom++) {
Span nameFound = null;
String[] tokensSearching;

for (int offsetTo = offsetFrom; offsetTo < textTokenized.length; offsetTo++) {
int lengthSearching = offsetTo - offsetFrom + 1;

if (lengthSearching > mDictionary.getMaxTokenCount()) {
if (lengthSearching > maxTokenCount) {
break;
} else {
tokensSearching = new String[lengthSearching];
System.arraycopy(textTokenized, offsetFrom, tokensSearching, 0,
lengthSearching);

StringList entryForSearch = new StringList(tokensSearching);
StringList entryForSearch = new StringList(caseSensitive, tokensSearching);

if (mDictionary.contains(entryForSearch)) {
nameFound = new Span(offsetFrom, offsetTo + 1, type);
Expand Down
83 changes: 64 additions & 19 deletions opennlp-tools/src/main/java/opennlp/tools/util/StringList.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,22 @@ public class StringList implements Iterable<String> {

private final String[] tokens;

private final boolean caseSensitive;

/**
* Initializes a {@link StringList} instance.
* Initializes a {@link StringList} instance. By default, this instance is case-sensitive.
* <p>
* Note: <br>
* Token String will be interned via {@link StringInterners}.
*
* @param singleToken One single token
*/
public StringList(String singleToken) {
tokens = new String[]{StringInterners.intern(singleToken)};
this(true, singleToken);
}

/**
* Initializes a {@link StringList} instance.
* Initializes a {@link StringList} instance. By default, this instance is case-sensitive.
* <p>
* Note: <br>
* Token Strings will be interned via {@link StringInterners}.
Expand All @@ -55,6 +57,22 @@ public StringList(String singleToken) {
* @throws IllegalArgumentException Thrown if parameters were invalid.
*/
public StringList(String... tokens) {
// Delegates to the (boolean, String...) constructor with the case-sensitive flag set to true.
this(true, tokens);
}

/**
* Initializes a {@link StringList} instance.
* <p>
* Note: <br>
* Token Strings will be interned via {@link StringInterners}.
*
* @param isCaseSensitive Whether it will operate case-sensitive, or not.
* @param tokens The string parts of the new {@link StringList}.
* Must not be an empty tokens array or {@code null}.
*
* @throws IllegalArgumentException Thrown if parameters were invalid.
*/
public StringList(boolean isCaseSensitive, String... tokens) {

Objects.requireNonNull(tokens, "tokens must not be null");

Expand All @@ -67,6 +85,8 @@ public StringList(String... tokens) {
for (int i = 0; i < tokens.length; i++) {
this.tokens[i] = StringInterners.intern(tokens[i]);
}

this.caseSensitive = isCaseSensitive;
}

/**
Expand Down Expand Up @@ -127,44 +147,40 @@ public void remove() {
* @return {@code true} if both lists are identical when case is ignored, {@code false} otherwise.
*/
public boolean compareToIgnoreCase(StringList tokens) {

if (size() == tokens.size()) {
for (int i = 0; i < size(); i++) {

if (getToken(i).compareToIgnoreCase(
tokens.getToken(i)) != 0) {
if (getToken(i).compareToIgnoreCase(tokens.getToken(i)) != 0) {
return false;
}
}
}
else {
} else {
return false;
}

return true;
}

@Override
public int hashCode() {
return Arrays.hashCode(tokens);
// if lookup is too slow optimize this
return StringUtil.toLowerCase(toString()).hashCode();
}

@Override
public boolean equals(Object obj) {
if (this == obj) {
if (obj == this) {
return true;
} else if (obj instanceof StringList tokenList) {
if (caseSensitive) {
return Arrays.equals(tokens, tokenList.tokens);
} else {
return compareToIgnoreCase(tokenList);
}
}

if (obj instanceof StringList tokenList) {

return Arrays.equals(tokens, tokenList.tokens);
}

return false;
}

/**
* @return A human-readable representation of this {@link Span}.
* @return A human-readable representation of this {@link StringList}.
*/
@Override
public String toString() {
Expand All @@ -184,4 +200,33 @@ public String toString() {

return string.toString();
}

/**
 * Reports the case-sensitivity flag this {@link StringList} was created with.
 *
 * @return {@code true} if this {@link StringList} is case-sensitive,
 *         {@code false} otherwise.
 */
public boolean isCaseSensitive() {
  return this.caseSensitive;
}

/**
 * Provides a case-insensitive view of this list's tokens.
 *
 * @return This very instance if it is already case-insensitive; otherwise
 *         a new case-insensitive {@link StringList} created from the same tokens.
 */
public StringList toCaseInsensitive() {
  return isCaseSensitive() ? new StringList(false, tokens) : this;
}

/**
 * Provides a case-sensitive view of this list's tokens.
 *
 * @return This very instance if it is already case-sensitive; otherwise
 *         a new case-sensitive {@link StringList} created from the same tokens.
 */
public StringList toCaseSensitive() {
  return isCaseSensitive() ? this : new StringList(true, tokens);
}
}

0 comments on commit 9fa715e

Please sign in to comment.