diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java
index 2bc23eedf9..53f6e979a6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java
@@ -31,8 +31,8 @@
* Parses the conll 2000 shared task shallow parser training data.
*
* Data format is specified on the conll page:
- *
- * http://www.cnts.ua.ac.be/conll2000/chunking/
+ *
+ * https://www.cnts.ua.ac.be/conll2000/chunking/
*/
public class ChunkSampleStream extends FilterObjectStream {
@@ -57,7 +57,7 @@ public ChunkSample read() throws IOException {
for (String line = samples.read(); line != null && !line.isEmpty(); line = samples.read()) {
String[] parts = line.split(" ");
if (parts.length != 3) {
- logger.error("Skipping corrupt line: {}", line);
+ logger.warn("Skipping corrupt line: {}", line);
}
else {
toks.add(parts[0]);
@@ -66,11 +66,11 @@ public ChunkSample read() throws IOException {
}
}
- if (toks.size() > 0) {
+ if (!toks.isEmpty()) {
return new ChunkSample(toks.toArray(new String[0]),
tags.toArray(new String[0]), preds.toArray(new String[0]));
+ } else {
+ return null;
}
-
- return null;
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
index 663980ac5a..da9a36d18e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
@@ -17,7 +17,15 @@
package opennlp.tools.formats;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
/**
* Base class for sample stream factories.
@@ -40,4 +48,44 @@ public String getLang() {
public Class getParameters() {
return params;
}
+
+  /**
+   * Creates a line-based {@link ObjectStream} for the {@code -data} file named in {@code args}.
+   *
+   * @param args A set of command line arguments; validated before the stream is opened.
+   * @param parametersClass The parameter interface ({@code P}) used to parse {@code args}.
+   * @return The created {@link ObjectStream} instance.
+   */
+  protected <P extends BasicFormatParams> ObjectStream<String> readData(String[] args,
+      Class<P> parametersClass) {
+    P params = validateBasicFormatParameters(args, parametersClass);
+    ObjectStream<String> lineStream = null;
+    try {
+      InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
+      lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+    return lineStream;
+  }
+
+  /**
+   * Parses and validates the specified arguments ({@code args}) in the context of
+   * the generic type {@code P}, which provides at least all {@link BasicFormatParams}.
+   *
+   * @implNote The {@code -data} file is additionally checked, that is whether it exists or not.
+   *
+   * @param args A set of command line arguments. Must not be {@code null}.
+   * @param clazz The parameter interface of type {@code P} to parse {@code args} into.
+   * @return The parsed (basic format) parameter instance.
+   * @throws IllegalArgumentException Thrown if {@code args} is {@code null}.
+   */
+  protected <P extends BasicFormatParams> P validateBasicFormatParameters(String[] args, Class<P> clazz) {
+    if (args == null) {
+      throw new IllegalArgumentException("Passed args must not be null!");
+    }
+    P params = ArgumentParser.parse(args, clazz);
+    CmdLineUtil.checkInputFile("Data", params.getData());
+    return params;
+  }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
index e2ad4ef43a..5bdff9327f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
@@ -45,25 +45,24 @@
*
* Data can be found on this
* website,
- * or in
- * this repository.
+ * or in this
+ * GitHub repository.
*
- * The BioNLP/NLPBA 2004 data were originally published here:
- *
- *
- * http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html,
+ * The BioNLP/NLPBA 2004 data were originally published
+ * here,
*
* yet this page was gone when last checked in December 2022.
*
- * It looks like this repo contains a copy of the data located on the original page:
- * The BioNLP 2004 seems to be related to http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004
- *
* Note:
* Do not use this class, internal use only!
*/
@Internal
public class BioNLP2004NameSampleStream implements ObjectStream {
+ private static final String CODEC_TAG_O = "O";
+ private static final String CODEC_TAG_B = "B-";
+ private static final String CODEC_TAG_I = "I-";
+
public static final int GENERATE_DNA_ENTITIES = 0x01;
public static final int GENERATE_PROTEIN_ENTITIES = 0x01 << 1;
public static final int GENERATE_CELLTYPE_ENTITIES = 0x01 << 2;
@@ -96,7 +95,6 @@ public NameSample read() throws IOException {
boolean isClearAdaptiveData = false;
// Empty line indicates end of sentence
-
String line;
while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line.trim())) {
@@ -121,7 +119,7 @@ public NameSample read() throws IOException {
}
}
- if (sentence.size() > 0) {
+ if (!sentence.isEmpty()) {
// convert name tags into spans
List names = new ArrayList<>();
@@ -133,34 +131,32 @@ public NameSample read() throws IOException {
String tag = tags.get(i);
if (tag.endsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0)
- tag = "O";
+ tag = CODEC_TAG_O;
if (tag.endsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0)
- tag = "O";
+ tag = CODEC_TAG_O;
if (tag.endsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
- tag = "O";
+ tag = CODEC_TAG_O;
if (tag.endsWith("cell_line") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
- tag = "O";
+ tag = CODEC_TAG_O;
if (tag.endsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0)
- tag = "O";
+ tag = CODEC_TAG_O;
- if (tag.startsWith("B-")) {
+ if (tag.startsWith(CODEC_TAG_B)) {
if (beginIndex != -1) {
names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
- beginIndex = -1;
- endIndex = -1;
}
beginIndex = i;
endIndex = i + 1;
}
- else if (tag.startsWith("I-")) {
+ else if (tag.startsWith(CODEC_TAG_I)) {
endIndex++;
}
- else if (tag.equals("O")) {
+ else if (tag.equals(CODEC_TAG_O)) {
if (beginIndex != -1) {
names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
beginIndex = -1;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
index 422cd4c632..0b7bfe3cfc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
@@ -19,7 +19,6 @@
import java.io.IOException;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -30,46 +29,46 @@
/**
* @see BioNLP2004NameSampleStream
*/
-public class BioNLP2004NameSampleStreamFactory extends AbstractSampleStreamFactory {
+public class BioNLP2004NameSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "DNA,protein,cell_type,cell_line,RNA")
String getTypes();
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
- "bionlp2004", new BioNLP2004NameSampleStreamFactory<>(Parameters.class));
+ "bionlp2004", new BioNLP2004NameSampleStreamFactory(Parameters.class));
}
- protected BioNLP2004NameSampleStreamFactory(Class params) {
+ protected BioNLP2004NameSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
-
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
int typesToGenerate = 0;
-
- if (params.getTypes().contains("DNA")) {
+ String types = params.getTypes();
+ if (types.contains("DNA")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES;
}
- else if (params.getTypes().contains("protein")) {
+ if (types.contains("protein")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES;
}
- else if (params.getTypes().contains("cell_type")) {
+ if (types.contains("cell_type")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES;
}
- else if (params.getTypes().contains("cell_line")) {
+ if (types.contains("cell_line")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES;
}
- else if (params.getTypes().contains("RNA")) {
+ if (types.contains("RNA")) {
typesToGenerate = typesToGenerate |
BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES;
}
@@ -77,8 +76,9 @@ else if (params.getTypes().contains("RNA")) {
try {
return new BioNLP2004NameSampleStream(
CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate);
- } catch (IOException e) {
- throw new IllegalStateException(e);
+ } catch (IOException ex) {
+ CmdLineUtil.handleCreateObjectStreamError(ex);
}
+ return null;
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java
index 36f8b58efa..8925d1960e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java
@@ -17,49 +17,33 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.chunker.ChunkSampleStream;
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Factory producing OpenNLP {@link ChunkSampleStream}s.
*/
-public class ChunkerSampleStreamFactory extends AbstractSampleStreamFactory {
+public class ChunkerSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(ChunkSample.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory(Parameters.class));
}
- protected ChunkerSampleStreamFactory(Class params) {
+ protected ChunkerSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
-
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new ChunkSampleStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java
index d417df0cc1..91cca66714 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java
@@ -19,7 +19,6 @@
import java.io.IOException;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -37,9 +36,10 @@
* @see Conll02NameSampleStream
*/
@Internal
-public class Conll02NameSampleStreamFactory extends LanguageSampleStreamFactory {
+public class Conll02NameSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "spa|nld")
String getLang();
@@ -49,17 +49,17 @@ interface Parameters extends BasicFormatParams {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
- "conll02", new Conll02NameSampleStreamFactory<>(Parameters.class));
+ "conll02", new Conll02NameSampleStreamFactory(Parameters.class));
}
- protected Conll02NameSampleStreamFactory(Class params) {
+ protected Conll02NameSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
LANGUAGE lang;
if ("nl".equals(params.getLang()) || "nld".equals(params.getLang())) {
@@ -93,7 +93,6 @@ else if ("es".equals(params.getLang()) || "spa".equals(params.getLang())) {
Conll02NameSampleStream.GENERATE_MISC_ENTITIES;
}
-
try {
return new Conll02NameSampleStream(lang,
CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java
index dae580cbba..d1a6150928 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java
@@ -19,7 +19,6 @@
import java.io.IOException;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -32,9 +31,10 @@
/**
* @see Conll03NameSampleStream
*/
-public class Conll03NameSampleStreamFactory extends LanguageSampleStreamFactory {
+public class Conll03NameSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "eng|deu")
String getLang();
@@ -44,17 +44,17 @@ interface Parameters extends BasicFormatParams {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
- "conll03", new Conll03NameSampleStreamFactory<>(Parameters.class));
+ "conll03", new Conll03NameSampleStreamFactory(Parameters.class));
}
- protected Conll03NameSampleStreamFactory(Class params) {
+ protected Conll03NameSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
// TODO: support the other languages with this CoNLL.
LANGUAGE lang;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java
index 18b48b5a42..a7a586d9b8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java
@@ -18,13 +18,10 @@
package opennlp.tools.formats;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
-import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.commons.Internal;
import opennlp.tools.postag.POSSample;
@@ -35,40 +32,35 @@
* Note:
* Do not use this class, internal use only!
*
+ * @see POSSample
* @see ConllXPOSSampleStream
*/
@Internal
-public class ConllXPOSSampleStreamFactory extends AbstractSampleStreamFactory {
+public class ConllXPOSSampleStreamFactory extends
+ AbstractSampleStreamFactory {
public static final String CONLLX_FORMAT = "conllx";
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(POSSample.class,
- CONLLX_FORMAT, new ConllXPOSSampleStreamFactory<>(Parameters.class));
+ CONLLX_FORMAT, new ConllXPOSSampleStreamFactory(Parameters.class));
}
- protected ConllXPOSSampleStreamFactory(Class params) {
+ protected ConllXPOSSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- InputStreamFactory inFactory =
- CmdLineUtil.createInputStreamFactory(params.getData());
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
try {
+ InputStreamFactory inFactory = CmdLineUtil.createInputStreamFactory(params.getData());
return new ConllXPOSSampleStream(inFactory, StandardCharsets.UTF_8);
- } catch (UnsupportedEncodingException e) {
- // this shouldn't happen
- throw new TerminateToolException(-1, "UTF-8 encoding is not supported: " + e.getMessage(), e);
- }
- catch (IOException e) {
- // That will throw an exception
+ } catch (IOException e) {
CmdLineUtil.handleCreateObjectStreamError(e);
return null;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java
index 505f94f2c4..2d622e0fcf 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java
@@ -29,29 +29,31 @@
/**
* Note:
* Do not use this class, internal use only!
+ *
+ * @see SentenceSample
+ * @see POSToSentenceSampleStream
*/
@Internal
-public class ConllXSentenceSampleStreamFactory extends
- DetokenizerSampleStreamFactory {
+public class ConllXSentenceSampleStreamFactory extends
+ DetokenizerSampleStreamFactory {
- interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter {
+ public interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter {
// TODO: make chunk size configurable
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(SentenceSample.class,
ConllXPOSSampleStreamFactory.CONLLX_FORMAT,
- new ConllXSentenceSampleStreamFactory<>(Parameters.class));
+ new ConllXSentenceSampleStreamFactory(Parameters.class));
}
- protected ConllXSentenceSampleStreamFactory(Class params) {
+ protected ConllXSentenceSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
ObjectStream posSampleStream = StreamFactoryRegistry.getFactory(POSSample.class,
ConllXPOSSampleStreamFactory.CONLLX_FORMAT).create(
ArgumentParser.filter(args, ConllXPOSSampleStreamFactory.Parameters.class));
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java
index c894be0a3a..81c1eb217e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java
@@ -29,25 +29,29 @@
/**
* Note:
* Do not use this class, internal use only!
+ *
+ * @see TokenSample
+ * @see POSToTokenSampleStream
*/
@Internal
-public class ConllXTokenSampleStreamFactory extends DetokenizerSampleStreamFactory {
+public class ConllXTokenSampleStreamFactory extends
+ DetokenizerSampleStreamFactory {
- interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter {
+ public interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(TokenSample.class,
- ConllXPOSSampleStreamFactory.CONLLX_FORMAT, new ConllXTokenSampleStreamFactory<>(Parameters.class));
+ ConllXPOSSampleStreamFactory.CONLLX_FORMAT, new ConllXTokenSampleStreamFactory(Parameters.class));
}
- protected ConllXTokenSampleStreamFactory(Class params) {
+ protected ConllXTokenSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
ObjectStream samples = StreamFactoryRegistry.getFactory(POSSample.class,
ConllXPOSSampleStreamFactory.CONLLX_FORMAT).create(
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java
index 2ecf56641f..1775eaa46b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java
@@ -17,48 +17,33 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Factory producing OpenNLP {@link DocumentSampleStream}s.
*/
-public class DocumentSampleStreamFactory extends AbstractSampleStreamFactory {
+public class DocumentSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(DocumentSample.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new DocumentSampleStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new DocumentSampleStreamFactory(Parameters.class));
}
- protected DocumentSampleStreamFactory(Class params) {
+ protected DocumentSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new DocumentSampleStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
index 76574500ed..a1382c5dcb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
@@ -32,7 +32,7 @@
import opennlp.tools.util.StringUtil;
/**
- * Parser for the Italian NER training files of the Evalita 2007 and 2009 NER shared tasks.
+ * Parser for the Italian NER training files of the Evalita 2007 and 2009 NER shared tasks.
*
* The data does not contain article boundaries,
* adaptive data will be cleared for every sentence.
@@ -46,12 +46,12 @@
* 2. The Entity type tag: PER (for Person), ORG (for Organization),
* GPE (for Geo-Political Entity), or LOC (for Location).
*
- * Each file consists of four columns separated by a blank, containing
- * respectively the token, the Elsnet PoS-tag, the Adige news story to
- * which the token belongs, and the Named Entity tag.
+ * Each file consists of four columns separated by a blank, containing respectively the token, the
+ * Elsnet
+ * PoS-tag, the Adige news story to which the token belongs, and the Named Entity tag.
*
* Data can be found on this
- * web site.
+ * web site.
*
* Note:
* Do not use this class, internal use only!
@@ -59,6 +59,15 @@
@Internal
public class EvalitaNameSampleStream implements ObjectStream {
+ public static final String DOCSTART = "-DOCSTART-";
+ private static final String CODEC_TAG_O = "O";
+ private static final String CODEC_TAG_B = "B-";
+ private static final String CODEC_TAG_I = "I-";
+ private static final String ENT_TYPE_PER = "PER"; // Person
+ private static final String ENT_TYPE_LOC = "LOC"; // Location
+ private static final String ENT_TYPE_GPE = "GPE"; // Geo-Political Entity
+ private static final String ENT_TYPE_ORG = "ORG"; // Organization
+
public enum LANGUAGE {
IT
}
@@ -68,8 +77,6 @@ public enum LANGUAGE {
public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2;
public static final int GENERATE_GPE_ENTITIES = 0x01 << 3;
- public static final String DOCSTART = "-DOCSTART-";
-
private final LANGUAGE lang;
private final ObjectStream lineStream;
@@ -82,7 +89,7 @@ public EvalitaNameSampleStream(LANGUAGE lang, ObjectStream lineStream, i
}
public EvalitaNameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException {
- this(lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8),types);
+ this(lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8), types);
}
private static Span extract(int begin, int end, String beginTag) throws InvalidFormatException {
@@ -90,17 +97,16 @@ private static Span extract(int begin, int end, String beginTag) throws InvalidF
String type = beginTag.substring(2);
type = switch (type) {
- case "PER" -> "person";
- case "LOC" -> "location";
- case "GPE" -> "gpe";
- case "ORG" -> "organization";
+ case ENT_TYPE_PER -> "person";
+ case ENT_TYPE_LOC -> "location";
+ case ENT_TYPE_GPE -> "gpe";
+ case ENT_TYPE_ORG -> "organization";
default -> throw new InvalidFormatException("Unknown type: " + type);
};
return new Span(begin, end, type);
}
-
@Override
public NameSample read() throws IOException {
@@ -110,7 +116,6 @@ public NameSample read() throws IOException {
boolean isClearAdaptiveData = false;
// Empty line indicates end of sentence
-
String line;
while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
@@ -140,7 +145,7 @@ public NameSample read() throws IOException {
if (LANGUAGE.IT.equals(lang))
isClearAdaptiveData = true;
- if (sentence.size() > 0) {
+ if (!sentence.isEmpty()) {
// convert name tags into spans
List names = new ArrayList<>();
@@ -151,33 +156,31 @@ public NameSample read() throws IOException {
String tag = tags.get(i);
- if (tag.endsWith("PER") && (types & GENERATE_PERSON_ENTITIES) == 0)
- tag = "O";
+ if (tag.endsWith(ENT_TYPE_PER) && (types & GENERATE_PERSON_ENTITIES) == 0)
+ tag = CODEC_TAG_O;
- if (tag.endsWith("ORG") && (types & GENERATE_ORGANIZATION_ENTITIES) == 0)
- tag = "O";
+ if (tag.endsWith(ENT_TYPE_ORG) && (types & GENERATE_ORGANIZATION_ENTITIES) == 0)
+ tag = CODEC_TAG_O;
- if (tag.endsWith("LOC") && (types & GENERATE_LOCATION_ENTITIES) == 0)
- tag = "O";
+ if (tag.endsWith(ENT_TYPE_LOC) && (types & GENERATE_LOCATION_ENTITIES) == 0)
+ tag = CODEC_TAG_O;
- if (tag.endsWith("GPE") && (types & GENERATE_GPE_ENTITIES) == 0)
- tag = "O";
+ if (tag.endsWith(ENT_TYPE_GPE) && (types & GENERATE_GPE_ENTITIES) == 0)
+ tag = CODEC_TAG_O;
- if (tag.startsWith("B-")) {
+ if (tag.startsWith(CODEC_TAG_B)) {
if (beginIndex != -1) {
names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
- beginIndex = -1;
- endIndex = -1;
}
beginIndex = i;
endIndex = i + 1;
}
- else if (tag.startsWith("I-")) {
+ else if (tag.startsWith(CODEC_TAG_I)) {
endIndex++;
}
- else if (tag.equals("O")) {
+ else if (tag.equals(CODEC_TAG_O)) {
if (beginIndex != -1) {
names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
beginIndex = -1;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
index 7fa9db404b..066861f303 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
@@ -19,7 +19,6 @@
import java.io.IOException;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -37,9 +36,10 @@
* @see EvalitaNameSampleStream
*/
@Internal
-public class EvalitaNameSampleStreamFactory extends LanguageSampleStreamFactory {
+public class EvalitaNameSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "it")
String getLang();
@@ -49,17 +49,16 @@ interface Parameters extends BasicFormatParams {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
- "evalita", new EvalitaNameSampleStreamFactory<>(Parameters.class));
+ "evalita", new EvalitaNameSampleStreamFactory(Parameters.class));
}
- protected EvalitaNameSampleStreamFactory(Class params) {
+ protected EvalitaNameSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
-
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
LANGUAGE lang;
if ("it".equals(params.getLang())) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
index bda9d482fb..9aedc4bbea 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
@@ -17,51 +17,34 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
-import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.langdetect.LanguageDetectorSampleStream;
import opennlp.tools.langdetect.LanguageSample;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
- * Factory producing OpenNLP {@link DocumentSampleStream}s.
+ * Factory producing OpenNLP {@link LanguageDetectorSampleStream lang detector sample streams}.
*/
-public class LanguageDetectorSampleStreamFactory
- extends AbstractSampleStreamFactory {
+public class LanguageDetectorSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(LanguageSample.class,
StreamFactoryRegistry.DEFAULT_FORMAT,
- new LanguageDetectorSampleStreamFactory<>(Parameters.class));
+ new LanguageDetectorSampleStreamFactory(Parameters.class));
}
- protected LanguageDetectorSampleStreamFactory(Class params) {
+ protected LanguageDetectorSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new LanguageDetectorSampleStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java
index dfb137e747..f220917097 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java
@@ -17,49 +17,33 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.lemmatizer.LemmaSample;
import opennlp.tools.lemmatizer.LemmaSampleStream;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Factory producing OpenNLP {@link LemmaSampleStream}s.
*/
-public class LemmatizerSampleStreamFactory extends AbstractSampleStreamFactory {
+public class LemmatizerSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(LemmaSample.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new LemmatizerSampleStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new LemmatizerSampleStreamFactory(Parameters.class));
}
- protected LemmatizerSampleStreamFactory(Class params) {
+ protected LemmatizerSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
-
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new LemmaSampleStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java
index 508359bbfe..efc786e1e2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java
@@ -17,50 +17,35 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Factory producing OpenNLP {@link NameSampleDataStream}s.
*/
-public class NameSampleDataStreamFactory extends AbstractSampleStreamFactory {
+@Internal
+public class NameSampleDataStreamFactory extends
+ AbstractSampleStreamFactory {
public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new NameSampleDataStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new NameSampleDataStreamFactory(Parameters.class));
}
- protected NameSampleDataStreamFactory(Class params) {
+ protected NameSampleDataStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
-
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new NameSampleDataStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java
index 6ed457430e..2ce9ec36c0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java
@@ -17,49 +17,33 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.ParseSampleStream;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Factory producing OpenNLP {@link ParseSampleStream}s.
*/
-public class ParseSampleStreamFactory extends AbstractSampleStreamFactory {
+public class ParseSampleStreamFactory extends
+ AbstractSampleStreamFactory {
public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(Parse.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new ParseSampleStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new ParseSampleStreamFactory(Parameters.class));
}
- protected ParseSampleStreamFactory(Class params) {
+ protected ParseSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new ParseSampleStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java
index e002bbb1d7..61b8e6bb50 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java
@@ -17,50 +17,34 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Factory producing OpenNLP {@link SentenceSampleStream}s.
*/
-public class SentenceSampleStreamFactory extends AbstractSampleStreamFactory {
+public class SentenceSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(SentenceSample.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new SentenceSampleStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new SentenceSampleStreamFactory(Parameters.class));
}
- protected SentenceSampleStreamFactory(Class params) {
+ protected SentenceSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new SentenceSampleStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java
index ffbd1e6c64..75dc62d42b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java
@@ -17,49 +17,33 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Factory producing OpenNLP {@link TokenSampleStream}s.
*/
-public class TokenSampleStreamFactory extends LanguageSampleStreamFactory {
+public class TokenSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(TokenSample.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new TokenSampleStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new TokenSampleStreamFactory(Parameters.class));
}
- protected TokenSampleStreamFactory(Class params) {
+ protected TokenSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new TokenSampleStream(lineStream);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
index eb3ab1dff5..c69319ce3c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
@@ -28,6 +28,15 @@
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;
+/**
+ * An {@link ObjectStream} implementation for the Twenty Newsgroups text corpus.
+ *
+ * The document collection was created and donated by: Tom Mitchell,
+ * School of Computer Science, Carnegie Mellon University.
+ *
+ * Details and the data can be found via this DOI:
+ * 10.24432/C5C323.
+ */
public class TwentyNewsgroupSampleStream implements ObjectStream {
private final Tokenizer tokenizer;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
index edf3d5d3e2..85179eaefa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
@@ -26,35 +26,55 @@
import opennlp.tools.cmdline.params.EncodingParameter;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.ThreadSafeTokenizerME;
import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;
-public class TwentyNewsgroupSampleStreamFactory extends AbstractSampleStreamFactory {
+/**
+ * Note: Do not use this class, internal use only!
+ *
+ * @see TwentyNewsgroupSampleStream
+ */
+public class TwentyNewsgroupSampleStreamFactory extends
+ AbstractSampleStreamFactory {
+
+ public interface Parameters extends EncodingParameter {
+ @ArgumentParser.ParameterDescription(valueName = "dataDir",
+ description = "dir containing the 20newsgroup folders")
+ File getDataDir();
+
+ @ArgumentParser.ParameterDescription(valueName = "modelFile")
+ @ArgumentParser.OptionalParameter
+ File getTokenizerModel();
+
+ @ArgumentParser.ParameterDescription(valueName = "name")
+ @ArgumentParser.OptionalParameter
+ String getRuleBasedTokenizer();
+ }
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(DocumentSample.class,
"20newsgroup",
- new TwentyNewsgroupSampleStreamFactory<>(TwentyNewsgroupSampleStreamFactory.Parameters.class));
+ new TwentyNewsgroupSampleStreamFactory(TwentyNewsgroupSampleStreamFactory.Parameters.class));
}
- protected TwentyNewsgroupSampleStreamFactory(Class params) {
+ protected TwentyNewsgroupSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
-
- TwentyNewsgroupSampleStreamFactory.Parameters params =
- ArgumentParser.parse(args, TwentyNewsgroupSampleStreamFactory.Parameters.class);
+ if (args == null) {
+ throw new IllegalArgumentException("Passed args must not be null!");
+ }
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
-
if (params.getTokenizerModel() != null) {
try {
- tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
+ tokenizer = new ThreadSafeTokenizerME(new TokenizerModel(params.getTokenizerModel()));
} catch (IOException e) {
throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
}
@@ -74,24 +94,10 @@ else if ("whitespace".equals(tokenizerName)) {
}
try {
- return new TwentyNewsgroupSampleStream(
- tokenizer, params.getDataDir().toPath());
+ return new TwentyNewsgroupSampleStream(tokenizer, params.getDataDir().toPath());
} catch (IOException e) {
throw new TerminateToolException(-1, "IO error while opening sample data: " + e.getMessage(), e);
}
}
- interface Parameters extends EncodingParameter {
- @ArgumentParser.ParameterDescription(valueName = "dataDir",
- description = "dir containing the 20newsgroup folders")
- File getDataDir();
-
- @ArgumentParser.ParameterDescription(valueName = "modelFile")
- @ArgumentParser.OptionalParameter
- File getTokenizerModel();
-
- @ArgumentParser.ParameterDescription(valueName = "name")
- @ArgumentParser.OptionalParameter
- String getRuleBasedTokenizer();
- }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java
index 4972b4d1db..6b92c41a39 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java
@@ -17,52 +17,37 @@
package opennlp.tools.formats;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.commons.Internal;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.WordTagSampleStream;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Note:
* Do not use this class, internal use only!
*/
@Internal
-public class WordTagSampleStreamFactory extends AbstractSampleStreamFactory {
+public class WordTagSampleStreamFactory extends
+ AbstractSampleStreamFactory {
public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(POSSample.class,
- StreamFactoryRegistry.DEFAULT_FORMAT, new WordTagSampleStreamFactory<>(Parameters.class));
+ StreamFactoryRegistry.DEFAULT_FORMAT, new WordTagSampleStreamFactory(Parameters.class));
}
- protected WordTagSampleStreamFactory(Class params) {
+ protected WordTagSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
-
- CmdLineUtil.checkInputFile("Data", params.getData());
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new WordTagSampleStream(lineStream);
}
+
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
index cdcbd9d1ae..6c51ee6882 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
@@ -104,14 +104,14 @@ public ChunkSample read() throws IOException {
index++;
// skip this one
} else {
- Node root = paragraph.getRoot();
+ Node root = paragraph.root();
List sentence = new ArrayList<>();
List tags = new ArrayList<>();
List target = new ArrayList<>();
processRoot(root, sentence, tags, target);
- if (sentence.size() > 0) {
+ if (!sentence.isEmpty()) {
index++;
return new ChunkSample(sentence, tags, target);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
index 49922f8c22..95183dad26 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
@@ -17,21 +17,16 @@
package opennlp.tools.formats.ad;
-import java.io.File;
-import java.io.IOException;
import java.nio.charset.Charset;
import opennlp.tools.chunker.ChunkSample;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* A Factory to create a Arvores Deitadas ChunkStream from the command line
@@ -41,18 +36,16 @@
* Do not use this class, internal use only!
*/
@Internal
-public class ADChunkSampleStreamFactory extends LanguageSampleStreamFactory {
+public class ADChunkSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters {
+ public interface Parameters extends BasicFormatParams {
//all have to be repeated, because encoding is not optional,
//according to the check if (encoding == null) { below (now removed)
@ParameterDescription(valueName = "charsetName",
description = "encoding for reading and writing text, if absent the system default is used.")
Charset getEncoding();
- @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.")
- File getData();
-
@ParameterDescription(valueName = "language", description = "language which is being processed.")
String getLang();
@@ -67,26 +60,18 @@ interface Parameters {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(ChunkSample.class,
- "ad", new ADChunkSampleStreamFactory<>(Parameters.class));
+ "ad", new ADChunkSampleStreamFactory(Parameters.class));
}
- protected ADChunkSampleStreamFactory(Class params) {
+ protected ADChunkSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
-
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
language = params.getLang();
-
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
+ ObjectStream lineStream = readData(args, Parameters.class);
ADChunkSampleStream sampleStream = new ADChunkSampleStream(lineStream);
if (params.getStart() != null && params.getStart() > -1) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
index dae804be5f..d2db063515 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
@@ -206,7 +206,7 @@ public NameSample read() throws IOException {
textID = currentTextID;
}
- Node root = paragraph.getRoot();
+ Node root = paragraph.root();
List sentence = new ArrayList<>();
List names = new ArrayList<>();
process(root, sentence, names);
@@ -438,7 +438,7 @@ enum Type {
private int getTextID(Sentence paragraph) {
- final String meta = paragraph.getMetadata();
+ final String meta = paragraph.metadata();
Type corpusType;
Pattern metaPattern;
int textIdMeta2 = -1;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
index 525b40972c..b1af32eb61 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
@@ -17,21 +17,16 @@
package opennlp.tools.formats.ad;
-import java.io.File;
-import java.io.IOException;
import java.nio.charset.Charset;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.namefind.NameSample;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* A Factory to create a Arvores Deitadas NameSampleDataStream from the command line
@@ -41,18 +36,16 @@
* Do not use this class, internal use only!
*/
@Internal
-public class ADNameSampleStreamFactory extends LanguageSampleStreamFactory {
+public class ADNameSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters {
+ public interface Parameters extends BasicFormatParams {
//all have to be repeated, because encoding is not optional,
//according to the check if (encoding == null) { below (now removed)
@ParameterDescription(valueName = "charsetName",
description = "encoding for reading and writing text, if absent the system default is used.")
Charset getEncoding();
- @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.")
- File getData();
-
@ParameterDescription(valueName = "split",
description = "if true all hyphenated tokens will be separated (default true)")
@OptionalParameter(defaultValue = "true")
@@ -64,27 +57,18 @@ interface Parameters {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(NameSample.class,
- "ad", new ADNameSampleStreamFactory<>(Parameters.class));
+ "ad", new ADNameSampleStreamFactory(Parameters.class));
}
- protected ADNameSampleStreamFactory(Class params) {
+ protected ADNameSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
-
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
language = params.getLang();
-
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new ADNameSampleStream(lineStream, params.getSplitHyphenatedTokens());
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
index 742e27e696..8b7bd96416 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
@@ -84,7 +84,7 @@ public ADPOSSampleStream(InputStreamFactory in, String charsetName,
public POSSample read() throws IOException {
Sentence paragraph;
if ((paragraph = this.adSentenceStream.read()) != null) {
- Node root = paragraph.getRoot();
+ Node root = paragraph.root();
List sentence = new ArrayList<>();
List tags = new ArrayList<>();
process(root, sentence, tags);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
index 80dff4767c..88771d7100 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
@@ -17,38 +17,30 @@
package opennlp.tools.formats.ad;
-import java.io.File;
-import java.io.IOException;
import java.nio.charset.Charset;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.postag.POSSample;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Note:
* Do not use this class, internal use only!
*/
@Internal
-public class ADPOSSampleStreamFactory extends
- LanguageSampleStreamFactory {
+public class ADPOSSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters {
+ public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "charsetName",
description = "encoding for reading and writing text, if absent the system default is used.")
Charset getEncoding();
- @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.")
- File getData();
-
@ParameterDescription(valueName = "language", description = "language which is being processed.")
String getLang();
@@ -64,27 +56,18 @@ interface Parameters {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(POSSample.class, "ad",
- new ADPOSSampleStreamFactory<>(Parameters.class));
+ new ADPOSSampleStreamFactory(Parameters.class));
}
- protected ADPOSSampleStreamFactory(Class params) {
+ protected ADPOSSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
-
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
language = params.getLang();
-
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new ADPOSSampleStream(lineStream, params.getExpandME(), params.getIncludeFeatures());
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java
index d5a3401b9b..c78bf2b9a4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java
@@ -99,9 +99,9 @@ public SentenceSample read() throws IOException {
do {
do {
if (!isTitle || isIncludeTitles) {
- if (hasPunctuation(sent.getText())) {
+ if (hasPunctuation(sent.text())) {
int start = document.length();
- document.append(sent.getText());
+ document.append(sent.text());
sentences.add(new Span(start, document.length()));
document.append(" ");
}
@@ -116,7 +116,7 @@ public SentenceSample read() throws IOException {
while (isSameText);
String doc;
- if (document.length() > 0) {
+ if (!document.isEmpty()) {
doc = document.substring(0, document.length() - 1);
} else {
doc = document.toString();
@@ -127,7 +127,7 @@ public SentenceSample read() throws IOException {
private boolean hasPunctuation(String text) {
text = text.trim();
- if (text.length() > 0) {
+ if (!text.isEmpty()) {
char lastChar = text.charAt(text.length() - 1);
return Arrays.binarySearch(ptEosCharacters, lastChar) >= 0;
}
@@ -135,13 +135,12 @@ private boolean hasPunctuation(String text) {
}
// there are some different types of metadata depending on the corpus.
- // TODO Merge this patterns
- private static final Pattern META_1 = Pattern
- .compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
+ // TODO Merge these patterns
+ private static final Pattern META_1 = Pattern.compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
private void updateMeta() {
if (this.sent != null) {
- String meta = this.sent.getMetadata();
+ String meta = this.sent.metadata();
Matcher m = META_1.matcher(meta);
int currentText;
int currentPara;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
index 48748c9205..50c8f593fc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
@@ -17,37 +17,29 @@
package opennlp.tools.formats.ad;
-import java.io.File;
-import java.io.IOException;
import java.nio.charset.Charset;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.sentdetect.SentenceSample;
-import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
/**
* Note:
* Do not use this class, internal use only!
*/
@Internal
-public class ADSentenceSampleStreamFactory extends
- LanguageSampleStreamFactory {
+public class ADSentenceSampleStreamFactory extends
+ LanguageSampleStreamFactory {
- interface Parameters {
+ public interface Parameters extends BasicFormatParams {
@ParameterDescription(valueName = "charsetName", description = "encoding for reading and writing text.")
Charset getEncoding();
- @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.")
- File getData();
-
@ParameterDescription(valueName = "language", description = "language which is being processed.")
String getLang();
@@ -59,29 +51,19 @@ interface Parameters {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(SentenceSample.class, "ad",
- new ADSentenceSampleStreamFactory<>(Parameters.class));
+ new ADSentenceSampleStreamFactory(Parameters.class));
}
- protected ADSentenceSampleStreamFactory(Class params) {
+ protected ADSentenceSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
-
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
language = params.getLang();
-
boolean includeTitle = params.getIncludeTitles();
- InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
- ObjectStream lineStream = null;
- try {
- lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
- } catch (IOException ex) {
- CmdLineUtil.handleCreateObjectStreamError(ex);
- }
-
+ ObjectStream lineStream = readData(args, Parameters.class);
return new ADSentenceSampleStream(lineStream, includeTitle);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
index 27e9174940..1445f153b5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
@@ -51,38 +51,8 @@
@Internal
public class ADSentenceStream extends FilterObjectStream {
- public static class Sentence {
-
- private String text;
- private Node root;
- private String metadata;
-
+ public record Sentence (String text, Node root, String metadata) {
public static final String META_LABEL_FINAL = "final";
-
- public String getText() {
- return text;
- }
-
- public void setText(String text) {
- this.text = text;
- }
-
- public Node getRoot() {
- return root;
- }
-
- public void setRoot(Node root) {
- this.root = root;
- }
-
- public void setMetadata(String metadata) {
- this.metadata = metadata;
- }
-
- public String getMetadata() {
- return metadata;
- }
-
}
/**
@@ -116,7 +86,7 @@ public static class SentenceParser {
* @return A {@link Sentence} instance parsed from {@code sentenceString}.
*/
public Sentence parse(String sentenceString, int para, boolean isTitle, boolean isBox) {
- Sentence sentence = new Sentence();
+ Sentence sentence;
Node root = new Node();
try (BufferedReader reader = new BufferedReader(new StringReader(sentenceString))) {
// first line is
@@ -153,8 +123,7 @@ public Sentence parse(String sentenceString, int para, boolean isTitle, boolean
meta = line.substring(0, start) + " p=" + para + titleTag + boxTag + metaFromSource;
}
}
- sentence.setText(text);
- sentence.setMetadata(meta);
+ sentence = new Sentence(text, root, meta);
// now we look for the root node
do {
line = reader.readLine();
@@ -232,10 +201,9 @@ public Sentence parse(String sentenceString, int para, boolean isTitle, boolean
} catch (Exception e) {
logger.warn("Caught exception for the given sentence: '{}'", sentenceString, e);
- return sentence;
+ return null;
}
// second line should be SOURCE
- sentence.setRoot(root);
return sentence;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
index 7a93006a3c..a720b1abbd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
@@ -32,24 +32,24 @@
* Do not use this class, internal use only!
*/
@Internal
-public class ADTokenSampleStreamFactory extends
- DetokenizerSampleStreamFactory {
+public class ADTokenSampleStreamFactory extends
+ DetokenizerSampleStreamFactory {
- interface Parameters extends ADNameSampleStreamFactory.Parameters, DetokenizerParameter {
+ public interface Parameters extends ADNameSampleStreamFactory.Parameters, DetokenizerParameter {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(TokenSample.class, "ad",
- new ADTokenSampleStreamFactory<>(Parameters.class));
+ new ADTokenSampleStreamFactory(Parameters.class));
}
- protected ADTokenSampleStreamFactory(Class params) {
+ protected ADTokenSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
ObjectStream samples = StreamFactoryRegistry.getFactory(
NameSample.class, "ad").create(
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java
index c734ba46c2..36b1f79a9d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java
@@ -40,6 +40,7 @@
public class PortugueseContractionUtility {
protected static final Map CONTRACTIONS;
+ private static final String SYMBOL_PLUS = "+";
static {
Map elems = new HashMap<>();
@@ -162,7 +163,7 @@ public class PortugueseContractionUtility {
* @return The merged contraction.
*/
public static String toContraction(String left, String right) {
- String key = left + "+" + right;
+ String key = left + SYMBOL_PLUS + right;
if (CONTRACTIONS.containsKey(key)) {
return CONTRACTIONS.get(key);
} else {
@@ -171,7 +172,7 @@ public static String toContraction(String left, String right) {
for (int i = 0; i < parts.length - 1; i++) {
sb.append(parts[i]).append(" ");
}
- key = parts[parts.length - 1] + "+" + right;
+ key = parts[parts.length - 1] + SYMBOL_PLUS + right;
if (CONTRACTIONS.containsKey(key)) {
sb.append(CONTRACTIONS.get(key));
return sb.toString();
@@ -180,7 +181,7 @@ public static String toContraction(String left, String right) {
if (right.contains("_")) {
parts = right.split("_");
- key = left + "+" + parts[0];
+ key = left + SYMBOL_PLUS + parts[0];
if (CONTRACTIONS.containsKey(key)) {
sb.append(CONTRACTIONS.get(key)).append(" ");
@@ -194,7 +195,7 @@ public static String toContraction(String left, String right) {
}
String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]);
- key = leftLower + "+" + right;
+ key = leftLower + SYMBOL_PLUS + right;
if (CONTRACTIONS.containsKey(key)) {
String r = CONTRACTIONS.get(key);
String firstChar = r.substring(0, 1);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java
index b1c7703f5c..1b669d7b76 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java
@@ -30,6 +30,13 @@
import opennlp.tools.tokenize.WhitespaceTokenizer;
+/**
+ * Encapsulates a type to class mapping for entities, relations, events, etc.
+ *
+ * Details on how an annotation configuration file should be structured can be found
+ * in the brat annotation configuration
+ * section of the official BRAT documentation.
+ */
public class AnnotationConfiguration {
public static final String SPAN_TYPE = "Span";
@@ -38,68 +45,93 @@ public class AnnotationConfiguration {
public static final String ATTRIBUTE_TYPE = "Attribute";
public static final String EVENT_TYPE = "Event";
+ private static final String SYMBOL_HASH = "#";
+ private static final String BRACKET_OPEN = "[";
+ private static final String BRACKET_CLOSE = "]";
+
private final Map typeToClassMap;
+ /**
+ * Initializes an {@link AnnotationConfiguration} with the specified {@code typeToClassMap}.
+ * @param typeToClassMap A type to class mapping. Must not be {@code null}.
+ */
public AnnotationConfiguration(Map typeToClassMap) {
this.typeToClassMap = Map.copyOf(typeToClassMap);
}
+ /**
+ * @param type The type to get the type class for.
+ * @return Retrieves the class for the specified {@code type}, {@code null} if not found.
+ */
public String getTypeClass(String type) {
return typeToClassMap.get(type);
}
-
+ /**
+ * Parses a given {@link InputStream} into an {@link AnnotationConfiguration}.
+ *
+ * @param in A valid {@link InputStream} from which the config should
+ * be read. Must not be {@code null} and must be in the correct format,
+ * see:
+ * Brat annotation configuration
+ *
+ * @return A valid {@link AnnotationConfiguration} instance.
+ * @throws IOException Thrown if IO errors occurred during parsing.
+ */
public static AnnotationConfiguration parse(InputStream in) throws IOException {
Map typeToClassMap = new HashMap<>();
- BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
-
- // Note: This only supports entities and relations section
- String line;
- String sectionType = null;
-
- while ((line = reader.readLine()) != null) {
- line = line.trim();
-
- if (!line.isEmpty()) {
- if (!line.startsWith("#")) {
- if (line.startsWith("[") && line.endsWith("]")) {
- sectionType = line.substring(line.indexOf('[') + 1, line.indexOf(']'));
- }
- else {
- String typeName = WhitespaceTokenizer.INSTANCE.tokenize(line)[0];
-
- switch (sectionType) {
- case "entities":
- typeToClassMap.put(typeName, AnnotationConfiguration.ENTITY_TYPE);
- break;
-
- case "relations":
- typeToClassMap.put(typeName, AnnotationConfiguration.RELATION_TYPE);
- break;
-
- case "attributes":
- typeToClassMap.put(typeName, AnnotationConfiguration.ATTRIBUTE_TYPE);
- break;
-
- case "events":
- typeToClassMap.put(typeName, AnnotationConfiguration.EVENT_TYPE);
- break;
-
- default:
- break;
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
+ // Note: This only supports the entities, relations, attributes and events sections
+ String line;
+ String sectionType = null;
+
+ while ((line = reader.readLine()) != null) {
+ line = line.trim();
+
+ if (!line.isEmpty()) {
+ if (!line.startsWith(SYMBOL_HASH)) {
+ if (line.startsWith(BRACKET_OPEN) && line.endsWith(BRACKET_CLOSE)) {
+ sectionType = line.substring(line.indexOf('[') + 1, line.indexOf(']'));
+ } else {
+ String typeName = WhitespaceTokenizer.INSTANCE.tokenize(line)[0];
+
+ switch (sectionType) {
+ case "entities":
+ typeToClassMap.put(typeName, AnnotationConfiguration.ENTITY_TYPE);
+ break;
+ case "relations":
+ typeToClassMap.put(typeName, AnnotationConfiguration.RELATION_TYPE);
+ break;
+ case "attributes":
+ typeToClassMap.put(typeName, AnnotationConfiguration.ATTRIBUTE_TYPE);
+ break;
+ case "events":
+ typeToClassMap.put(typeName, AnnotationConfiguration.EVENT_TYPE);
+ break;
+ default:
+ break;
+ }
}
}
}
}
}
-
return new AnnotationConfiguration(typeToClassMap);
}
+ /**
+ * Parses a given {@link File annConfigFile} into an {@link AnnotationConfiguration}.
+ *
+ * @param annConfigFile A valid {@link File annConfigFile} from which the config should
+ * be read. Must not be {@code null} and must be in the correct format,
+ * see:
+ * Brat annotation configuration
+ *
+ * @return A valid {@link AnnotationConfiguration} instance.
+ * @throws IOException Thrown if IO errors occurred during parsing.
+ */
public static AnnotationConfiguration parse(File annConfigFile) throws IOException {
- try (InputStream in = new BufferedInputStream(new FileInputStream(annConfigFile))) {
- return parse(in);
- }
+ return parse(new BufferedInputStream(new FileInputStream(annConfigFile)));
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
index e45797d414..43dc3a2535 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
@@ -199,8 +199,7 @@ static class AnnotatorNoteParser extends BratAnnotationParser {
@Override
BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException {
-
- Span noteSpan = new Span( tokens[START_VALUE_OFFSET].getStart(),
+ Span noteSpan = new Span( tokens[START_VALUE_OFFSET].getStart(),
tokens[tokens.length - 1].getEnd() );
return new AnnotatorNoteAnnotation(tokens[ID_OFFSET].getCoveredText(line).toString(),
@@ -208,6 +207,7 @@ BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException {
noteSpan.getCoveredText(line).toString());
}
}
+
private final AnnotationConfiguration config;
private final BufferedReader reader;
private final String id;
@@ -219,10 +219,9 @@ BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException {
reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
}
+ @Override
public BratAnnotation read() throws IOException {
-
String line = reader.readLine();
-
if (line != null) {
Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(line);
@@ -259,7 +258,7 @@ public BratAnnotation read() throws IOException {
}
break;
default:
- // Skip it, do that for everything unsupported (e.g. "*" id)
+ // Skip it, do that for everything unsupported (e.g. "*" id)
return read();
}
@@ -275,10 +274,12 @@ public BratAnnotation read() throws IOException {
return null;
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
reader.reset();
}
+ @Override
public void close() throws IOException {
reader.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
index 8f786749b9..8f14d54b3a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
@@ -100,25 +100,22 @@ public Collection getAnnotations() {
public static BratDocument parseDocument(AnnotationConfiguration config, String id,
InputStream txtIn, InputStream annIn) throws IOException {
- Reader txtReader = new InputStreamReader(txtIn, StandardCharsets.UTF_8);
-
- StringBuilder text = new StringBuilder();
-
- char[] cbuf = new char[1024];
-
- int len;
- while ((len = txtReader.read(cbuf)) > 0) {
- text.append(cbuf, 0, len);
- }
-
- Collection annotations = new ArrayList<>();
- ObjectStream annStream = new BratAnnotationStream(config, id, annIn);
- BratAnnotation ann;
- while ((ann = annStream.read()) != null) {
- annotations.add(ann);
+ try (Reader txtReader = new InputStreamReader(txtIn, StandardCharsets.UTF_8);
+ ObjectStream annStream = new BratAnnotationStream(config, id, annIn)) {
+
+ StringBuilder text = new StringBuilder();
+ char[] cbuf = new char[1024];
+ int len;
+ while ((len = txtReader.read(cbuf)) > 0) {
+ text.append(cbuf, 0, len);
+ }
+ Collection annotations = new ArrayList<>();
+ BratAnnotation ann;
+ while ((ann = annStream.read()) != null) {
+ annotations.add(ann);
+ }
+ return new BratDocument(config, id, text.toString(), annotations);
}
- annStream.close();
-
- return new BratDocument(config, id, text.toString(), annotations);
}
+
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index d799b4c474..feb3a4c3a2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -38,44 +38,41 @@ public class BratNameSampleStream extends SegmenterObjectStream samples) {
- super(samples);
-
- this.parser = new BratDocumentParser(sentDetector, tokenizer, null);
+ this(sentDetector, tokenizer, samples, null);
}
/**
- * Creates a new {@link BratNameSampleStream}.
- * @param sentModel a {@link SentenceModel} model
- * @param tokenModel a {@link TokenizerModel} model
- * @param samples a {@link BratDocument} {@link ObjectStream}
+ * Initializes a new {@link BratNameSampleStream} with the specified (model) parameters.
+ *
+ * @param sentModel A valid {@link SentenceModel sentence detection model}.
+ * @param tokenModel A valid {@link TokenizerModel tokenizer model}.
+ * @param samples The {@link BratDocument} {@link ObjectStream} to process.
*/
public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
ObjectStream samples) {
- super(samples);
-
- // TODO: We can pass in custom validators here ...
- this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
- new TokenizerME(tokenModel), null);
+ this(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel), samples, null);
}
/**
* Creates a new {@link BratNameSampleStream}.
- * @param sentDetector a {@link SentenceDetector} instance
- * @param tokenizer a {@link Tokenizer} instance
- * @param samples a {@link BratDocument} {@link ObjectStream}
- * @param nameTypes the name types to use or null if all name types
+ * @param sentDetector A valid {@link SentenceDetector} instance.
+ * @param tokenizer A valid {@link Tokenizer} instance.
+ * @param samples The {@link BratDocument} {@link ObjectStream} to process.
+ *
+ * @param nameTypes the name types to use or {@code null} if all name types.
*/
public BratNameSampleStream(SentenceDetector sentDetector,
Tokenizer tokenizer, ObjectStream samples, Set nameTypes) {
super(samples);
-
this.parser = new BratDocumentParser(sentDetector, tokenizer, nameTypes);
}
@@ -84,13 +81,12 @@ public BratNameSampleStream(SentenceDetector sentDetector,
* @param sentModel a {@link SentenceModel} model
* @param tokenModel a {@link TokenizerModel} model
* @param samples a {@link BratDocument} {@link ObjectStream}
- * @param nameTypes the name types to use or null if all name types
+ * @param nameTypes the name types to use or {@code null} if all name types
*/
public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
ObjectStream samples, Set nameTypes) {
super(samples);
-
- // TODO: We can pass in custom validators here ...
+ // Hint: We can pass in custom validators here ...
this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
new TokenizerME(tokenModel), nameTypes);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
index d5e8793ee1..9017c944e4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
@@ -41,10 +41,15 @@
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;
+/**
+ * Note: Do not use this class, internal use only!
+ *
+ * @see BratNameSampleStream
+ */
public class BratNameSampleStreamFactory
extends AbstractSampleStreamFactory {
- interface Parameters {
+ public interface Parameters {
@ParameterDescription(valueName = "bratDataDir", description = "location of brat data dir")
File getBratDataDir();
@@ -76,37 +81,40 @@ protected BratNameSampleStreamFactory() {
super(Parameters.class);
}
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(NameSample.class, "brat",
+ new BratNameSampleStreamFactory());
+ }
+
/**
- * Checks that non of the passed values are null.
+ * Checks that none of the passed values are {@code null}.
*
- * @param objects
- * @return true or false
+ * @param objects The objects to check for {@code null}.
+ * @return {@code true} if none of the objects is {@code null}, {@code false} otherwise.
*/
private boolean notNull(Object... objects) {
-
for (Object obj : objects) {
if (obj == null)
return false;
}
-
return true;
}
@Override
public ObjectStream create(String[] args) {
-
+ if (args == null) {
+ throw new IllegalArgumentException("Passed args must not be null!");
+ }
Parameters params = ArgumentParser.parse(args, Parameters.class);
if (notNull(params.getRuleBasedTokenizer(), params.getTokenizerModel())) {
throw new TerminateToolException(-1, "Either use rule based or statistical tokenizer!");
}
- // TODO: Provide the file name to the annotation.conf file and implement the parser ...
AnnotationConfiguration annConfig;
try {
annConfig = AnnotationConfiguration.parse(params.getAnnotationConfig());
- }
- catch (IOException e) {
+ } catch (IOException e) {
throw new TerminateToolException(1, "Failed to parse annotation.conf file!");
}
@@ -121,37 +129,30 @@ public ObjectStream create(String[] args) {
}
SentenceDetector sentDetector;
-
if (params.getSentenceDetectorModel() != null) {
try {
sentDetector = new SentenceDetectorME(new SentenceModel(params.getSentenceDetectorModel()));
} catch (IOException e) {
throw new TerminateToolException(-1, "Failed to load sentence detector model!", e);
}
- }
- else {
+ } else {
sentDetector = new NewlineSentenceDetector();
}
Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
-
if (params.getTokenizerModel() != null) {
try {
tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
} catch (IOException e) {
throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
}
- }
- else if (params.getRuleBasedTokenizer() != null) {
+ } else if (params.getRuleBasedTokenizer() != null) {
String tokenizerName = params.getRuleBasedTokenizer();
-
if ("simple".equals(tokenizerName)) {
tokenizer = SimpleTokenizer.INSTANCE;
- }
- else if ("whitespace".equals(tokenizerName)) {
+ } else if ("whitespace".equals(tokenizerName)) {
tokenizer = WhitespaceTokenizer.INSTANCE;
- }
- else {
+ } else {
throw new TerminateToolException(-1, "Unknown tokenizer: " + tokenizerName);
}
}
@@ -167,8 +168,4 @@ else if ("whitespace".equals(tokenizerName)) {
return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes);
}
- public static void registerFactory() {
- StreamFactoryRegistry.registerFactory(NameSample.class, "brat",
- new BratNameSampleStreamFactory());
- }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java
index 82c4e4aa41..05ee3e3dbb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java
@@ -35,6 +35,7 @@ public SegmenterObjectStream(ObjectStream in) {
protected abstract List read(S sample) throws IOException;
+ @Override
public final T read() throws IOException {
if (sampleIt.hasNext()) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
index c7b1f77bf8..2dedf86cea 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -32,11 +32,15 @@
/**
* Note: Do not use this class, internal use only!
+ *
+ * @see LemmaSample
+ * @see ConlluLemmaSampleStream
*/
@Internal
-public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory {
+public class ConlluLemmaSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
@ArgumentParser.ParameterDescription(valueName = "tagset",
description = "u|x u for unified tags and x for language-specific part-of-speech tags")
@ArgumentParser.OptionalParameter(defaultValue = "u")
@@ -46,15 +50,15 @@ interface Parameters extends BasicFormatParams {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(LemmaSample.class,
ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
- new ConlluLemmaSampleStreamFactory<>(Parameters.class));
+ new ConlluLemmaSampleStreamFactory(Parameters.class));
}
- protected ConlluLemmaSampleStreamFactory(Class params) {
+ protected ConlluLemmaSampleStreamFactory(Class params) {
super(params);
}
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
ConlluTagset tagset = switch (params.getTagset()) {
case "u" -> ConlluTagset.U;
@@ -68,7 +72,6 @@ public ObjectStream create(String[] args) {
try {
return new ConlluLemmaSampleStream(new ConlluStream(inFactory), tagset);
} catch (IOException e) {
- // That will throw an exception
CmdLineUtil.handleCreateObjectStreamError(e);
}
return null;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
index 6601dbb64d..9a40b0a178 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
@@ -32,13 +32,17 @@
/**
* Note: Do not use this class, internal use only!
+ *
+ * @see POSSample
+ * @see ConlluPOSSampleStream
*/
@Internal
-public class ConlluPOSSampleStreamFactory extends AbstractSampleStreamFactory {
+public class ConlluPOSSampleStreamFactory extends
+ AbstractSampleStreamFactory {
public static final String CONLLU_FORMAT = "conllu";
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
@ArgumentParser.ParameterDescription(valueName = "tagset",
description = "u|x u for unified tags and x for language-specific part-of-speech tags")
@ArgumentParser.OptionalParameter(defaultValue = "u")
@@ -47,15 +51,16 @@ interface Parameters extends BasicFormatParams {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(POSSample.class,
- CONLLU_FORMAT, new ConlluPOSSampleStreamFactory<>(Parameters.class));
+ CONLLU_FORMAT, new ConlluPOSSampleStreamFactory(Parameters.class));
}
- protected ConlluPOSSampleStreamFactory(Class params) {
+ protected ConlluPOSSampleStreamFactory(Class params) {
super(params);
}
+ @Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
ConlluTagset tagset = switch (params.getTagset()) {
case "u" -> ConlluTagset.U;
@@ -69,7 +74,6 @@ public ObjectStream create(String[] args) {
try {
return new ConlluPOSSampleStream(new ConlluStream(inFactory), tagset);
} catch (IOException e) {
- // That will throw an exception
CmdLineUtil.handleCreateObjectStreamError(e);
}
return null;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
index 695534d188..2150be309f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -44,9 +44,7 @@ public class ConlluSentence {
public ConlluSentence(List wordLines, String sentenceIdComment, String textComment,
boolean newDocument, String documentId, boolean newParagraph, String paragraphId,
Map textLang, String translit) {
- this.wordLines = wordLines;
- this.sentenceIdComment = sentenceIdComment;
- this.textComment = textComment;
+ this(wordLines, sentenceIdComment, textComment);
this.newDocument = newDocument;
this.documentId = documentId;
this.newParagraph = newParagraph;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
index 3b1164caf6..6e66575954 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
@@ -30,15 +30,16 @@
import opennlp.tools.util.ObjectStream;
/**
- * Note:
- * Do not use this class, internal use only!
+ * Note: Do not use this class, internal use only!
*
+ * @see SentenceSample
* @see ConlluSentenceSampleStream
*/
@Internal
-public class ConlluSentenceSampleStreamFactory extends AbstractSampleStreamFactory {
+public class ConlluSentenceSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
@ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
description = "number of sentences per sample")
String getSentencesPerSample();
@@ -47,16 +48,16 @@ interface Parameters extends BasicFormatParams {
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(SentenceSample.class,
ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
- new ConlluSentenceSampleStreamFactory<>(ConlluSentenceSampleStreamFactory.Parameters.class));
+ new ConlluSentenceSampleStreamFactory(ConlluSentenceSampleStreamFactory.Parameters.class));
}
- protected ConlluSentenceSampleStreamFactory(Class params) {
+ protected ConlluSentenceSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
InputStreamFactory inFactory =
CmdLineUtil.createInputStreamFactory(params.getData());
@@ -65,7 +66,6 @@ public ObjectStream create(String[] args) {
return new ConlluSentenceSampleStream(new ConlluStream(inFactory),
Integer.parseInt(params.getSentencesPerSample()));
} catch (IOException e) {
- // That will throw an exception
CmdLineUtil.handleCreateObjectStreamError(e);
}
return null;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
index 5f813a65fe..ac3b18d479 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
@@ -19,7 +19,6 @@
import java.io.IOException;
-import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
@@ -30,30 +29,31 @@
import opennlp.tools.util.ObjectStream;
/**
- * Note:
- * Do not use this class, internal use only!
+ * Note: Do not use this class, internal use only!
*
+ * @see TokenSample
* @see ConlluTokenSampleStream
*/
@Internal
-public class ConlluTokenSampleStreamFactory extends AbstractSampleStreamFactory {
+public class ConlluTokenSampleStreamFactory extends
+ AbstractSampleStreamFactory {
- interface Parameters extends BasicFormatParams {
+ public interface Parameters extends BasicFormatParams {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(TokenSample.class,
ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
- new ConlluTokenSampleStreamFactory<>(ConlluTokenSampleStreamFactory.Parameters.class));
+ new ConlluTokenSampleStreamFactory(ConlluTokenSampleStreamFactory.Parameters.class));
}
- protected ConlluTokenSampleStreamFactory(Class params) {
+ protected ConlluTokenSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
InputStreamFactory inFactory =
CmdLineUtil.createInputStreamFactory(params.getData());
@@ -61,7 +61,6 @@ public ObjectStream create(String[] args) {
try {
return new ConlluTokenSampleStream(new ConlluStream(inFactory));
} catch (IOException e) {
- // That will throw an exception
CmdLineUtil.handleCreateObjectStreamError(e);
}
return null;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java
index ad5354e4c3..7297126bad 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java
@@ -34,23 +34,24 @@
* @see NameToSentenceSampleStream
*/
@Internal
-public class NameToSentenceSampleStreamFactory extends DetokenizerSampleStreamFactory {
+public class NameToSentenceSampleStreamFactory extends
+ DetokenizerSampleStreamFactory {
interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(SentenceSample.class,
- "namefinder", new NameToSentenceSampleStreamFactory<>(Parameters.class));
+ "namefinder", new NameToSentenceSampleStreamFactory(Parameters.class));
}
- protected NameToSentenceSampleStreamFactory(Class params) {
+ protected NameToSentenceSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
ObjectStream nameSampleStream = StreamFactoryRegistry.getFactory(
NameSample.class, StreamFactoryRegistry.DEFAULT_FORMAT).create(
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java
index 4d2c10bc61..54392331bc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java
@@ -34,23 +34,24 @@
* @see NameToTokenSampleStream
*/
@Internal
-public class NameToTokenSampleStreamFactory extends DetokenizerSampleStreamFactory {
+public class NameToTokenSampleStreamFactory extends
+ DetokenizerSampleStreamFactory {
- interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter {
+ public interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter {
}
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(TokenSample.class,
- "namefinder", new NameToTokenSampleStreamFactory<>(Parameters.class));
+ "namefinder", new NameToTokenSampleStreamFactory(Parameters.class));
}
- protected NameToTokenSampleStreamFactory(Class params) {
+ protected NameToTokenSampleStreamFactory(Class params) {
super(params);
}
@Override
public ObjectStream create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ Parameters params = validateBasicFormatParameters(args, Parameters.class);
ObjectStream nameSampleStream = StreamFactoryRegistry.getFactory(
NameSample.class, StreamFactoryRegistry.DEFAULT_FORMAT).create(
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java
index 25162683ff..8aaf104fc0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java
@@ -34,23 +34,24 @@
* @see POSToSentenceSampleStream
*/
@Internal
-public class POSToSentenceSampleStreamFactory extends DetokenizerSampleStreamFactory {
+public class POSToSentenceSampleStreamFactory extends
+ DetokenizerSampleStreamFactory