Skip to content

Commit f6df357

Browse files
committed
OPENNLP-1695: Add more tests for classes in formats package
- introduces AbstractSampleStreamFactoryTest as common base class - reduces code duplication in format factory classes - adds a ton of new test classes for format factories (+ sample data) - adds two more Evalita samples taken from Appendix of: https://www.evalita.it/wp-content/uploads/2021/11/Guidelines_evalita09_NER.pdf - adds two OntoNotes samples from the public, official v5.0 release notes (sec 6.4 + 6.8), see: https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
1 parent e86b47f commit f6df357

File tree

161 files changed

+6146
-1060
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

161 files changed

+6146
-1060
lines changed

opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@
3131
* Parses the conll 2000 shared task shallow parser training data.
3232
* <p>
3333
* Data format is specified on the conll page:<br>
34-
* <a href="http://www.cnts.ua.ac.be/conll2000/chunking/">
35-
* http://www.cnts.ua.ac.be/conll2000/chunking/</a>
34+
* <a href="https://www.cnts.ua.ac.be/conll2000/chunking/">
35+
* https://www.cnts.ua.ac.be/conll2000/chunking/</a>
3636
*/
3737
public class ChunkSampleStream extends FilterObjectStream<String, ChunkSample> {
3838

@@ -57,7 +57,7 @@ public ChunkSample read() throws IOException {
5757
for (String line = samples.read(); line != null && !line.isEmpty(); line = samples.read()) {
5858
String[] parts = line.split(" ");
5959
if (parts.length != 3) {
60-
logger.error("Skipping corrupt line: {}", line);
60+
logger.warn("Skipping corrupt line: {}", line);
6161
}
6262
else {
6363
toks.add(parts[0]);
@@ -66,11 +66,11 @@ public ChunkSample read() throws IOException {
6666
}
6767
}
6868

69-
if (toks.size() > 0) {
69+
if (!toks.isEmpty()) {
7070
return new ChunkSample(toks.toArray(new String[0]),
7171
tags.toArray(new String[0]), preds.toArray(new String[0]));
72+
} else {
73+
return null;
7274
}
73-
74-
return null;
7575
}
7676
}

opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java

+48
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,15 @@
1717

1818
package opennlp.tools.formats;
1919

20+
import java.io.IOException;
21+
22+
import opennlp.tools.cmdline.ArgumentParser;
23+
import opennlp.tools.cmdline.CmdLineUtil;
2024
import opennlp.tools.cmdline.ObjectStreamFactory;
25+
import opennlp.tools.cmdline.params.BasicFormatParams;
26+
import opennlp.tools.util.InputStreamFactory;
27+
import opennlp.tools.util.ObjectStream;
28+
import opennlp.tools.util.PlainTextByLineStream;
2129

2230
/**
2331
* Base class for sample stream factories.
@@ -40,4 +48,44 @@ public String getLang() {
4048
public Class<P> getParameters() {
4149
return params;
4250
}
51+
52+
/**
53+
* Creates an {@link ObjectStream} for the specified arguments and
54+
* the generic type {@code P}.
55+
*
56+
* @param args A set of command line arguments.
57+
* @return The created {@link ObjectStream} instance.
58+
*/
59+
protected <P extends BasicFormatParams> ObjectStream<String> readData(String[] args,
60+
Class<P> parametersClass) {
61+
P params = validateBasicFormatParameters(args, parametersClass);
62+
ObjectStream<String> lineStream = null;
63+
try {
64+
InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
65+
lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
66+
} catch (IOException ex) {
67+
CmdLineUtil.handleCreateObjectStreamError(ex);
68+
}
69+
return lineStream;
70+
}
71+
72+
/**
73+
* Validates the specified arguments ({@code args}) given the
74+
* context of the generic type {@code P}, which provides at least all
75+
* {@link BasicFormatParams}.
76+
*
77+
* @implNote Additional checks for the basic {@code -data} argument are conducted, that is
78+
* whether the file exists or not.
79+
*
80+
* @param args A set of command line arguments.
81+
* @return The parsed (basic format) parameter instance.
82+
*/
83+
protected <P extends BasicFormatParams> P validateBasicFormatParameters(String[] args, Class<P> clazz) {
84+
if (args == null) {
85+
throw new IllegalArgumentException("Passed args must not be null!");
86+
}
87+
P params = ArgumentParser.parse(args, clazz);
88+
CmdLineUtil.checkInputFile("Data", params.getData());
89+
return params;
90+
}
4391
}

opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java

+17-21
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,24 @@
4545
* <p>
4646
* Data can be found on this
4747
* <a href="http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004">website</a>,
48-
* or in
49-
* <a href="https://github.com/spyysalo/jnlpba">this repository</a>.
48+
* or in this
49+
* <a href="https://github.com/spyysalo/jnlpba">GitHub repository</a>.
5050
* <p>
51-
* The BioNLP/NLPBA 2004 data were originally published here:
52-
* <p>
53-
* <a href="http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html">
54-
* http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html</a>,
51+
* The BioNLP/NLPBA 2004 data were originally published
52+
* <a href="http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html">here</a>,
5553
* <p>
5654
* yet this page was gone when last checked in December 2022.
5755
* <p>
58-
* It looks like this repo contains a copy of the data located on the original page:
59-
* The BioNLP 2004 seems to be related to http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004
60-
* <p>
6156
* <b>Note:</b>
6257
* Do not use this class, internal use only!
6358
*/
6459
@Internal
6560
public class BioNLP2004NameSampleStream implements ObjectStream<NameSample> {
6661

62+
private static final String CODEC_TAG_O = "O";
63+
private static final String CODEC_TAG_B = "B-";
64+
private static final String CODEC_TAG_I = "I-";
65+
6766
public static final int GENERATE_DNA_ENTITIES = 0x01;
6867
public static final int GENERATE_PROTEIN_ENTITIES = 0x01 << 1;
6968
public static final int GENERATE_CELLTYPE_ENTITIES = 0x01 << 2;
@@ -96,7 +95,6 @@ public NameSample read() throws IOException {
9695
boolean isClearAdaptiveData = false;
9796

9897
// Empty line indicates end of sentence
99-
10098
String line;
10199
while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line.trim())) {
102100

@@ -121,7 +119,7 @@ public NameSample read() throws IOException {
121119
}
122120
}
123121

124-
if (sentence.size() > 0) {
122+
if (!sentence.isEmpty()) {
125123

126124
// convert name tags into spans
127125
List<Span> names = new ArrayList<>();
@@ -133,34 +131,32 @@ public NameSample read() throws IOException {
133131
String tag = tags.get(i);
134132

135133
if (tag.endsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0)
136-
tag = "O";
134+
tag = CODEC_TAG_O;
137135

138136
if (tag.endsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0)
139-
tag = "O";
137+
tag = CODEC_TAG_O;
140138

141139
if (tag.endsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
142-
tag = "O";
140+
tag = CODEC_TAG_O;
143141

144142
if (tag.endsWith("cell_line") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
145-
tag = "O";
143+
tag = CODEC_TAG_O;
146144
if (tag.endsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0)
147-
tag = "O";
145+
tag = CODEC_TAG_O;
148146

149-
if (tag.startsWith("B-")) {
147+
if (tag.startsWith(CODEC_TAG_B)) {
150148

151149
if (beginIndex != -1) {
152150
names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
153-
beginIndex = -1;
154-
endIndex = -1;
155151
}
156152

157153
beginIndex = i;
158154
endIndex = i + 1;
159155
}
160-
else if (tag.startsWith("I-")) {
156+
else if (tag.startsWith(CODEC_TAG_I)) {
161157
endIndex++;
162158
}
163-
else if (tag.equals("O")) {
159+
else if (tag.equals(CODEC_TAG_O)) {
164160
if (beginIndex != -1) {
165161
names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
166162
beginIndex = -1;

opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java

+15-15
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
import java.io.IOException;
2121

22-
import opennlp.tools.cmdline.ArgumentParser;
2322
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
2423
import opennlp.tools.cmdline.CmdLineUtil;
2524
import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -30,55 +29,56 @@
3029
/**
3130
* @see BioNLP2004NameSampleStream
3231
*/
33-
public class BioNLP2004NameSampleStreamFactory<P> extends AbstractSampleStreamFactory<NameSample, P> {
32+
public class BioNLP2004NameSampleStreamFactory extends
33+
AbstractSampleStreamFactory<NameSample, BioNLP2004NameSampleStreamFactory.Parameters> {
3434

35-
interface Parameters extends BasicFormatParams {
35+
public interface Parameters extends BasicFormatParams {
3636
@ParameterDescription(valueName = "DNA,protein,cell_type,cell_line,RNA")
3737
String getTypes();
3838
}
3939

4040
public static void registerFactory() {
4141
StreamFactoryRegistry.registerFactory(NameSample.class,
42-
"bionlp2004", new BioNLP2004NameSampleStreamFactory<>(Parameters.class));
42+
"bionlp2004", new BioNLP2004NameSampleStreamFactory(Parameters.class));
4343
}
4444

45-
protected BioNLP2004NameSampleStreamFactory(Class<P> params) {
45+
protected BioNLP2004NameSampleStreamFactory(Class<Parameters> params) {
4646
super(params);
4747
}
4848

4949
@Override
5050
public ObjectStream<NameSample> create(String[] args) {
51-
52-
Parameters params = ArgumentParser.parse(args, Parameters.class);
51+
Parameters params = validateBasicFormatParameters(args, Parameters.class);
5352

5453
int typesToGenerate = 0;
55-
56-
if (params.getTypes().contains("DNA")) {
54+
String types = params.getTypes();
55+
if (types.contains("DNA")) {
5756
typesToGenerate = typesToGenerate |
5857
BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES;
5958
}
60-
else if (params.getTypes().contains("protein")) {
59+
if (types.contains("protein")) {
6160
typesToGenerate = typesToGenerate |
6261
BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES;
6362
}
64-
else if (params.getTypes().contains("cell_type")) {
63+
if (types.contains("cell_type")) {
6564
typesToGenerate = typesToGenerate |
6665
BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES;
6766
}
68-
else if (params.getTypes().contains("cell_line")) {
67+
if (types.contains("cell_line")) {
6968
typesToGenerate = typesToGenerate |
7069
BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES;
7170
}
72-
else if (params.getTypes().contains("RNA")) {
71+
if (types.contains("RNA")) {
7372
typesToGenerate = typesToGenerate |
7473
BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES;
7574
}
7675

7776
try {
7877
return new BioNLP2004NameSampleStream(
7978
CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate);
80-
} catch (IOException e) {
81-
throw new IllegalStateException(e);
79+
} catch (IOException ex) {
80+
CmdLineUtil.handleCreateObjectStreamError(ex);
8281
}
82+
return null;
8383
}
8484
}

opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java

+6-22
Original file line numberDiff line numberDiff line change
@@ -17,49 +17,33 @@
1717

1818
package opennlp.tools.formats;
1919

20-
import java.io.IOException;
21-
2220
import opennlp.tools.chunker.ChunkSample;
2321
import opennlp.tools.chunker.ChunkSampleStream;
24-
import opennlp.tools.cmdline.ArgumentParser;
25-
import opennlp.tools.cmdline.CmdLineUtil;
2622
import opennlp.tools.cmdline.StreamFactoryRegistry;
2723
import opennlp.tools.cmdline.params.BasicFormatParams;
28-
import opennlp.tools.util.InputStreamFactory;
2924
import opennlp.tools.util.ObjectStream;
30-
import opennlp.tools.util.PlainTextByLineStream;
3125

3226
/**
3327
* Factory producing OpenNLP {@link ChunkSampleStream}s.
3428
*/
35-
public class ChunkerSampleStreamFactory<P> extends AbstractSampleStreamFactory<ChunkSample, P> {
29+
public class ChunkerSampleStreamFactory extends
30+
AbstractSampleStreamFactory<ChunkSample, ChunkerSampleStreamFactory.Parameters> {
3631

37-
interface Parameters extends BasicFormatParams {
32+
public interface Parameters extends BasicFormatParams {
3833
}
3934

4035
public static void registerFactory() {
4136
StreamFactoryRegistry.registerFactory(ChunkSample.class,
42-
StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory<>(Parameters.class));
37+
StreamFactoryRegistry.DEFAULT_FORMAT, new ChunkerSampleStreamFactory(Parameters.class));
4338
}
4439

45-
protected ChunkerSampleStreamFactory(Class<P> params) {
40+
protected ChunkerSampleStreamFactory(Class<Parameters> params) {
4641
super(params);
4742
}
4843

4944
@Override
5045
public ObjectStream<ChunkSample> create(String[] args) {
51-
Parameters params = ArgumentParser.parse(args, Parameters.class);
52-
53-
CmdLineUtil.checkInputFile("Data", params.getData());
54-
InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
55-
ObjectStream<String> lineStream = null;
56-
try {
57-
lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
58-
59-
} catch (IOException ex) {
60-
CmdLineUtil.handleCreateObjectStreamError(ex);
61-
}
62-
46+
ObjectStream<String> lineStream = readData(args, Parameters.class);
6347
return new ChunkSampleStream(lineStream);
6448
}
6549
}

opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java

+6-7
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
import java.io.IOException;
2121

22-
import opennlp.tools.cmdline.ArgumentParser;
2322
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
2423
import opennlp.tools.cmdline.CmdLineUtil;
2524
import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -37,9 +36,10 @@
3736
* @see Conll02NameSampleStream
3837
*/
3938
@Internal
40-
public class Conll02NameSampleStreamFactory<P> extends LanguageSampleStreamFactory<NameSample, P> {
39+
public class Conll02NameSampleStreamFactory extends
40+
LanguageSampleStreamFactory<NameSample, Conll02NameSampleStreamFactory.Parameters> {
4141

42-
interface Parameters extends BasicFormatParams {
42+
public interface Parameters extends BasicFormatParams {
4343
@ParameterDescription(valueName = "spa|nld")
4444
String getLang();
4545

@@ -49,17 +49,17 @@ interface Parameters extends BasicFormatParams {
4949

5050
public static void registerFactory() {
5151
StreamFactoryRegistry.registerFactory(NameSample.class,
52-
"conll02", new Conll02NameSampleStreamFactory<>(Parameters.class));
52+
"conll02", new Conll02NameSampleStreamFactory(Parameters.class));
5353
}
5454

55-
protected Conll02NameSampleStreamFactory(Class<P> params) {
55+
protected Conll02NameSampleStreamFactory(Class<Parameters> params) {
5656
super(params);
5757
}
5858

5959
@Override
6060
public ObjectStream<NameSample> create(String[] args) {
6161

62-
Parameters params = ArgumentParser.parse(args, Parameters.class);
62+
Parameters params = validateBasicFormatParameters(args, Parameters.class);
6363

6464
LANGUAGE lang;
6565
if ("nl".equals(params.getLang()) || "nld".equals(params.getLang())) {
@@ -93,7 +93,6 @@ else if ("es".equals(params.getLang()) || "spa".equals(params.getLang())) {
9393
Conll02NameSampleStream.GENERATE_MISC_ENTITIES;
9494
}
9595

96-
9796
try {
9897
return new Conll02NameSampleStream(lang,
9998
CmdLineUtil.createInputStreamFactory(params.getData()), typesToGenerate);

0 commit comments

Comments
 (0)