Skip to content

Commit

Permalink
OPENNLP-1369 - NPE when serializing a TokenNameFinder model trained w…
Browse files Browse the repository at this point in the history
…ith POSTaggerNameFeatureGeneratorFactory (#571)

* OPENNLP-1369 NPE when serializing a TokenNameFinder model trained with POSTaggerNameFeatureGeneratorFactory
- adds first reproducer to check if 1369 is broken => yes it is
- confirms basic workaround is curing the issue, however this is not pretty.

* OPENNLP-1369 - Fixes NPE when serializing TokenNameFinder model trained with 1.5 Pos Models

Adds a test-case to reproduce the NPE reported in OPENNLP-1369

* OPENNLP-1369 - Checkstyle fixes

---------

Co-authored-by: Martin Wiesner <[email protected]>
  • Loading branch information
rzo1 and mawiesne authored Dec 22, 2023
1 parent b74c6df commit 186ecf9
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 6 deletions.
12 changes: 9 additions & 3 deletions opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,12 @@ protected void validateArtifactMap() throws InvalidFormatException {
}
}

@Override
protected boolean skipEntryForSerialization(Map.Entry<String, Object> entry) {
// An old model format was detected, skipping the process for this entry, see: OPENNLP-1369
return GENERATOR_DESCRIPTOR_ENTRY_NAME.equals(entry.getKey()) && entry.getValue() == null;
}

/**
* @deprecated use {@link POSModel#getPosSequenceModel} instead. This method will be removed soon.
* Only required for Parser 1.5.x backward compatibility. Newer models don't need this anymore.
Expand Down Expand Up @@ -232,7 +238,7 @@ public Class<POSModelSerializer> getArtifactSerializerClass() {
@Override
public int hashCode() {
return Objects.hash(artifactMap.get("manifest.properties"), artifactMap.get("pos.model"),
Arrays.hashCode((byte[]) artifactMap.get("generator.featuregen"))
Arrays.hashCode((byte[]) artifactMap.get(GENERATOR_DESCRIPTOR_ENTRY_NAME))
);
}

Expand All @@ -248,8 +254,8 @@ public boolean equals(Object obj) {

return artifactMap.get("manifest.properties").equals(artifactMapToCheck.get("manifest.properties")) &&
artifactMap.get("pos.model").equals(abstractModel) &&
Arrays.equals((byte[]) artifactMap.get("generator.featuregen"),
(byte[]) artifactMapToCheck.get("generator.featuregen"));
Arrays.equals((byte[]) artifactMap.get(GENERATOR_DESCRIPTOR_ENTRY_NAME),
(byte[]) artifactMapToCheck.get(GENERATOR_DESCRIPTOR_ENTRY_NAME));
}
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,10 @@ public final void serialize(OutputStream out) throws IOException {

Object artifact = entry.getValue();

if (skipEntryForSerialization(entry)) {
continue;
}

ArtifactSerializer serializer = getArtifactSerializer(name);

// If model is serialize-able always use the provided serializer
Expand Down Expand Up @@ -684,4 +688,12 @@ private void readObject(final ObjectInputStream in) throws IOException {

this.loadModel(in);
}

/**
* @param entry the entry to check
* @return {@code true}, if the given entry should be skipped for serialization.
*/
protected boolean skipEntryForSerialization(Entry<String, Object> entry) {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import org.junit.jupiter.api.extension.ExecutionCondition;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.extension.ExtensionContext;
import org.junit.jupiter.params.ParameterizedTest;

import static org.junit.platform.commons.util.AnnotationUtils.findAnnotation;

Expand All @@ -36,7 +35,6 @@
*/
@Retention(RetentionPolicy.RUNTIME)
@ExtendWith(EnabledWhenCDNAvailable.CDNAvailableCondition.class)
@ParameterizedTest
public @interface EnabledWhenCDNAvailable {

String hostname();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,20 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Map;
import java.util.stream.Collectors;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import opennlp.tools.EnabledWhenCDNAvailable;
import opennlp.tools.cmdline.AbstractModelLoaderTest;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
import opennlp.tools.postag.POSModel;
Expand All @@ -43,7 +48,7 @@
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelType;

public class TokenNameFinderModelTest {
public class TokenNameFinderModelTest extends AbstractModelLoaderTest {

@Test
void testNERWithPOSModel() throws IOException {
Expand Down Expand Up @@ -104,4 +109,80 @@ void testNERWithPOSModel() throws IOException {
FileUtil.deleteDirectory(resourcesFolder.toFile());
}
}

/*
* OPENNLP-1369
*/
@EnabledWhenCDNAvailable(hostname = "opennlp.sourceforge.net")
@Test
void testNERWithPOSModelV15() throws IOException, URISyntaxException {

// 0. Download model from sourceforge and place at the right location
final String modelName = "pt-pos-perceptron.bin";

downloadVersion15Model(modelName);

final Path model = OPENNLP_DIR.resolve(modelName);
final Path resourcesFolder = Files.createTempDirectory("resources").toAbsolutePath();

Assertions.assertNotNull(model);
Assertions.assertNotNull(resourcesFolder);

// 1. Copy the downloaded model to the temporary resource folder, so it can be referenced from
// the feature gen xml file.

final Path copy = resourcesFolder.resolve(modelName);

Files.copy(OPENNLP_DIR.resolve(modelName), copy, StandardCopyOption.REPLACE_EXISTING);

Assertions.assertTrue(copy.toFile().exists());

// 2. Load feature generator xml bytes
final URL featureGeneratorXmlUrl = this.getClass().getResource("ner-pos-features-v15.xml");
Assertions.assertNotNull(featureGeneratorXmlUrl);

final Path featureGeneratorXmlPath = Path.of(featureGeneratorXmlUrl.toURI());
Assertions.assertNotNull(featureGeneratorXmlPath);

final Path featureGenerator = Files.createTempFile("ner-featuregen-v15", ".xml");
Assertions.assertNotNull(featureGenerator);

Files.copy(featureGeneratorXmlPath, featureGenerator, StandardCopyOption.REPLACE_EXISTING);
Assertions.assertTrue(featureGenerator.toFile().exists());

Map<String, Object> resources;
try {
resources = TokenNameFinderTrainerTool.loadResources(resourcesFolder.toFile(),
featureGenerator.toAbsolutePath().toFile());
} catch (IOException e) {
throw new TerminateToolException(-1, e.getMessage(), e);
} finally {
Files.delete(featureGenerator);
}


// train a name finder
ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
new PlainTextByLineStream(new MockInputStreamFactory(
new File("opennlp/tools/namefind/voa1.train")), StandardCharsets.UTF_8));

TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, 70);
params.put(TrainingParameters.CUTOFF_PARAM, 1);

TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
params, TokenNameFinderFactory.create(null,
Files.readString(featureGeneratorXmlPath, StandardCharsets.UTF_8)
.getBytes(StandardCharsets.UTF_8), resources, new BioCodec()));


File nerModel = Files.createTempFile("nermodel", ".bin").toFile();
try (FileOutputStream modelOut = new FileOutputStream(nerModel)) {
nameFinderModel.serialize(modelOut);
Assertions.assertTrue(nerModel.exists());
} finally {
Assertions.assertTrue(nerModel.delete());
FileUtil.deleteDirectory(resourcesFolder.toFile());
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<featureGenerators cache="true" name="nameFinder">
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
<int name="prevLength">2</int>
<int name="nextLength">2</int>
<generator class="opennlp.tools.util.featuregen.TokenClassFeatureGeneratorFactory"/>
</generator>
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
<int name="prevLength">2</int>
<int name="nextLength">2</int>
<generator class="opennlp.tools.util.featuregen.TokenFeatureGeneratorFactory"/>
</generator>
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
<int name="prevLength">2</int>
<int name="nextLength">2</int>
<generator class="opennlp.tools.util.featuregen.POSTaggerNameFeatureGeneratorFactory">
<str name="model">pt-pos-perceptron.bin</str>
</generator>
</generator>
<generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/>
<generator class="opennlp.tools.util.featuregen.DefinitionFeatureGeneratorFactory"/>
<generator class="opennlp.tools.util.featuregen.BigramNameFeatureGeneratorFactory"/>
<generator class="opennlp.tools.util.featuregen.SentenceFeatureGeneratorFactory">
<bool name="begin">true</bool>
<bool name="end">false</bool>
</generator>
</featureGenerators>

0 comments on commit 186ecf9

Please sign in to comment.