Skip to content

Commit 9b2d184

Browse files
committed
OPENNLP-1523 - Use the snowball-data set to write language-specific stemmer eval tests
1 parent 7d2722e commit 9b2d184

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package opennlp.tools.eval;
18+
19+
import java.io.FileNotFoundException;
20+
import java.io.IOException;
21+
import java.nio.file.Files;
22+
import java.nio.file.Path;
23+
import java.util.List;
24+
import java.util.Locale;
25+
26+
import org.junit.jupiter.params.ParameterizedTest;
27+
import org.junit.jupiter.params.provider.EnumSource;
28+
29+
import opennlp.tools.stemmer.snowball.SnowballStemmer;
30+
31+
import static org.junit.jupiter.api.Assertions.assertEquals;
32+
33+
/**
34+
* Eval tests for the {@link SnowballStemmer} class.
35+
* <p>
36+
* Uses the data set provided by <a href="https://github.com/snowballstem/snowball-data"></a>
37+
* to test all languages available in OpenNLP.
38+
* <p>
39+
*/
40+
public class SnowballTokenizerEval extends AbstractEvalTest {
41+
42+
@ParameterizedTest
43+
@EnumSource(SnowballStemmer.ALGORITHM.class)
44+
public void test(SnowballStemmer.ALGORITHM lang) throws IOException {
45+
46+
final List<String> vocabulary = getData(lang, "voc.txt");
47+
final List<String> expectedOutputs = getData(lang, "output.txt");
48+
49+
assertEquals(vocabulary.size(), expectedOutputs.size(), "Expected equally sized lists.");
50+
final SnowballStemmer stemmer = new SnowballStemmer(lang);
51+
52+
for (int i = 0; i < vocabulary.size(); i++) {
53+
54+
final String word = vocabulary.get(i);
55+
final String stem = expectedOutputs.get(i);
56+
57+
assertEquals(stem, stemmer.stem(word));
58+
}
59+
}
60+
61+
private List<String> getData(SnowballStemmer.ALGORITHM lang, String name) throws IOException {
62+
final Path expectedOutput = getSnowballDataLanguagePath(
63+
getSnowballDataPath(), lang).resolve(name);
64+
return Files.readAllLines(expectedOutput);
65+
}
66+
67+
private Path getSnowballDataPath() throws FileNotFoundException {
68+
return getOpennlpDataDir().toPath().resolve("snowball-data");
69+
}
70+
71+
private Path getSnowballDataLanguagePath(Path root, SnowballStemmer.ALGORITHM lang) {
72+
return root.resolve(lang.toString().toLowerCase(Locale.ROOT));
73+
}
74+
75+
}

0 commit comments

Comments
 (0)