Skip to content

Commit 889e68c

Browse files
authored
OPENNLP-1526 Add Spanish abbreviation dictionary (#566)
- moves abbreviation dictionaries to a common location: "tools/lang", independent of 'sentdetect' use cases, test scope accordingly
- adds abb_ES.xml to opennlp-tools/lang
- adds new test cases for the ES localization
- adjusts and enhances existing test cases for new dictionary locations
1 parent 9b2d184 commit 889e68c

File tree

13 files changed

+540
-16
lines changed

13 files changed

+540
-16
lines changed

opennlp-tools/lang/es/abb_ES.xml

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
3+
<!--
4+
Licensed to the Apache Software Foundation (ASF) under one
5+
or more contributor license agreements. See the NOTICE file
6+
distributed with this work for additional information
7+
regarding copyright ownership. The ASF licenses this file
8+
to you under the Apache License, Version 2.0 (the
9+
"License"); you may not use this file except in compliance
10+
with the License. You may obtain a copy of the License at
11+
12+
http://www.apache.org/licenses/LICENSE-2.0
13+
14+
Unless required by applicable law or agreed to in writing,
15+
software distributed under the License is distributed on an
16+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17+
KIND, either express or implied. See the License for the
18+
specific language governing permissions and limitations
19+
under the License.
20+
-->
21+
22+
<dictionary case_sensitive="false">
23+
<entry>
24+
<token>a.C.</token>
25+
</entry>
26+
<entry>
27+
<token>a. de C.</token>
28+
</entry>
29+
<entry>
30+
<token>a.J.C.</token>
31+
</entry>
32+
<entry>
33+
<token>a. de J.C.</token>
34+
</entry>
35+
<entry>
36+
<token>a. m.</token>
37+
</entry>
38+
<entry>
39+
<token>apdo.</token>
40+
</entry>
41+
<entry>
42+
<token>apdo.</token>
43+
</entry>
44+
<entry>
45+
<token>aprox.</token>
46+
</entry>
47+
<entry>
48+
<token>Av.</token>
49+
</entry>
50+
<entry>
51+
<token>Avda.</token>
52+
</entry>
53+
<entry>
54+
<token>Bs. As.</token>
55+
</entry>
56+
<entry>
57+
<token>c.c.</token>
58+
</entry>
59+
<entry>
60+
<token>cap.</token>
61+
</entry>
62+
<entry>
63+
<token>D.</token>
64+
</entry>
65+
<entry>
66+
<token>Da.</token>
67+
</entry>
68+
<entry>
69+
<token>Dña.</token>
70+
</entry>
71+
<entry>
72+
<token>d.C.</token>
73+
</entry>
74+
<entry>
75+
<token>d. de C.</token>
76+
</entry>
77+
<entry>
78+
<token>d.J.C.</token>
79+
</entry>
80+
<entry>
81+
<token>d. de J.C.</token>
82+
</entry>
83+
<entry>
84+
<token>dna.</token>
85+
</entry>
86+
<entry>
87+
<token>EE. UU.</token>
88+
</entry>
89+
<entry>
90+
<token>etc.</token>
91+
</entry>
92+
<entry>
93+
<token>f.c.</token>
94+
</entry>
95+
<entry>
96+
<token>F.C.</token>
97+
</entry>
98+
<entry>
99+
<token>FF. AA.</token>
100+
</entry>
101+
<entry>
102+
<token>Dr.</token>
103+
</entry>
104+
<entry>
105+
<token>Dra.</token>
106+
</entry>
107+
<entry>
108+
<token>Gob.</token>
109+
</entry>
110+
<entry>
111+
<token>Lic.</token>
112+
</entry>
113+
<entry>
114+
<token>Ing.</token>
115+
</entry>
116+
<entry>
117+
<token>Pdte.</token>
118+
</entry>
119+
<entry>
120+
<token>Pdta.</token>
121+
</entry>
122+
<entry>
123+
<token>p.</token>
124+
</entry>
125+
<entry>
126+
<token>pág.</token>
127+
</entry>
128+
<entry>
129+
<token>n.°</token>
130+
</entry>
131+
<entry>
132+
<token>no.</token>
133+
</entry>
134+
<entry>
135+
<token>núm.</token>
136+
</entry>
137+
<entry>
138+
<token>p.ej.</token>
139+
</entry>
140+
<entry>
141+
<token>p. m.</token>
142+
</entry>
143+
<entry>
144+
<token>Prof.</token>
145+
</entry>
146+
<entry>
147+
<token>Profa.</token>
148+
</entry>
149+
<entry>
150+
<token>q.e.p.d.</token>
151+
</entry>
152+
<entry>
153+
<token>S.A.</token>
154+
</entry>
155+
<entry>
156+
<token>S.L.</token>
157+
</entry>
158+
<entry>
159+
<token>Sr.</token>
160+
</entry>
161+
<entry>
162+
<token>Sra.</token>
163+
</entry>
164+
<entry>
165+
<token>Srta.</token>
166+
</entry>
167+
<entry>
168+
<token>Ud.</token>
169+
</entry>
170+
<entry>
171+
<token>Vd.</token>
172+
</entry>
173+
<entry>
174+
<token>Uds.</token>
175+
</entry>
176+
<entry>
177+
<token>Vds.</token>
178+
</entry>
179+
<entry>
180+
<token>vol.</token>
181+
</entry>
182+
<entry>
183+
<token>v.</token>
184+
</entry>
185+
<entry>
186+
<token>lu.</token>
187+
</entry>
188+
<entry>
189+
<token>ma.</token>
190+
</entry>
191+
<entry>
192+
<token>mi.</token>
193+
</entry>
194+
<entry>
195+
<token>ju.</token>
196+
</entry>
197+
<entry>
198+
<token>vi.</token>
199+
</entry>
200+
<entry>
201+
<token>sá.</token>
202+
</entry>
203+
<entry>
204+
<token>do.</token>
205+
</entry>
206+
<entry>
207+
<token>en.</token>
208+
</entry>
209+
<entry>
210+
<token>febr.</token>
211+
</entry>
212+
<entry>
213+
<token>mzo.</token>
214+
</entry>
215+
<entry>
216+
<token>abr.</token>
217+
</entry>
218+
<entry>
219+
<token>my.</token>
220+
</entry>
221+
<entry>
222+
<token>jun.</token>
223+
</entry>
224+
<entry>
225+
<token>jul.</token>
226+
</entry>
227+
<entry>
228+
<token>ag.</token>
229+
</entry>
230+
<entry>
231+
<token>agt.</token>
232+
</entry>
233+
<entry>
234+
<token>set.</token>
235+
</entry>
236+
<entry>
237+
<token>sept.</token>
238+
</entry>
239+
<entry>
240+
<token>oct.</token>
241+
</entry>
242+
<entry>
243+
<token>nov.</token>
244+
</entry>
245+
<entry>
246+
<token>novbre.</token>
247+
</entry>
248+
<entry>
249+
<token>dic.</token>
250+
</entry>
251+
<entry>
252+
<token>dicbre.</token>
253+
</entry>
254+
</dictionary>

opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ public void testGetShortDescription() {
6363

6464
@Test
6565
public void testLoadDictHappyCase() throws IOException {
66-
File dictFile = new File("lang/ga/sentdetect/abb.xml");
66+
File dictFile = new File("lang/ga/abb_GA.xml");
6767
Dictionary dict = TokenizerTrainerTool.loadDict(dictFile);
6868
Assertions.assertNotNull(dict);
6969
}

opennlp-tools/src/test/java/opennlp/tools/sentdetect/AbstractSentenceDetectorTest.java

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,15 @@
2929
import opennlp.tools.util.TrainingParameters;
3030

3131
public abstract class AbstractSentenceDetectorTest {
32-
32+
33+
protected static final Locale LOCALE_SPANISH = new Locale("es");
34+
3335
static ObjectStream<SentenceSample> createSampleStream(Locale loc) throws IOException {
3436
final String trainingResource;
3537
if (loc.equals(Locale.GERMAN)) {
3638
trainingResource = "/opennlp/tools/sentdetect/Sentences_DE.txt";
39+
} else if (loc.equals(LOCALE_SPANISH)) {
40+
trainingResource = "/opennlp/tools/sentdetect/Sentences_ES.txt";
3741
} else {
3842
trainingResource = "/opennlp/tools/sentdetect/Sentences.txt";
3943
}
@@ -43,22 +47,26 @@ static ObjectStream<SentenceSample> createSampleStream(Locale loc) throws IOExce
4347
}
4448

4549
static SentenceModel train(SentenceDetectorFactory factory, Locale loc) throws IOException {
46-
final String languageCode;
50+
final String lang;
4751
if (loc.equals(Locale.GERMAN)) {
48-
languageCode = "deu";
52+
lang = "deu";
53+
} else if (loc.equals(LOCALE_SPANISH)) {
54+
lang = "spa";
4955
} else {
50-
languageCode = "eng";
56+
lang = "eng";
5157
}
52-
return SentenceDetectorME.train(languageCode, createSampleStream(loc), factory,
58+
return SentenceDetectorME.train(lang, createSampleStream(loc), factory,
5359
TrainingParameters.defaultParams());
5460
}
5561

5662
static Dictionary loadAbbDictionary(Locale loc) throws IOException {
5763
final String abbrevDict;
5864
if (loc.equals(Locale.GERMAN)) {
59-
abbrevDict = "opennlp/tools/sentdetect/abb_DE.xml";
65+
abbrevDict = "opennlp/tools/lang/abb_DE.xml";
66+
} else if (loc.equals(LOCALE_SPANISH)) {
67+
abbrevDict = "opennlp/tools/lang/abb_ES.xml";
6068
} else {
61-
abbrevDict = "opennlp/tools/sentdetect/abb.xml";
69+
abbrevDict = "opennlp/tools/lang/abb_EN.xml";
6270
}
6371
return new Dictionary(AbstractSentenceDetectorTest.class.getClassLoader()
6472
.getResourceAsStream(abbrevDict));

opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
package opennlp.tools.sentdetect;
1919

20-
2120
import java.io.IOException;
2221
import java.util.Locale;
2322

@@ -33,8 +32,8 @@
3332
* Verifies OPENNLP-793 in combination with OPENNLP-570.
3433
* <p>
3534
* In this context, well-known German (de_DE) abbreviations must be respected,
36-
* so that non-sentence breaks (words abbreviated with one or more '.' characters)
37-
* result in incorrect sentence boundaries .
35+
* so that words abbreviated with one or more '.' characters do not
36+
* result in incorrect sentence boundaries.
3837
* <p>
3938
* See:
4039
* <a href="https://issues.apache.org/jira/projects/OPENNLP/issues/OPENNLP-793">OPENNLP-793</a>
@@ -65,8 +64,8 @@ void testSentDetectWithInlineAbbreviationsEx1() {
6564
final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
6665

6766
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
68-
String sampleSentences1 = sent1 + " " + sent2;
69-
String[] sents = sentDetect.sentDetect(sampleSentences1);
67+
String sampleSentences = sent1 + " " + sent2;
68+
String[] sents = sentDetect.sentDetect(sampleSentences);
7069
Assertions.assertEquals(2, sents.length);
7170
Assertions.assertEquals(sent1, sents[0]);
7271
Assertions.assertEquals(sent2, sents[1]);

0 commit comments

Comments
 (0)