Skip to content

Commit 5b846a3

Browse files
authored
OPENNLP-1679: Extend JavaDoc of SgmlParser (#719)
1 parent 49678c3 commit 5b846a3

File tree

3 files changed

+75
-36
lines changed

3 files changed

+75
-36
lines changed

opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ else if (startDocElement != endDocElement) {
6969
}
7070
}
7171

72-
if (docs.size() > 0) {
72+
if (!docs.isEmpty()) {
7373
return docs.remove(0);
7474
}
7575
else {

opennlp-tools/src/main/java/opennlp/tools/formats/muc/SgmlParser.java

+59-33
Original file line numberDiff line numberDiff line change
@@ -26,38 +26,61 @@
2626
import opennlp.tools.util.StringUtil;
2727

2828
/**
29-
* SAX style SGML parser.
30-
* <p>
31-
* Note:<br>
32-
* The implementation is very limited, but good enough to
33-
* parse the MUC corpora. Its must very likely be extended/improved/fixed to parse
34-
* a different SGML corpora.
29+
* A SAX style <a href="https://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">SGML</a> parser.
30+
*
31+
* @implNote The implementation is very limited, but good enough to parse the
32+
* <a href="https://catalog.ldc.upenn.edu/LDC2003T13">MUC corpora</a>.
33+
* It must very likely be extended/improved/fixed to parse different SGML corpora.
3534
*/
3635
public class SgmlParser {
3736

38-
public static abstract class ContentHandler {
39-
40-
public void startElement(String name, Map<String, String> attributes) throws InvalidFormatException {
41-
}
37+
private static final char SYMBOL_CLOSE = '>';
38+
private static final char SYMBOL_OPEN = '<';
39+
private static final char SYMBOL_SLASH = '/';
40+
private static final char SYMBOL_EQUALS = '=';
41+
private static final char SYMBOL_QUOT = '"';
4242

43-
public void characters(CharSequence chars) throws InvalidFormatException{
44-
}
43+
/**
44+
* Defines methods to handle content produced by a {@link SgmlParser}.
45+
* A concrete implementation interprets the document specific details.
46+
*/
47+
public static abstract class ContentHandler {
4548

46-
public void endElement(String name) throws InvalidFormatException {
47-
}
49+
/**
50+
* Handles an SGML start element.
51+
*
52+
* @param name The name of the element's start tag.
53+
* @param attributes The attributes supplied with the start tag. It may be empty.
54+
* @throws InvalidFormatException Thrown if parameters were invalid.
55+
*/
56+
public abstract void startElement(String name, Map<String, String> attributes)
57+
throws InvalidFormatException;
58+
59+
/**
60+
* Handles a set of characters between SGML start and end tag.
61+
*
62+
* @param chars The characters to process.
63+
* @throws InvalidFormatException Thrown if parameters were invalid.
64+
*/
65+
public abstract void characters(CharSequence chars)
66+
throws InvalidFormatException;
67+
68+
/**
69+
* Handles an SGML end element.
70+
* @param name The name of the element's end tag.
71+
*/
72+
public abstract void endElement(String name);
4873
}
4974

5075
private static String extractTagName(CharSequence tagChars) throws InvalidFormatException {
5176

5277
int fromOffset = 1;
53-
54-
if (tagChars.length() > 1 && tagChars.charAt(1) == '/') {
78+
if (tagChars.length() > 1 && tagChars.charAt(1) == SYMBOL_SLASH) {
5579
fromOffset = 2;
5680
}
5781

5882
for (int ci = 1; ci < tagChars.length(); ci++) {
59-
60-
if (tagChars.charAt(ci) == '>' || StringUtil.isWhitespace(tagChars.charAt(ci))) {
83+
if (tagChars.charAt(ci) == SYMBOL_CLOSE || StringUtil.isWhitespace(tagChars.charAt(ci))) {
6184
return tagChars.subSequence(fromOffset, ci).toString();
6285
}
6386
}
@@ -90,15 +113,16 @@ private static Map<String, String> getAttributes(CharSequence tagChars) {
90113
extractKey = true;
91114
}
92115
// Equals sign or whitespace indicates end of key name
93-
else if (extractKey && ('=' == tagChars.charAt(i) || StringUtil.isWhitespace(tagChars.charAt(i)))) {
116+
else if (extractKey && (SYMBOL_EQUALS == tagChars.charAt(i) ||
117+
StringUtil.isWhitespace(tagChars.charAt(i)))) {
94118
extractKey = false;
95119
}
96120
// Inside key name, extract all chars
97121
else if (extractKey) {
98122
key.append(tagChars.charAt(i));
99123
}
100124
// " Indicates begin or end of value chars
101-
else if ('"' == tagChars.charAt(i)) {
125+
else if (SYMBOL_QUOT == tagChars.charAt(i)) {
102126

103127
if (extractValue) {
104128
attributes.put(key.toString(), value.toString());
@@ -107,7 +131,6 @@ else if ('"' == tagChars.charAt(i)) {
107131
key.setLength(0);
108132
value.setLength(0);
109133
}
110-
111134
extractValue = !extractValue;
112135
}
113136
// Inside value, extract all chars
@@ -119,6 +142,17 @@ else if (extractValue) {
119142
return attributes;
120143
}
121144

145+
/**
146+
* Parses an SGML document available via the input in {@link Reader}.
147+
* The specified {@link ContentHandler} is responsible for interpreting the document
148+
* specific details.
149+
*
150+
* @param in A {@link Reader} that provides the data of the SGML document.
151+
* @param handler The {@link ContentHandler} to interpret the document with.
152+
*
153+
* @throws IOException Thrown if IO errors occurred.
154+
* @throws InvalidFormatException Thrown if the document is malformed, e.g. an unexpected {@code <} or {@code >} character is encountered.
155+
*/
122156
public void parse(Reader in, ContentHandler handler) throws IOException {
123157

124158
StringBuilder buffer = new StringBuilder();
@@ -130,45 +164,37 @@ public void parse(Reader in, ContentHandler handler) throws IOException {
130164
int c;
131165
while ((c = in.read()) != -1) {
132166

133-
if ('<' == c) {
167+
if (SYMBOL_OPEN == c) {
134168
if (isInsideTag) {
135169
throw new InvalidFormatException("Did not expect < char!");
136170
}
137-
138-
if (buffer.toString().trim().length() > 0) {
171+
if (!buffer.toString().trim().isEmpty()) {
139172
handler.characters(buffer.toString().trim());
140173
}
141-
142174
buffer.setLength(0);
143-
144175
isInsideTag = true;
145176
isStartTag = true;
146177
}
147-
148178
buffer.appendCodePoint(c);
149179

150-
if ('/' == c && lastChar == '<') {
180+
if (SYMBOL_SLASH == c && lastChar == SYMBOL_OPEN) {
151181
isStartTag = false;
152182
}
153183

154-
if ('>' == c) {
184+
if (SYMBOL_CLOSE == c) {
155185

156186
if (!isInsideTag) {
157187
throw new InvalidFormatException("Did not expect > char!");
158188
}
159-
160189
if (isStartTag) {
161190
handler.startElement(extractTagName(buffer), getAttributes(buffer));
162191
}
163192
else {
164193
handler.endElement(extractTagName(buffer));
165194
}
166-
167195
buffer.setLength(0);
168-
169196
isInsideTag = false;
170197
}
171-
172198
lastChar = c;
173199
}
174200

opennlp-tools/src/test/java/opennlp/tools/formats/muc/SgmlParserTest.java

+15-2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.io.InputStreamReader;
2222
import java.io.Reader;
2323
import java.nio.charset.StandardCharsets;
24+
import java.util.Map;
2425

2526
import org.junit.jupiter.api.Test;
2627

@@ -34,9 +35,21 @@ void testParse1() throws IOException {
3435
try (Reader in = new InputStreamReader(getResourceStream("muc/parsertest1.sgml"),
3536
StandardCharsets.UTF_8)) {
3637
SgmlParser parser = new SgmlParser();
37-
parser.parse(in, new SgmlParser.ContentHandler() {
38-
});
38+
parser.parse(in, new DummyContentHandler());
3939
}
4040
}
4141

42+
private static class DummyContentHandler extends SgmlParser.ContentHandler {
43+
@Override
44+
public void startElement(String name, Map<String, String> attributes) {
45+
}
46+
47+
@Override
48+
public void characters(CharSequence chars) {
49+
}
50+
51+
@Override
52+
public void endElement(String name) {
53+
}
54+
}
4255
}

0 commit comments

Comments
 (0)