26
26
import opennlp .tools .util .StringUtil ;
27
27
28
28
/**
29
- * SAX style SGML parser.
30
- * <p>
31
- * Note:<br>
32
- * The implementation is very limited, but good enough to
33
- * parse the MUC corpora. Its must very likely be extended/improved/fixed to parse
34
- * a different SGML corpora.
29
+ * A SAX style <a href="https://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">SGML</a> parser.
30
+ *
31
+ * @implNote The implementation is very limited, but good enough to parse the
32
+ * <a href="https://catalog.ldc.upenn.edu/LDC2003T13">MUC corpora</a>.
33
+ * It must very likely be extended/improved/fixed to parse a different SGML corpus.
35
34
*/
36
35
public class SgmlParser {
37
36
38
- public static abstract class ContentHandler {
39
-
40
- public void startElement (String name , Map <String , String > attributes ) throws InvalidFormatException {
41
- }
37
+ private static final char SYMBOL_CLOSE = '>' ;
38
+ private static final char SYMBOL_OPEN = '<' ;
39
+ private static final char SYMBOL_SLASH = '/' ;
40
+ private static final char SYMBOL_EQUALS = '=' ;
41
+ private static final char SYMBOL_QUOT = '"' ;
42
42
43
- public void characters (CharSequence chars ) throws InvalidFormatException {
44
- }
43
+ /**
44
+ * Defines methods to handle content produced by a {@link SgmlParser}.
45
+ * A concrete implementation interprets the document specific details.
46
+ */
47
+ public static abstract class ContentHandler {
45
48
46
- public void endElement (String name ) throws InvalidFormatException {
47
- }
49
+ /**
50
+ * Handles an SGML start element.
51
+ *
52
+ * @param name The name of the element's start tag.
53
+ * @param attributes The attributes supplied with the start tag. It may be empty.
54
+ * @throws InvalidFormatException Thrown if parameters were invalid.
55
+ */
56
+ public abstract void startElement (String name , Map <String , String > attributes )
57
+ throws InvalidFormatException ;
58
+
59
+ /**
60
+ * Handles a set of characters between SGML start and end tag.
61
+ *
62
+ * @param chars The characters to process.
63
+ * @throws InvalidFormatException Thrown if parameters were invalid.
64
+ */
65
+ public abstract void characters (CharSequence chars )
66
+ throws InvalidFormatException ;
67
+
68
+ /**
69
+ * Handles an SGML end element.
70
+ * @param name The name of the element's end tag.
71
+ */
72
+ public abstract void endElement (String name );
48
73
}
49
74
50
75
private static String extractTagName (CharSequence tagChars ) throws InvalidFormatException {
51
76
52
77
int fromOffset = 1 ;
53
-
54
- if (tagChars .length () > 1 && tagChars .charAt (1 ) == '/' ) {
78
+ if (tagChars .length () > 1 && tagChars .charAt (1 ) == SYMBOL_SLASH ) {
55
79
fromOffset = 2 ;
56
80
}
57
81
58
82
for (int ci = 1 ; ci < tagChars .length (); ci ++) {
59
-
60
- if (tagChars .charAt (ci ) == '>' || StringUtil .isWhitespace (tagChars .charAt (ci ))) {
83
+ if (tagChars .charAt (ci ) == SYMBOL_CLOSE || StringUtil .isWhitespace (tagChars .charAt (ci ))) {
61
84
return tagChars .subSequence (fromOffset , ci ).toString ();
62
85
}
63
86
}
@@ -90,15 +113,16 @@ private static Map<String, String> getAttributes(CharSequence tagChars) {
90
113
extractKey = true ;
91
114
}
92
115
// Equals sign or whitespace indicates end of key name
93
- else if (extractKey && ('=' == tagChars .charAt (i ) || StringUtil .isWhitespace (tagChars .charAt (i )))) {
116
+ else if (extractKey && (SYMBOL_EQUALS == tagChars .charAt (i ) ||
117
+ StringUtil .isWhitespace (tagChars .charAt (i )))) {
94
118
extractKey = false ;
95
119
}
96
120
// Inside key name, extract all chars
97
121
else if (extractKey ) {
98
122
key .append (tagChars .charAt (i ));
99
123
}
100
124
// " Indicates begin or end of value chars
101
- else if ('"' == tagChars .charAt (i )) {
125
+ else if (SYMBOL_QUOT == tagChars .charAt (i )) {
102
126
103
127
if (extractValue ) {
104
128
attributes .put (key .toString (), value .toString ());
@@ -107,7 +131,6 @@ else if ('"' == tagChars.charAt(i)) {
107
131
key .setLength (0 );
108
132
value .setLength (0 );
109
133
}
110
-
111
134
extractValue = !extractValue ;
112
135
}
113
136
// Inside value, extract all chars
@@ -119,6 +142,17 @@ else if (extractValue) {
119
142
return attributes ;
120
143
}
121
144
145
+ /**
146
+ * Parses an SGML document available via the input in {@link Reader}.
147
+ * The specified {@link ContentHandler} is responsible for interpreting the document
148
+ * specific details.
149
+ *
150
+ * @param in A {@link Reader} that provides the data of the SGML document.
151
+ * @param handler The {@link ContentHandler} to interpret the document with.
152
+ *
153
+ * @throws IOException Thrown if IO errors occurred.
154
+ * @throws InvalidFormatException Thrown if the document is malformed, e.g. an unexpected {@code <} or {@code >} character is encountered.
155
+ */
122
156
public void parse (Reader in , ContentHandler handler ) throws IOException {
123
157
124
158
StringBuilder buffer = new StringBuilder ();
@@ -130,45 +164,37 @@ public void parse(Reader in, ContentHandler handler) throws IOException {
130
164
int c ;
131
165
while ((c = in .read ()) != -1 ) {
132
166
133
- if ('<' == c ) {
167
+ if (SYMBOL_OPEN == c ) {
134
168
if (isInsideTag ) {
135
169
throw new InvalidFormatException ("Did not expect < char!" );
136
170
}
137
-
138
- if (buffer .toString ().trim ().length () > 0 ) {
171
+ if (!buffer .toString ().trim ().isEmpty ()) {
139
172
handler .characters (buffer .toString ().trim ());
140
173
}
141
-
142
174
buffer .setLength (0 );
143
-
144
175
isInsideTag = true ;
145
176
isStartTag = true ;
146
177
}
147
-
148
178
buffer .appendCodePoint (c );
149
179
150
- if ('/' == c && lastChar == '<' ) {
180
+ if (SYMBOL_SLASH == c && lastChar == SYMBOL_OPEN ) {
151
181
isStartTag = false ;
152
182
}
153
183
154
- if ('>' == c ) {
184
+ if (SYMBOL_CLOSE == c ) {
155
185
156
186
if (!isInsideTag ) {
157
187
throw new InvalidFormatException ("Did not expect > char!" );
158
188
}
159
-
160
189
if (isStartTag ) {
161
190
handler .startElement (extractTagName (buffer ), getAttributes (buffer ));
162
191
}
163
192
else {
164
193
handler .endElement (extractTagName (buffer ));
165
194
}
166
-
167
195
buffer .setLength (0 );
168
-
169
196
isInsideTag = false ;
170
197
}
171
-
172
198
lastChar = c ;
173
199
}
174
200
0 commit comments