@@ -39,25 +39,26 @@ public class DocumentMatcher implements Matcher {
39
39
40
40
/**
41
41
* Set to 512K {@code int}, but {@code FIRST_LOOK_WIDTH} and
42
- * {@code LINE_LIMIT} should apply beforehand
42
+ * {@code LINE_LIMIT} should apply beforehand. This value is "effectively
43
+ * unbounded" without being literally 2_147_483_647 -- as the other limits
44
+ * will apply first, and the {@link java.io.BufferedInputStream} will
45
+ * manage a reasonably-sized buffer.
43
46
*/
44
47
private static final int MARK_READ_LIMIT = 1024 * 512 ;
45
48
46
49
private static final int LINE_LIMIT = 100 ;
47
50
48
51
private static final int FIRST_LOOK_WIDTH = 300 ;
49
52
50
- private static final int FIRST_CONTENT_WIDTH = 8 ;
51
-
52
53
private final FileAnalyzerFactory factory ;
53
54
54
55
private final String [] lineStarters ;
55
56
56
57
/**
57
- * Initializes an instance for the required parameters
58
+ * Initializes an instance for the required parameters.
58
59
* @param factory required factory to return when matched
59
60
* @param lineStarters required list of line starters that indicate a match
60
- * @throws IllegalArgumentException if any parameter is null
61
+ * @throws IllegalArgumentException thrown if any parameter is null
61
62
*/
62
63
public DocumentMatcher (FileAnalyzerFactory factory , String [] lineStarters ) {
63
64
if (factory == null ) {
@@ -83,9 +84,9 @@ public DocumentMatcher(FileAnalyzerFactory factory, String[] lineStarters) {
83
84
}
84
85
85
86
/**
86
- * Try to match the file contents by first affirming the document starts
87
- * with "." or "'" and then looks for {@code lineStarters} in the first
88
- * 100 lines .
87
+ * Try to match the file contents by looking for {@code lineStarters} in
88
+ * the first 100 lines while also affirming that the document starts
89
+ * with "." or "'" after a limited amount of whitespace .
89
90
* <p>
90
91
* The stream is reset before returning.
91
92
*
@@ -102,6 +103,7 @@ public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
102
103
if (!in .markSupported ()) return null ;
103
104
in .mark (MARK_READ_LIMIT );
104
105
106
+ // read encoding, and skip past any BOM
105
107
int bomLength = 0 ;
106
108
String encoding = IOUtils .findBOMEncoding (contents );
107
109
if (encoding == null ) {
@@ -114,56 +116,17 @@ public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
114
116
}
115
117
}
116
118
117
- BufferedReader rdr = new BufferedReader (new InputStreamReader (
118
- in , encoding ));
119
-
120
- // Before reading a line, read some characters for a first look
121
- char [] buf = new char [FIRST_LOOK_WIDTH ];
122
- int lenFirstLook ;
123
- if ((lenFirstLook = rdr .read (buf )) < 1 ) {
124
- in .reset ();
125
- return null ;
126
- }
127
-
128
- // Require a "." or "'" as the first non-whitespace character after
129
- // only a limited number of whitespaces or else infer it is not troff
130
- // or mandoc.
131
- int actualFirstContentWidth = lenFirstLook < FIRST_CONTENT_WIDTH ?
132
- lenFirstLook : FIRST_CONTENT_WIDTH ;
133
- boolean foundContent = false ;
134
- for (int i = 0 ; i < actualFirstContentWidth ; ++i ) {
135
- if (buf [i ] == '.' || buf [i ] == '\'' ) {
136
- foundContent = true ;
137
- break ;
138
- } else if (!Character .isWhitespace (buf [i ])) {
139
- in .reset ();
140
- return null ;
141
- }
142
- }
143
- if (!foundContent ) {
144
- in .reset ();
145
- return null ;
146
- }
147
-
148
- // affirm that a LF is seen in the first look or else quickly
149
- // infer it is not troff
150
- boolean foundLF = false ;
151
- for (int i = 0 ; i < lenFirstLook ; ++i ) {
152
- if (buf [i ] == '\n' ) {
153
- foundLF = true ;
154
- break ;
155
- }
156
- }
157
- if (!foundLF ) {
158
- in .reset ();
159
- return null ;
160
- }
161
-
162
- // reset for line-by-line reading below
119
+ // affirm that a LF exists in a first block
120
+ boolean foundLF = hasLineFeed (in , encoding );
163
121
in .reset ();
122
+ if (!foundLF ) return null ;
164
123
if (bomLength > 0 ) in .skip (bomLength );
165
- rdr = new BufferedReader (new InputStreamReader (in , encoding ));
166
124
125
+ // read line-by-line for a first few lines
126
+ BufferedReader rdr = new BufferedReader (new InputStreamReader (
127
+ in , encoding ));
128
+ boolean foundContent = false ;
129
+ int numFirstChars = 0 ;
167
130
int numLines = 0 ;
168
131
String line ;
169
132
while ((line = rdr .readLine ()) != null ) {
@@ -177,9 +140,74 @@ public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
177
140
in .reset ();
178
141
return null ;
179
142
}
143
+
144
+ // If not yet `foundContent', then only a limited allowance is
145
+ // given until a sentinel '.' or '\'' must be seen after nothing
146
+ // else but whitespace.
147
+ if (!foundContent ) {
148
+ for (int i = 0 ; i < line .length () && numFirstChars <
149
+ FIRST_LOOK_WIDTH ; ++i , ++numFirstChars ) {
150
+ char c = line .charAt (i );
151
+ if (c == '.' || c == '\'' ) {
152
+ foundContent = true ;
153
+ break ;
154
+ } else if (!Character .isWhitespace (c )) {
155
+ in .reset ();
156
+ return null ;
157
+ }
158
+ }
159
+ if (!foundContent && numFirstChars >= FIRST_LOOK_WIDTH ) {
160
+ in .reset ();
161
+ return null ;
162
+ }
163
+ }
180
164
}
181
165
182
166
in .reset ();
183
167
return null ;
184
168
}
169
+
170
+ /**
171
+ * Determines if the {@code in} stream has a line feed character within the
172
+ * first {@code FIRST_LOOK_WIDTH} characters.
173
+ * @param in the input stream has any BOM (not {@code reset} after use)
174
+ * @param encoding the input stream charset
175
+ * @return true if a line feed '\n' was found
176
+ * @throws IOException thrown on any error in reading
177
+ */
178
+ private boolean hasLineFeed (InputStream in , String encoding )
179
+ throws IOException {
180
+ byte [] buf ;
181
+ int nextra ;
182
+ int noff ;
183
+ switch (encoding ) {
184
+ case "UTF-16LE" :
185
+ buf = new byte [FIRST_LOOK_WIDTH * 2 ];
186
+ nextra = 1 ;
187
+ noff = 0 ;
188
+ break ;
189
+ case "UTF-16BE" :
190
+ buf = new byte [FIRST_LOOK_WIDTH * 2 ];
191
+ nextra = 1 ;
192
+ noff = 1 ;
193
+ break ;
194
+ default :
195
+ buf = new byte [FIRST_LOOK_WIDTH ];
196
+ nextra = 0 ;
197
+ noff = 0 ;
198
+ break ;
199
+ }
200
+
201
+ int nread = in .read (buf );
202
+ for (int i = 0 ; i + nextra < nread ; i += 1 + nextra ) {
203
+ if (nextra > 0 ) {
204
+ if (buf [i + noff ] == '\n' && buf [i + 1 - noff ] == '\0' ) {
205
+ return true ;
206
+ }
207
+ } else {
208
+ if (buf [i ] == '\n' ) return true ;
209
+ }
210
+ }
211
+ return false ;
212
+ }
185
213
}
0 commit comments