Skip to content

Commit 1591a0d

Browse files
committed
Create at most one BufferedReader for DocumentMatcher.isMagic()
- check for LF in the bytes not the chars - affirm '.' or '\'' contemporaneously with matching lineStarters
1 parent fa18568 commit 1591a0d

File tree

1 file changed

+83
-55
lines changed

1 file changed

+83
-55
lines changed

src/org/opensolaris/opengrok/analysis/document/DocumentMatcher.java

Lines changed: 83 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -39,25 +39,26 @@ public class DocumentMatcher implements Matcher {
3939

4040
/**
4141
* Set to 512K {@code int}, but {@code FIRST_LOOK_WIDTH} and
42-
* {@code LINE_LIMIT} should apply beforehand
42+
* {@code LINE_LIMIT} should apply beforehand. This value is "effectively
43+
* unbounded" without being literally 2_147_483_647 -- as the other limits
44+
* will apply first, and the {@link java.io.BufferedInputStream} will
45+
* manage a reasonably-sized buffer.
4346
*/
4447
private static final int MARK_READ_LIMIT = 1024 * 512;
4548

4649
private static final int LINE_LIMIT = 100;
4750

4851
private static final int FIRST_LOOK_WIDTH = 300;
4952

50-
private static final int FIRST_CONTENT_WIDTH = 8;
51-
5253
private final FileAnalyzerFactory factory;
5354

5455
private final String[] lineStarters;
5556

5657
/**
57-
* Initializes an instance for the required parameters
58+
* Initializes an instance for the required parameters.
5859
* @param factory required factory to return when matched
5960
* @param lineStarters required list of line starters that indicate a match
60-
* @throws IllegalArgumentException if any parameter is null
61+
* @throws IllegalArgumentException thrown if any parameter is null
6162
*/
6263
public DocumentMatcher(FileAnalyzerFactory factory, String[] lineStarters) {
6364
if (factory == null) {
@@ -83,9 +84,9 @@ public DocumentMatcher(FileAnalyzerFactory factory, String[] lineStarters) {
8384
}
8485

8586
/**
86-
* Try to match the file contents by first affirming the document starts
87-
* with "." or "'" and then looks for {@code lineStarters} in the first
88-
* 100 lines.
87+
* Try to match the file contents by looking for {@code lineStarters} in
88+
* the first 100 lines while also affirming that the document starts
89+
* with "." or "'" after a limited amount of whitespace.
8990
* <p>
9091
* The stream is reset before returning.
9192
*
@@ -102,6 +103,7 @@ public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
102103
if (!in.markSupported()) return null;
103104
in.mark(MARK_READ_LIMIT);
104105

106+
// read encoding, and skip past any BOM
105107
int bomLength = 0;
106108
String encoding = IOUtils.findBOMEncoding(contents);
107109
if (encoding == null) {
@@ -114,56 +116,17 @@ public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
114116
}
115117
}
116118

117-
BufferedReader rdr = new BufferedReader(new InputStreamReader(
118-
in, encoding));
119-
120-
// Before reading a line, read some characters for a first look
121-
char[] buf = new char[FIRST_LOOK_WIDTH];
122-
int lenFirstLook;
123-
if ((lenFirstLook = rdr.read(buf)) < 1) {
124-
in.reset();
125-
return null;
126-
}
127-
128-
// Require a "." or "'" as the first non-whitespace character after
129-
// only a limited number of whitespaces or else infer it is not troff
130-
// or mandoc.
131-
int actualFirstContentWidth = lenFirstLook < FIRST_CONTENT_WIDTH ?
132-
lenFirstLook : FIRST_CONTENT_WIDTH;
133-
boolean foundContent = false;
134-
for (int i = 0; i < actualFirstContentWidth; ++i) {
135-
if (buf[i] == '.' || buf[i] == '\'') {
136-
foundContent = true;
137-
break;
138-
} else if (!Character.isWhitespace(buf[i])) {
139-
in.reset();
140-
return null;
141-
}
142-
}
143-
if (!foundContent) {
144-
in.reset();
145-
return null;
146-
}
147-
148-
// affirm that a LF is seen in the first look or else quickly
149-
// infer it is not troff
150-
boolean foundLF = false;
151-
for (int i = 0; i < lenFirstLook; ++i) {
152-
if (buf[i] == '\n') {
153-
foundLF = true;
154-
break;
155-
}
156-
}
157-
if (!foundLF) {
158-
in.reset();
159-
return null;
160-
}
161-
162-
// reset for line-by-line reading below
119+
// affirm that a LF exists in a first block
120+
boolean foundLF = hasLineFeed(in, encoding);
163121
in.reset();
122+
if (!foundLF) return null;
164123
if (bomLength > 0) in.skip(bomLength);
165-
rdr = new BufferedReader(new InputStreamReader(in, encoding));
166124

125+
// read line-by-line for a first few lines
126+
BufferedReader rdr = new BufferedReader(new InputStreamReader(
127+
in, encoding));
128+
boolean foundContent = false;
129+
int numFirstChars = 0;
167130
int numLines = 0;
168131
String line;
169132
while ((line = rdr.readLine()) != null) {
@@ -177,9 +140,74 @@ public FileAnalyzerFactory isMagic(byte[] contents, InputStream in)
177140
in.reset();
178141
return null;
179142
}
143+
144+
// If not yet `foundContent', then only a limited allowance is
145+
// given until a sentinel '.' or '\'' must be seen after nothing
146+
// else but whitespace.
147+
if (!foundContent) {
148+
for (int i = 0; i < line.length() && numFirstChars <
149+
FIRST_LOOK_WIDTH; ++i, ++numFirstChars) {
150+
char c = line.charAt(i);
151+
if (c == '.' || c == '\'') {
152+
foundContent = true;
153+
break;
154+
} else if (!Character.isWhitespace(c)) {
155+
in.reset();
156+
return null;
157+
}
158+
}
159+
if (!foundContent && numFirstChars >= FIRST_LOOK_WIDTH) {
160+
in.reset();
161+
return null;
162+
}
163+
}
180164
}
181165

182166
in.reset();
183167
return null;
184168
}
169+
170+
/**
171+
* Determines if the {@code in} stream has a line feed character within the
172+
* first {@code FIRST_LOOK_WIDTH} characters.
173+
* @param in the input stream has any BOM (not {@code reset} after use)
174+
* @param encoding the input stream charset
175+
* @return true if a line feed '\n' was found
176+
* @throws IOException thrown on any error in reading
177+
*/
178+
private boolean hasLineFeed(InputStream in, String encoding)
179+
throws IOException {
180+
byte[] buf;
181+
int nextra;
182+
int noff;
183+
switch (encoding) {
184+
case "UTF-16LE":
185+
buf = new byte[FIRST_LOOK_WIDTH * 2];
186+
nextra = 1;
187+
noff = 0;
188+
break;
189+
case "UTF-16BE":
190+
buf = new byte[FIRST_LOOK_WIDTH * 2];
191+
nextra = 1;
192+
noff = 1;
193+
break;
194+
default:
195+
buf = new byte[FIRST_LOOK_WIDTH];
196+
nextra = 0;
197+
noff = 0;
198+
break;
199+
}
200+
201+
int nread = in.read(buf);
202+
for (int i = 0; i + nextra < nread; i += 1 + nextra) {
203+
if (nextra > 0) {
204+
if (buf[i + noff] == '\n' && buf[i + 1 - noff] == '\0') {
205+
return true;
206+
}
207+
} else {
208+
if (buf[i] == '\n') return true;
209+
}
210+
}
211+
return false;
212+
}
185213
}

0 commit comments

Comments
 (0)