Skip to content

Commit c9440e6

Browse files
mawiesne authored and rzo1 committed
OPENNLP-1702: BratDocumentStream should process files in bratCorpusDir deterministically
- fixes the issue by sorting all candidate files from the directory lexicographically
- extracts constants where applicable
1 parent c5cc1f0 commit c9440e6

File tree

1 file changed

+12
-17
lines changed

1 file changed

+12
-17
lines changed

opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java

Lines changed: 12 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
import java.io.FileInputStream;
2424
import java.io.IOException;
2525
import java.io.InputStream;
26+
import java.util.Arrays;
2627
import java.util.Iterator;
2728
import java.util.LinkedList;
2829
import java.util.List;
@@ -32,6 +33,9 @@
3233

3334
public class BratDocumentStream implements ObjectStream<BratDocument> {
3435

36+
private static final String SUFFIX_ANN = ".ann";
37+
private static final String SUFFIX_TXT = ".txt";
38+
3539
private final AnnotationConfiguration config;
3640
private List<String> documentIds = new LinkedList<>();
3741
private Iterator<String> documentIdIterator;
@@ -45,7 +49,7 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
4549
* to find training data files.
4650
* @param fileFilter a custom file filter to filter out certain files or null to accept all files
4751
*
48-
* @throws IOException if reading from the brat directory fails in anyway
52+
* @throws IOException if reading from the brat directory fails in any way.
4953
*/
5054
public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirectory,
5155
boolean searchRecursive, FileFilter fileFilter) throws IOException {
@@ -54,24 +58,20 @@ public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirecto
5458
throw new IOException("Input corpus directory must be a directory " +
5559
"according to File.isDirectory()!");
5660
}
57-
5861
this.config = config;
5962

6063
Stack<File> directoryStack = new Stack<>();
6164
directoryStack.add(bratCorpusDirectory);
62-
6365
while (!directoryStack.isEmpty()) {
64-
for (File file : directoryStack.pop().listFiles(fileFilter)) {
65-
66+
final File[] files = directoryStack.pop().listFiles(fileFilter);
67+
Arrays.sort(files);
68+
for (File file : files) {
6669
if (file.isFile()) {
6770
String annFilePath = file.getAbsolutePath();
68-
if (annFilePath.endsWith(".ann")) {
69-
71+
if (annFilePath.endsWith(SUFFIX_ANN)) {
7072
// cutoff last 4 chars ...
7173
String documentId = annFilePath.substring(0, annFilePath.length() - 4);
72-
73-
File txtFile = new File(documentId + ".txt");
74-
74+
File txtFile = new File(documentId + SUFFIX_TXT);
7575
if (txtFile.exists() && txtFile.isFile()) {
7676
documentIds.add(documentId);
7777
}
@@ -82,24 +82,19 @@ else if (searchRecursive && file.isDirectory()) {
8282
}
8383
}
8484
}
85-
8685
reset();
8786
}
8887

8988
@Override
9089
public BratDocument read() throws IOException {
91-
9290
BratDocument doc = null;
93-
9491
if (documentIdIterator.hasNext()) {
9592
String id = documentIdIterator.next();
96-
97-
try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + ".txt"));
98-
InputStream annIn = new BufferedInputStream(new FileInputStream(id + ".ann"))) {
93+
try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_TXT));
94+
InputStream annIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_ANN))) {
9995
doc = BratDocument.parseDocument(config, id, txtIn, annIn);
10096
}
10197
}
102-
10398
return doc;
10499
}
105100

0 commit comments

Comments
 (0)