Skip to content

Commit

Permalink
OPENNLP-1702: BratDocumentStream should process files in bratCorpusDir deterministically
Browse files Browse the repository at this point in the history

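- fixes the nondeterministic order by sorting the candidate files of each directory lexicographically (Arrays.sort) before processing them
- extracts constants for the ".ann" and ".txt" file suffixes (SUFFIX_ANN, SUFFIX_TXT)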
  • Loading branch information
mawiesne authored and rzo1 committed Jan 24, 2025
1 parent c5cc1f0 commit c9440e6
Showing 1 changed file with 12 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
Expand All @@ -32,6 +33,9 @@

public class BratDocumentStream implements ObjectStream<BratDocument> {

private static final String SUFFIX_ANN = ".ann";
private static final String SUFFIX_TXT = ".txt";

private final AnnotationConfiguration config;
private List<String> documentIds = new LinkedList<>();
private Iterator<String> documentIdIterator;
Expand All @@ -45,7 +49,7 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
* to find training data files.
* @param fileFilter a custom file filter to filter out certain files or null to accept all files
*
* @throws IOException if reading from the brat directory fails in anyway
* @throws IOException if reading from the brat directory fails in any way.
*/
public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirectory,
boolean searchRecursive, FileFilter fileFilter) throws IOException {
Expand All @@ -54,24 +58,20 @@ public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirecto
throw new IOException("Input corpus directory must be a directory " +
"according to File.isDirectory()!");
}

this.config = config;

Stack<File> directoryStack = new Stack<>();
directoryStack.add(bratCorpusDirectory);

while (!directoryStack.isEmpty()) {
for (File file : directoryStack.pop().listFiles(fileFilter)) {

final File[] files = directoryStack.pop().listFiles(fileFilter);
Arrays.sort(files);
for (File file : files) {
if (file.isFile()) {
String annFilePath = file.getAbsolutePath();
if (annFilePath.endsWith(".ann")) {

if (annFilePath.endsWith(SUFFIX_ANN)) {
// cutoff last 4 chars ...
String documentId = annFilePath.substring(0, annFilePath.length() - 4);

File txtFile = new File(documentId + ".txt");

File txtFile = new File(documentId + SUFFIX_TXT);
if (txtFile.exists() && txtFile.isFile()) {
documentIds.add(documentId);
}
Expand All @@ -82,24 +82,19 @@ else if (searchRecursive && file.isDirectory()) {
}
}
}

reset();
}

@Override
public BratDocument read() throws IOException {

BratDocument doc = null;

if (documentIdIterator.hasNext()) {
String id = documentIdIterator.next();

try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + ".txt"));
InputStream annIn = new BufferedInputStream(new FileInputStream(id + ".ann"))) {
try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_TXT));
InputStream annIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_ANN))) {
doc = BratDocument.parseDocument(config, id, txtIn, annIn);
}
}

return doc;
}

Expand Down

0 comments on commit c9440e6

Please sign in to comment.