From c9440e68e098654a9f4315c816d0f0dd0cbef7b5 Mon Sep 17 00:00:00 2001 From: Martin Wiesner Date: Fri, 24 Jan 2025 15:41:46 +0100 Subject: [PATCH] OPENNLP-1702: BratDocumentStream should process files in bratCorpusDir deterministically - fix by sorting all candidate files from dir lexicographically - extracts constants where applicable --- .../formats/brat/BratDocumentStream.java | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java index 5abaf0ea8..36d6d287e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java @@ -23,6 +23,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -32,6 +33,9 @@ public class BratDocumentStream implements ObjectStream { + private static final String SUFFIX_ANN = ".ann"; + private static final String SUFFIX_TXT = ".txt"; + private final AnnotationConfiguration config; private List documentIds = new LinkedList<>(); private Iterator documentIdIterator; @@ -45,7 +49,7 @@ public class BratDocumentStream implements ObjectStream { * to find training data files. * @param fileFilter a custom file filter to filter out certain files or null to accept all files * - * @throws IOException if reading from the brat directory fails in anyway + * @throws IOException if reading from the brat directory fails in any way. */ public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirectory, boolean searchRecursive, FileFilter fileFilter) throws IOException { @@ -54,24 +58,20 @@ public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirecto throw new IOException("Input corpus directory must be a directory " + "according to File.isDirectory()!"); } - this.config = config; Stack directoryStack = new Stack<>(); directoryStack.add(bratCorpusDirectory); - while (!directoryStack.isEmpty()) { - for (File file : directoryStack.pop().listFiles(fileFilter)) { - + final File[] files = directoryStack.pop().listFiles(fileFilter); + Arrays.sort(files); + for (File file : files) { if (file.isFile()) { String annFilePath = file.getAbsolutePath(); - if (annFilePath.endsWith(".ann")) { - + if (annFilePath.endsWith(SUFFIX_ANN)) { // cutoff last 4 chars ... String documentId = annFilePath.substring(0, annFilePath.length() - 4); - - File txtFile = new File(documentId + ".txt"); - + File txtFile = new File(documentId + SUFFIX_TXT); if (txtFile.exists() && txtFile.isFile()) { documentIds.add(documentId); } @@ -82,24 +82,19 @@ else if (searchRecursive && file.isDirectory()) { } } } - reset(); } @Override public BratDocument read() throws IOException { - BratDocument doc = null; - if (documentIdIterator.hasNext()) { String id = documentIdIterator.next(); - - try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + ".txt")); - InputStream annIn = new BufferedInputStream(new FileInputStream(id + ".ann"))) { + try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_TXT)); + InputStream annIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_ANN))) { doc = BratDocument.parseDocument(config, id, txtIn, annIn); } } - return doc; }