2323import java .io .FileInputStream ;
2424import java .io .IOException ;
2525import java .io .InputStream ;
26+ import java .util .Arrays ;
2627import java .util .Iterator ;
2728import java .util .LinkedList ;
2829import java .util .List ;
3233
3334public class BratDocumentStream implements ObjectStream <BratDocument > {
3435
36+ private static final String SUFFIX_ANN = ".ann" ;
37+ private static final String SUFFIX_TXT = ".txt" ;
38+
3539 private final AnnotationConfiguration config ;
3640 private List <String > documentIds = new LinkedList <>();
3741 private Iterator <String > documentIdIterator ;
@@ -45,7 +49,7 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
4549 * to find training data files.
4650 * @param fileFilter a custom file filter to filter out certain files or null to accept all files
4751 *
48- * @throws IOException if reading from the brat directory fails in anyway
52+ * @throws IOException if reading from the brat directory fails in any way.
4953 */
5054 public BratDocumentStream (AnnotationConfiguration config , File bratCorpusDirectory ,
5155 boolean searchRecursive , FileFilter fileFilter ) throws IOException {
@@ -54,24 +58,20 @@ public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirecto
5458 throw new IOException ("Input corpus directory must be a directory " +
5559 "according to File.isDirectory()!" );
5660 }
57-
5861 this .config = config ;
5962
6063 Stack <File > directoryStack = new Stack <>();
6164 directoryStack .add (bratCorpusDirectory );
62-
6365 while (!directoryStack .isEmpty ()) {
64- for (File file : directoryStack .pop ().listFiles (fileFilter )) {
65-
66+ final File [] files = directoryStack .pop ().listFiles (fileFilter );
67+ Arrays .sort (files );
68+ for (File file : files ) {
6669 if (file .isFile ()) {
6770 String annFilePath = file .getAbsolutePath ();
68- if (annFilePath .endsWith (".ann" )) {
69-
71+ if (annFilePath .endsWith (SUFFIX_ANN )) {
7072 // cutoff last 4 chars ...
7173 String documentId = annFilePath .substring (0 , annFilePath .length () - 4 );
72-
73- File txtFile = new File (documentId + ".txt" );
74-
74+ File txtFile = new File (documentId + SUFFIX_TXT );
7575 if (txtFile .exists () && txtFile .isFile ()) {
7676 documentIds .add (documentId );
7777 }
@@ -82,24 +82,19 @@ else if (searchRecursive && file.isDirectory()) {
8282 }
8383 }
8484 }
85-
8685 reset ();
8786 }
8887
8988 @ Override
9089 public BratDocument read () throws IOException {
91-
9290 BratDocument doc = null ;
93-
9491 if (documentIdIterator .hasNext ()) {
9592 String id = documentIdIterator .next ();
96-
97- try (InputStream txtIn = new BufferedInputStream (new FileInputStream (id + ".txt" ));
98- InputStream annIn = new BufferedInputStream (new FileInputStream (id + ".ann" ))) {
93+ try (InputStream txtIn = new BufferedInputStream (new FileInputStream (id + SUFFIX_TXT ));
94+ InputStream annIn = new BufferedInputStream (new FileInputStream (id + SUFFIX_ANN ))) {
9995 doc = BratDocument .parseDocument (config , id , txtIn , annIn );
10096 }
10197 }
102-
10398 return doc ;
10499 }
105100
0 commit comments