23
23
import java .io .FileInputStream ;
24
24
import java .io .IOException ;
25
25
import java .io .InputStream ;
26
+ import java .util .Arrays ;
26
27
import java .util .Iterator ;
27
28
import java .util .LinkedList ;
28
29
import java .util .List ;
32
33
33
34
public class BratDocumentStream implements ObjectStream <BratDocument > {
34
35
36
+ private static final String SUFFIX_ANN = ".ann" ;
37
+ private static final String SUFFIX_TXT = ".txt" ;
38
+
35
39
private final AnnotationConfiguration config ;
36
40
private List <String > documentIds = new LinkedList <>();
37
41
private Iterator <String > documentIdIterator ;
@@ -45,7 +49,7 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
45
49
* to find training data files.
46
50
* @param fileFilter a custom file filter to filter out certain files or null to accept all files
47
51
*
48
- * @throws IOException if reading from the brat directory fails in anyway
52
+ * @throws IOException if reading from the brat directory fails in any way.
49
53
*/
50
54
public BratDocumentStream (AnnotationConfiguration config , File bratCorpusDirectory ,
51
55
boolean searchRecursive , FileFilter fileFilter ) throws IOException {
@@ -54,24 +58,20 @@ public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirecto
54
58
throw new IOException ("Input corpus directory must be a directory " +
55
59
"according to File.isDirectory()!" );
56
60
}
57
-
58
61
this .config = config ;
59
62
60
63
Stack <File > directoryStack = new Stack <>();
61
64
directoryStack .add (bratCorpusDirectory );
62
-
63
65
while (!directoryStack .isEmpty ()) {
64
- for (File file : directoryStack .pop ().listFiles (fileFilter )) {
65
-
66
+ final File [] files = directoryStack .pop ().listFiles (fileFilter );
67
+ Arrays .sort (files );
68
+ for (File file : files ) {
66
69
if (file .isFile ()) {
67
70
String annFilePath = file .getAbsolutePath ();
68
- if (annFilePath .endsWith (".ann" )) {
69
-
71
+ if (annFilePath .endsWith (SUFFIX_ANN )) {
70
72
// cutoff last 4 chars ...
71
73
String documentId = annFilePath .substring (0 , annFilePath .length () - 4 );
72
-
73
- File txtFile = new File (documentId + ".txt" );
74
-
74
+ File txtFile = new File (documentId + SUFFIX_TXT );
75
75
if (txtFile .exists () && txtFile .isFile ()) {
76
76
documentIds .add (documentId );
77
77
}
@@ -82,24 +82,19 @@ else if (searchRecursive && file.isDirectory()) {
82
82
}
83
83
}
84
84
}
85
-
86
85
reset ();
87
86
}
88
87
89
88
@ Override
90
89
public BratDocument read () throws IOException {
91
-
92
90
BratDocument doc = null ;
93
-
94
91
if (documentIdIterator .hasNext ()) {
95
92
String id = documentIdIterator .next ();
96
-
97
- try (InputStream txtIn = new BufferedInputStream (new FileInputStream (id + ".txt" ));
98
- InputStream annIn = new BufferedInputStream (new FileInputStream (id + ".ann" ))) {
93
+ try (InputStream txtIn = new BufferedInputStream (new FileInputStream (id + SUFFIX_TXT ));
94
+ InputStream annIn = new BufferedInputStream (new FileInputStream (id + SUFFIX_ANN ))) {
99
95
doc = BratDocument .parseDocument (config , id , txtIn , annIn );
100
96
}
101
97
}
102
-
103
98
return doc ;
104
99
}
105
100
0 commit comments