16
16
import java .util .NoSuchElementException ;
17
17
18
18
import org .jdom2 .Document ;
19
+ import org .jdom2 .Element ;
19
20
import org .jdom2 .input .SAXBuilder ;
20
21
import org .jdom2 .input .sax .XMLReaders ;
21
22
import org .jdom2 .output .Format ;
33
34
import edu .unc .lib .deposit .work .AbstractDepositJob ;
34
35
import edu .unc .lib .dl .fedora .PID ;
35
36
import edu .unc .lib .dl .util .ContentModelHelper ;
37
+ import edu .unc .lib .dl .util .ContentModelHelper .CDRProperty ;
36
38
import edu .unc .lib .dl .util .ContentModelHelper .DepositRelationship ;
37
39
import edu .unc .lib .dl .util .DepositConstants ;
38
40
import edu .unc .lib .dl .util .PremisEventLogger .Type ;
@@ -65,6 +67,7 @@ public void runJob() {
65
67
Property hasModel = model .createProperty (ContentModelHelper .FedoraProperty .hasModel .getURI ().toString ());
66
68
Resource aggregateModel = model .createProperty (ContentModelHelper .Model .AGGREGATE_WORK .getURI ().toString ());
67
69
Property fileLocation = model .createProperty (ContentModelHelper .DepositRelationship .stagingLocation .toString ());
70
+ Property mimetype = model .createProperty (ContentModelHelper .DepositRelationship .mimetype .toString ());
68
71
69
72
// Find the aggregate objects resource
70
73
Bag aggregate = null ;
@@ -79,87 +82,110 @@ public void runJob() {
79
82
failJob (e , "Cannot find top contents of deposit." );
80
83
}
81
84
log .debug ("identified aggregate {}" , aggregate );
82
-
83
- // structure aggregate work, find the article XML
84
- String articleXMLPath = null ;
85
- for (NodeIterator children = aggregate .iterator (); children .hasNext ();) {
86
- Resource child = children .nextNode ().asResource ();
87
- String location = child .getProperty (fileLocation ).getString ();
88
- log .debug ("examining child location {}" , location );
89
- if (location .matches ("data/[\\ w\\ -]+\\ -S\\ d+\\ .\\ w+$" )) continue ;
90
- if (location .matches ("data/[\\ w\\ -]+\\ .[xX][mM][lL]$" )) {
91
- log .debug ("Found primary Biomed XML document {}" , location );
92
- articleXMLPath = location ;
93
-
94
- // Remove the article xml as a supplemental file
95
- child .removeProperties ();
96
- StmtIterator sIt = model .listStatements (aggregate , null , child );
97
- aggregate .remove (sIt .nextStatement ());
98
- sIt .close ();
99
- } else {
85
+
86
+ // Disable DTD validation of the article xml
87
+ SAXBuilder sb = new SAXBuilder (XMLReaders .NONVALIDATING );
88
+ sb .setFeature ("http://xml.org/sax/features/validation" , false );
89
+ sb .setFeature ("http://apache.org/xml/features/nonvalidating/load-dtd-grammar" , false );
90
+ sb .setFeature ("http://apache.org/xml/features/nonvalidating/load-external-dtd" , false );
91
+
92
+ Document articleDocument = null ;
93
+ String articleId = null ;
94
+
95
+ try {
96
+ // Search through the incoming children files to find the primary article XML document
97
+ for (NodeIterator children = aggregate .iterator (); children .hasNext ();) {
98
+ Resource child = children .nextNode ().asResource ();
99
+ String location = child .getProperty (fileLocation ).getString ();
100
+ if (location .matches ("data/[\\ w\\ -]+\\ .[xX][mM][lL]$" )) {
101
+ File articleXMLFile = new File (getDepositDirectory (), location );
102
+ articleDocument = sb .build (articleXMLFile );
103
+
104
+ Element articleEl = articleDocument .getRootElement ();
105
+ // Store the identifier for this article to track down the primary file
106
+ articleId = articleEl .getChildText ("ui" );
107
+
108
+ if (!"art" .equals (articleEl .getName ()) || articleId == null ) {
109
+ // False alarm, this is a supplemental xml file
110
+ continue ;
111
+ }
112
+
113
+ log .debug ("Found primary Biomed XML document {}" , location );
114
+ // Assign the article xml as a source metadata datastream
115
+ setSourceMetadata (model , aggregate , location );
116
+ // Remove the article xml as a supplemental file
117
+ child .removeProperties ();
118
+ StmtIterator sIt = model .listStatements (aggregate , null , child );
119
+ aggregate .remove (sIt .nextStatement ());
120
+ sIt .close ();
121
+
122
+ break ;
123
+ }
124
+ }
125
+
126
+ if (articleDocument == null || articleId == null ) {
127
+ failJob ("Invalid BioMed package, could not locate the primary article XML document or its identifer." ,
128
+ null );
129
+ }
130
+
131
+ // Find the main article file
132
+ for (NodeIterator children = aggregate .iterator (); children .hasNext ();) {
133
+ Resource child = children .nextNode ().asResource ();
134
+ String location = child .getProperty (fileLocation ).getString ();
135
+ String mimetypeValue = child .getProperty (mimetype ).getString ();
136
+ // filename will be the article ID, but not XML
137
+ if ("text/xml" .equals (mimetypeValue ) || location .indexOf (articleId + "." ) == -1 ) continue ;
138
+
100
139
log .debug ("Found primary Biomed content document {}" , location );
101
140
// If this is a main object, then designate it as a default web object for its parent container
102
- Property defaultObject = model .getProperty (ContentModelHelper . CDRProperty .defaultWebObject .getURI ().toString ());
141
+ Property defaultObject = model .getProperty (CDRProperty .defaultWebObject .getURI ().toString ());
103
142
model .add (aggregate , defaultObject , child );
104
143
}
105
- }
106
-
107
- if (articleXMLPath != null ) {
108
- // Assign the article xml as a source metadata datastream
109
- setSourceMetadata (model , aggregate , articleXMLPath );
110
144
111
145
// Build the descriptive MODS document from the article XML and any existing MODS
112
- File articleXMLFile = new File (getDepositDirectory (), articleXMLPath );
113
146
PID aggregatePID = new PID (aggregate .getURI ());
114
147
File modsFile = new File (getDescriptionDir (), aggregatePID .getUUID ()+".xml" );
115
148
116
- try {
117
- // Disable DTD validation of the article xml
118
- SAXBuilder sb = new SAXBuilder (XMLReaders .NONVALIDATING );
119
- sb .setFeature ("http://xml.org/sax/features/validation" , false );
120
- sb .setFeature ("http://apache.org/xml/features/nonvalidating/load-dtd-grammar" , false );
121
- sb .setFeature ("http://apache.org/xml/features/nonvalidating/load-external-dtd" , false );
122
-
123
- Document existingModsDocument = null ;
124
- // Start from an existing MODS document if there is one
125
- if (modsFile .exists ()) {
126
- existingModsDocument = sb .build (modsFile );
127
- } else {
128
- // Make sure the description directory exists since there was no MODS doc
129
- File descriptionDir = new File (getDepositDirectory (), DepositConstants .DESCRIPTION_DIR );
130
- if (!descriptionDir .exists ())
131
- descriptionDir .mkdir ();
132
- }
149
+ Document existingModsDocument = null ;
150
+ // Start from an existing MODS document if there is one
151
+ if (modsFile .exists ()) {
152
+ existingModsDocument = sb .build (modsFile );
153
+ } else {
154
+ // Make sure the description directory exists since there was no MODS doc
155
+ File descriptionDir = new File (getDepositDirectory (), DepositConstants .DESCRIPTION_DIR );
156
+ if (!descriptionDir .exists ())
157
+ descriptionDir .mkdir ();
158
+ }
133
159
134
- Document articleDocument = sb .build (articleXMLFile );
135
- BioMedArticleHelper biohelper = new BioMedArticleHelper ();
136
- Document mods = biohelper .extractMODS (articleDocument , existingModsDocument );
137
- Map <String , String > fileLC2supplementLabels = biohelper .getFilesLC2SupplementLabels (articleDocument );
160
+ BioMedArticleHelper biohelper = new BioMedArticleHelper ();
161
+ Document mods = biohelper .extractMODS (articleDocument , existingModsDocument );
162
+ Map <String , String > fileLC2supplementLabels = biohelper .getFilesLC2SupplementLabels (articleDocument );
138
163
139
- // Output the new MODS file, overwriting the existing one if it was present
140
- try (FileOutputStream out = new FileOutputStream (modsFile , false )) {
141
- new XMLOutputter (Format .getPrettyFormat ()).output (mods , out );
142
- }
164
+ // Output the new MODS file, overwriting the existing one if it was present
165
+ try (FileOutputStream out = new FileOutputStream (modsFile , false )) {
166
+ new XMLOutputter (Format .getPrettyFormat ()).output (mods , out );
167
+ }
143
168
144
- // Label the supplemental files with values from the article xml
145
- if (fileLC2supplementLabels != null ) {
146
- for (NodeIterator children = aggregate .iterator (); children .hasNext ();) {
147
- Resource child = children .nextNode ().asResource ();
148
- String location = child .getProperty (fileLocation ).getString ();
149
- String filename = location .substring ("data/" .length ()).toLowerCase ();
150
- if (fileLC2supplementLabels .containsKey (filename )) {
151
- model .add (child , dprop (model , DepositRelationship .label ), fileLC2supplementLabels .get (filename ));
152
- }
169
+ // Label the supplemental files with values from the article xml
170
+ if (fileLC2supplementLabels != null ) {
171
+ for (NodeIterator children = aggregate .iterator (); children .hasNext ();) {
172
+ Resource child = children .nextNode ().asResource ();
173
+ String location = child .getProperty (fileLocation ).getString ();
174
+ String filename = location .substring ("data/" .length ()).toLowerCase ();
175
+ if (fileLC2supplementLabels .containsKey (filename )) {
176
+ model .add (child , dprop (model , DepositRelationship .label ), fileLC2supplementLabels .get (filename ));
177
+ } else {
178
+ model .add (child , dprop (model , DepositRelationship .label ), filename );
153
179
}
154
180
}
155
- } catch (Exception e ) {
156
- failJob (e , "Cannot extract metadata from BioMed Central article XML." );
157
181
}
182
+ } catch (Exception e ) {
183
+ failJob (e , "Cannot extract metadata from BioMed Central article XML." );
158
184
}
159
185
160
186
recordDepositEvent (Type .NORMALIZATION , "Normalized BioMed Central article as aggregate with extracted description" );
161
187
}
162
-
188
+
163
189
private void setSourceMetadata (Model model , Resource primaryResource , String path ) {
164
190
// Add the data file as a metadata datastream of the primary object
165
191
PID sourceMDPID = new PID (primaryResource .getURI () + "/" + MD_SOURCE .getName ());
0 commit comments