Skip to content

Commit 66c1e05

Browse files
committed
Merge pull request #353 from UNC-Libraries/biomed-titles
Biomed title assignment issues
2 parents ae88961 + 03599f9 commit 66c1e05

File tree

1 file changed

+88
-62
lines changed

1 file changed

+88
-62
lines changed

deposit/src/main/java/edu/unc/lib/deposit/normalize/BioMedCentralExtrasJob.java

+88-62
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
import java.util.NoSuchElementException;
1717

1818
import org.jdom2.Document;
19+
import org.jdom2.Element;
1920
import org.jdom2.input.SAXBuilder;
2021
import org.jdom2.input.sax.XMLReaders;
2122
import org.jdom2.output.Format;
@@ -33,6 +34,7 @@
3334
import edu.unc.lib.deposit.work.AbstractDepositJob;
3435
import edu.unc.lib.dl.fedora.PID;
3536
import edu.unc.lib.dl.util.ContentModelHelper;
37+
import edu.unc.lib.dl.util.ContentModelHelper.CDRProperty;
3638
import edu.unc.lib.dl.util.ContentModelHelper.DepositRelationship;
3739
import edu.unc.lib.dl.util.DepositConstants;
3840
import edu.unc.lib.dl.util.PremisEventLogger.Type;
@@ -65,6 +67,7 @@ public void runJob() {
6567
Property hasModel = model.createProperty(ContentModelHelper.FedoraProperty.hasModel.getURI().toString());
6668
Resource aggregateModel = model.createProperty(ContentModelHelper.Model.AGGREGATE_WORK.getURI().toString());
6769
Property fileLocation = model.createProperty(ContentModelHelper.DepositRelationship.stagingLocation.toString());
70+
Property mimetype = model.createProperty(ContentModelHelper.DepositRelationship.mimetype.toString());
6871

6972
// Find the aggregate object's resource
7073
Bag aggregate = null;
@@ -79,87 +82,110 @@ public void runJob() {
7982
failJob(e, "Cannot find top contents of deposit.");
8083
}
8184
log.debug("identified aggregate {}", aggregate);
82-
83-
// structure aggregate work, find the article XML
84-
String articleXMLPath = null;
85-
for(NodeIterator children = aggregate.iterator(); children.hasNext();) {
86-
Resource child = children.nextNode().asResource();
87-
String location = child.getProperty(fileLocation).getString();
88-
log.debug("examining child location {}", location);
89-
if(location.matches("data/[\\w\\-]+\\-S\\d+\\.\\w+$")) continue;
90-
if(location.matches("data/[\\w\\-]+\\.[xX][mM][lL]$")) {
91-
log.debug("Found primary Biomed XML document {}", location);
92-
articleXMLPath = location;
93-
94-
// Remove the article xml as a supplemental file
95-
child.removeProperties();
96-
StmtIterator sIt = model.listStatements(aggregate, null, child);
97-
aggregate.remove(sIt.nextStatement());
98-
sIt.close();
99-
} else {
85+
86+
// Disable DTD validation of the article xml
87+
SAXBuilder sb = new SAXBuilder(XMLReaders.NONVALIDATING);
88+
sb.setFeature("http://xml.org/sax/features/validation", false);
89+
sb.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
90+
sb.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
91+
92+
Document articleDocument = null;
93+
String articleId = null;
94+
95+
try {
96+
// Search through the incoming children files to find the primary article XML document
97+
for(NodeIterator children = aggregate.iterator(); children.hasNext();) {
98+
Resource child = children.nextNode().asResource();
99+
String location = child.getProperty(fileLocation).getString();
100+
if(location.matches("data/[\\w\\-]+\\.[xX][mM][lL]$")) {
101+
File articleXMLFile = new File(getDepositDirectory(), location);
102+
articleDocument = sb.build(articleXMLFile);
103+
104+
Element articleEl = articleDocument.getRootElement();
105+
// Store the identifier for this article to track down the primary file
106+
articleId = articleEl.getChildText("ui");
107+
108+
if (!"art".equals(articleEl.getName()) || articleId == null) {
109+
// False alarm, this is a supplemental xml file
110+
continue;
111+
}
112+
113+
log.debug("Found primary Biomed XML document {}", location);
114+
// Assign the article xml as a source metadata datastream
115+
setSourceMetadata(model, aggregate, location);
116+
// Remove the article xml as a supplemental file
117+
child.removeProperties();
118+
StmtIterator sIt = model.listStatements(aggregate, null, child);
119+
aggregate.remove(sIt.nextStatement());
120+
sIt.close();
121+
122+
break;
123+
}
124+
}
125+
126+
if (articleDocument == null || articleId == null) {
127+
failJob("Invalid BioMed package, could not locate the primary article XML document or its identifer.",
128+
null);
129+
}
130+
131+
// Find the main article file
132+
for (NodeIterator children = aggregate.iterator(); children.hasNext();) {
133+
Resource child = children.nextNode().asResource();
134+
String location = child.getProperty(fileLocation).getString();
135+
String mimetypeValue = child.getProperty(mimetype).getString();
136+
// filename will be the article ID, but not XML
137+
if("text/xml".equals(mimetypeValue) || location.indexOf(articleId + ".") == -1) continue;
138+
100139
log.debug("Found primary Biomed content document {}", location);
101140
// If this is a main object, then designate it as a default web object for its parent container
102-
Property defaultObject = model.getProperty(ContentModelHelper.CDRProperty.defaultWebObject.getURI().toString());
141+
Property defaultObject = model.getProperty(CDRProperty.defaultWebObject.getURI().toString());
103142
model.add(aggregate, defaultObject, child);
104143
}
105-
}
106-
107-
if (articleXMLPath != null) {
108-
// Assign the article xml as a source metadata datastream
109-
setSourceMetadata(model, aggregate, articleXMLPath);
110144

111145
// Build the descriptive MODS document from the article XML and any existing MODS
112-
File articleXMLFile = new File(getDepositDirectory(), articleXMLPath);
113146
PID aggregatePID = new PID(aggregate.getURI());
114147
File modsFile = new File(getDescriptionDir(), aggregatePID.getUUID()+".xml");
115148

116-
try {
117-
// Disable DTD validation of the article xml
118-
SAXBuilder sb = new SAXBuilder(XMLReaders.NONVALIDATING);
119-
sb.setFeature("http://xml.org/sax/features/validation", false);
120-
sb.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
121-
sb.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
122-
123-
Document existingModsDocument = null;
124-
// Start from an existing MODS document if there is one
125-
if (modsFile.exists()) {
126-
existingModsDocument = sb.build(modsFile);
127-
} else {
128-
// Make sure the description directory exists since there was no MODS doc
129-
File descriptionDir = new File(getDepositDirectory(), DepositConstants.DESCRIPTION_DIR);
130-
if (!descriptionDir.exists())
131-
descriptionDir.mkdir();
132-
}
149+
Document existingModsDocument = null;
150+
// Start from an existing MODS document if there is one
151+
if (modsFile.exists()) {
152+
existingModsDocument = sb.build(modsFile);
153+
} else {
154+
// Make sure the description directory exists since there was no MODS doc
155+
File descriptionDir = new File(getDepositDirectory(), DepositConstants.DESCRIPTION_DIR);
156+
if (!descriptionDir.exists())
157+
descriptionDir.mkdir();
158+
}
133159

134-
Document articleDocument = sb.build(articleXMLFile);
135-
BioMedArticleHelper biohelper = new BioMedArticleHelper();
136-
Document mods = biohelper.extractMODS(articleDocument, existingModsDocument);
137-
Map<String, String> fileLC2supplementLabels = biohelper.getFilesLC2SupplementLabels(articleDocument);
160+
BioMedArticleHelper biohelper = new BioMedArticleHelper();
161+
Document mods = biohelper.extractMODS(articleDocument, existingModsDocument);
162+
Map<String, String> fileLC2supplementLabels = biohelper.getFilesLC2SupplementLabels(articleDocument);
138163

139-
// Output the new MODS file, overwriting the existing one if it was present
140-
try (FileOutputStream out = new FileOutputStream(modsFile, false)) {
141-
new XMLOutputter(Format.getPrettyFormat()).output(mods, out);
142-
}
164+
// Output the new MODS file, overwriting the existing one if it was present
165+
try (FileOutputStream out = new FileOutputStream(modsFile, false)) {
166+
new XMLOutputter(Format.getPrettyFormat()).output(mods, out);
167+
}
143168

144-
// Label the supplemental files with values from the article xml
145-
if (fileLC2supplementLabels != null) {
146-
for (NodeIterator children = aggregate.iterator(); children.hasNext();) {
147-
Resource child = children.nextNode().asResource();
148-
String location = child.getProperty(fileLocation).getString();
149-
String filename = location.substring("data/".length()).toLowerCase();
150-
if (fileLC2supplementLabels.containsKey(filename)) {
151-
model.add(child, dprop(model, DepositRelationship.label), fileLC2supplementLabels.get(filename));
152-
}
169+
// Label the supplemental files with values from the article xml
170+
if (fileLC2supplementLabels != null) {
171+
for (NodeIterator children = aggregate.iterator(); children.hasNext();) {
172+
Resource child = children.nextNode().asResource();
173+
String location = child.getProperty(fileLocation).getString();
174+
String filename = location.substring("data/".length()).toLowerCase();
175+
if (fileLC2supplementLabels.containsKey(filename)) {
176+
model.add(child, dprop(model, DepositRelationship.label), fileLC2supplementLabels.get(filename));
177+
} else {
178+
model.add(child, dprop(model, DepositRelationship.label), filename);
153179
}
154180
}
155-
} catch (Exception e) {
156-
failJob(e, "Cannot extract metadata from BioMed Central article XML.");
157181
}
182+
} catch (Exception e) {
183+
failJob(e, "Cannot extract metadata from BioMed Central article XML.");
158184
}
159185

160186
recordDepositEvent(Type.NORMALIZATION, "Normalized BioMed Central article as aggregate with extracted description");
161187
}
162-
188+
163189
private void setSourceMetadata(Model model, Resource primaryResource, String path) {
164190
// Add the data file as a metadata datastream of the primary object
165191
PID sourceMDPID = new PID(primaryResource.getURI() + "/" + MD_SOURCE.getName());

0 commit comments

Comments
 (0)