Skip to content

Commit 5350cf0

Browse files
authored
Adjust FITS mimetype conflict resolution to rank the returned options and filter out invalid results. Use a temp dir for storing symlinks, since nano time can very occasionally produce the same name. Move symlinks into the deposit directory so that FITS will have read permission (#1745)
1 parent 46a2151 commit 5350cf0

File tree

4 files changed

+125
-23
lines changed

4 files changed

+125
-23
lines changed

Diff for: deposit-app/src/main/java/edu/unc/lib/boxc/deposit/validate/ExtractTechnicalMetadataJob.java

+32-21
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import edu.unc.lib.boxc.deposit.work.AbstractConcurrentDepositJob;
77
import edu.unc.lib.boxc.deposit.work.JobFailedException;
88
import edu.unc.lib.boxc.deposit.work.JobInterruptedException;
9+
import edu.unc.lib.boxc.model.api.exceptions.RepositoryException;
910
import edu.unc.lib.boxc.model.api.ids.PID;
1011
import edu.unc.lib.boxc.model.api.rdf.CdrDeposit;
1112
import edu.unc.lib.boxc.model.fcrepo.ids.DatastreamPids;
@@ -43,13 +44,15 @@
4344
import java.nio.file.Paths;
4445
import java.util.ArrayList;
4546
import java.util.Arrays;
47+
import java.util.Comparator;
4648
import java.util.HashSet;
4749
import java.util.Iterator;
4850
import java.util.List;
4951
import java.util.Map.Entry;
5052
import java.util.Objects;
5153
import java.util.Set;
5254
import java.util.regex.Pattern;
55+
import java.util.stream.Collectors;
5356

5457
import static edu.unc.lib.boxc.common.xml.SecureXMLFactory.createSAXBuilder;
5558
import static edu.unc.lib.boxc.model.api.rdf.CdrDeposit.mimetype;
@@ -73,7 +76,7 @@ public class ExtractTechnicalMetadataJob extends AbstractConcurrentDepositJob {
7376

7477
private static final String FITS_SINGLE_STATUS = "SINGLE_RESULT";
7578
private static final String FITS_EXAMINE_PATH = "examine";
76-
private static final Path TMP_PATH = Paths.get(System.getProperty("java.io.tmpdir"));
79+
private static final String MIMETYPE_ATTR = "mimetype";
7780

7881
private CloseableHttpClient httpClient;
7982

@@ -215,10 +218,12 @@ public void run() {
215218

216219
// Symlink the file before processing
217220
Path linkPath = makeSymlinkForStagedPath(stagedPath, providedLabel);
218-
// Generate the FITS report as a document
219-
Document fitsDoc = getFitsDocument(objPid, linkPath);
220221

222+
Document fitsDoc = null;
221223
try {
224+
// Generate the FITS report as a document
225+
fitsDoc = getFitsDocument(objPid, linkPath);
226+
222227
// Create the PREMIS report wrapper for the FITS results
223228
Document premisDoc = generatePremisReport(objPid, fitsDoc);
224229
Element premisObjCharsEl = getObjectCharacteristics(premisDoc);
@@ -234,11 +239,11 @@ public void run() {
234239
writePremisReport(objPid, premisDoc);
235240

236241
receiveResult(result);
237-
} catch (JobFailedException | JobInterruptedException e) {
242+
} catch (JobFailedException | JobInterruptedException | RepositoryException e) {
238243
throw e;
239244
} catch (Exception e) {
240245
failJob(e, "Failed to extract FITS details for file '{0}' with id {1} from document:\n{2}",
241-
stagedPath, objPid.getId(), getXMLOutputter().outputString(fitsDoc));
246+
stagedPath, objPid.getId(), fitsDoc != null ? getXMLOutputter().outputString(fitsDoc) : "null");
242247
} finally {
243248
try {
244249
Files.delete(linkPath);
@@ -263,7 +268,7 @@ private void addFileIdentification(Document fitsDoc, Element premisObjCharsEl) {
263268
String fitsMimetype = null;
264269
String format;
265270
if (identity != null) {
266-
fitsMimetype = identity.getAttributeValue("mimetype");
271+
fitsMimetype = identity.getAttributeValue(MIMETYPE_ATTR);
267272
format = identity.getAttributeValue("format");
268273
} else {
269274
format = "Unknown";
@@ -347,15 +352,18 @@ protected Path makeSymlinkForStagedPath(String stagedUriString, String label) {
347352
// Resolve the path from a URI and make it absolute
348353
URI stagedUri = URI.create(stagedUriString);
349354
Path stagedPath;
355+
File depositDirectory = getDepositDirectory();
350356
if (!stagedUri.isAbsolute()) {
351-
stagedPath = Paths.get(getDepositDirectory().toString(), stagedUriString);
357+
stagedPath = Paths.get(depositDirectory.toString(), stagedUriString);
352358
} else {
353359
stagedPath = Paths.get(stagedUri);
354360
}
355361
try {
362+
// Create a unique parent directory for the symlink to avoid filename conflicts
363+
var parentDir = Files.createTempDirectory(depositDirectory.toPath(), "fits_staging");
364+
// Assign the same permissions as the parent directory to the temp dir, since createTempDirectory creates it with restrictive permissions
365+
Files.setPosixFilePermissions(parentDir, Files.getPosixFilePermissions(parentDir.getParent()));
356366
// Create a symlink to the file to make use of the original filename and avoid issues with non-ascii characters
357-
var parentDir = TMP_PATH.resolve(Long.toString(System.nanoTime()));
358-
Files.createDirectories(parentDir);
359367
String symlinkName = label != null ? label : stagedPath.getFileName().toString();
360368
var linkPath = sanitizeCliPath(parentDir.resolve(symlinkName));
361369
Files.createSymbolicLink(linkPath, stagedPath);
@@ -511,19 +519,22 @@ private Element getFitsIdentificationInformation(Document fitsDoc) {
511519
return null;
512520
}
513521

514-
// Conflicting identification from FITS, try to resolve
515-
// Don't trust Exiftool if it detects a symlink, which it does not follow to the file.
516-
// Trust any answer agreed on by multiple tools
517-
for (Element el : identification.getChildren("identity", FITS_NS)) {
518-
if (el.getChildren("tool", FITS_NS).size() > 1
519-
|| !("Exiftool".equals(el.getChild("tool", FITS_NS).getAttributeValue("toolname"))
520-
&& "application/x-symlink".equals(el.getAttributeValue("mimetype")))) {
521-
return el;
522-
}
523-
}
522+
// Sort the identification elements to find the best value returned by FITS
523+
var identityEls = identification.getChildren("identity", FITS_NS).stream()
524+
// Filter out any invalid entries
525+
.filter(el -> MimetypeHelpers.isValidMimetype(el.getAttributeValue(MIMETYPE_ATTR)))
526+
// Primarily sort by the best ranking mimetype
527+
.sorted(Comparator.comparingInt((Element el) -> rankMimetype(el.getAttributeValue(MIMETYPE_ATTR)))
528+
// Then rank by the number of tools that agreed on the mimetype
529+
.thenComparingInt(el -> el.getChildren("tool", FITS_NS).size())
530+
// Reverse so both rank and tool count are in descending order
531+
.reversed()
532+
// And then favor more application specific mimetypes
533+
.thenComparingInt(el -> el.getAttributeValue(MIMETYPE_ATTR).contains("x-") ? -1 : 0))
534+
.collect(Collectors.toList());
535+
// Return the best ranking identification, or null if none are valid
536+
return identityEls.isEmpty() ? null : identityEls.get(0);
524537
}
525-
526-
return null;
527538
}
528539

529540
private int rankMimetype(String mimetype) {

Diff for: deposit-app/src/test/java/edu/unc/lib/boxc/deposit/validate/ExtractTechnicalMetadataJobTest.java

+15-1
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,20 @@ public void exifSymlinkConflictMimetypeTest() throws Exception {
227227
verifyFileResults(filePid, CONFLICT_MIMETYPE, CONFLICT_FORMAT, CONFLICT_MD5, 1);
228228
}
229229

230+
@Test
231+
public void multiRankingSpecificityConflictMimetypeTest() throws Exception {
232+
respondWithFile("/fitsReports/conflictRankingReport.xml");
233+
234+
// Providing octet stream mimetype to be overridden
235+
PID filePid = addFileObject(depositBag, CONFLICT_FILEPATH, OCTET_MIMETYPE, null);
236+
job.closeModel();
237+
238+
job.run();
239+
240+
verifyRequestParameters(CONFLICT_FILEPATH);
241+
verifyFileResults(filePid, "image/x-nikon-nef", "NEF EXIF", CONFLICT_MD5, 1);
242+
}
243+
230244
@Test
231245
public void exifMimetypeTest() throws Exception {
232246
respondWithFile("/fitsReports/exifReport.xml");
@@ -568,7 +582,7 @@ private void verifyRequestParameters(String expectedFilepath) throws Exception {
568582
String submittedPath = getSubmittedFilePath(request);
569583

570584
String failMessage = "FITS service called with wrong path. Expected " + expectedFilepath + " but got " + submittedPath;
571-
assertTrue(submittedPath.startsWith(TMP_PATH.toString().replace("/", "%2F")), failMessage);
585+
assertTrue(submittedPath.startsWith(job.getDepositDirectory().toString().replace("/", "%2F")), failMessage);
572586
assertTrue(submittedPath.endsWith("%2F" + Paths.get(expectedFilepath).getFileName()), failMessage);
573587
}
574588

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="1.6.0" timestamp="2/19/24 11:32 AM">
3+
<identification status="CONFLICT">
4+
<identity format="symbolic link to `/path/to/2217c4ce-9ab7-409a-9538-ef961ba119cf/data/_e5c9e83b-0427-4ec3-bb0f-fd24a992a56e'" mimetype="inode/symlink" toolname="FITS" toolversion="1.6.0">
5+
<tool toolname="file utility" toolversion="5.11" />
6+
</identity>
7+
<identity format="Text Plain" mimetype="text/plain" toolname="FITS" toolversion="1.6.0">
8+
<tool toolname="Droid" toolversion="6.1.5" />
9+
</identity>
10+
<identity format="Tagged Image File Format" mimetype="image/tiff" toolname="FITS" toolversion="1.6.0">
11+
<tool toolname="Tika" toolversion="2.6.0" />
12+
</identity>
13+
<identity format="NEF EXIF" mimetype="image/x-nikon-nef" toolname="FITS" toolversion="1.6.0">
14+
<tool toolname="Exiftool" toolversion="12.50" />
15+
</identity>
16+
</identification>
17+
<fileinfo>
18+
<lastmodified toolname="Exiftool" toolversion="12.50">2017-08-22T17:00:41</lastmodified>
19+
<created toolname="Exiftool" toolversion="12.50">2017-08-22T17:00:41</created>
20+
<filepath toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">/path/to/temp/9064692952992/20170822_068.nef</filepath>
21+
<filename toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">20170822_068.nef</filename>
22+
<size toolname="OIS File Information" toolversion="1.0">13887221</size>
23+
<md5checksum toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">238b8c4a61fda89ef829c6b7d69b57fa</md5checksum>
24+
<fslastmodified toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">1717086826457</fslastmodified>
25+
</fileinfo>
26+
<filestatus />
27+
<metadata>
28+
<image>
29+
<compressionScheme toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">JPEG</compressionScheme>
30+
<imageWidth toolname="Exiftool" toolversion="12.50">4304</imageWidth>
31+
<imageHeight toolname="Exiftool" toolversion="12.50">2864</imageHeight>
32+
<colorSpace toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">RGB</colorSpace>
33+
<referenceBlackWhite toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0 255 0 255 0 255</referenceBlackWhite>
34+
<YCbCrPositioning toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">2</YCbCrPositioning>
35+
<orientation toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">normal*</orientation>
36+
<samplingFrequencyUnit toolname="Exiftool" toolversion="12.50">in.</samplingFrequencyUnit>
37+
<xSamplingFrequency toolname="Exiftool" toolversion="12.50">300</xSamplingFrequency>
38+
<ySamplingFrequency toolname="Exiftool" toolversion="12.50">300</ySamplingFrequency>
39+
<bitsPerSample toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">8 8 8</bitsPerSample>
40+
<samplesPerPixel toolname="Exiftool" toolversion="12.50" status="CONFLICT">3</samplesPerPixel>
41+
<samplesPerPixel toolname="Tika" toolversion="2.6.0" status="CONFLICT">1</samplesPerPixel>
42+
<captureDevice toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">digital still camera</captureDevice>
43+
<digitalCameraManufacturer toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">NIKON CORPORATION</digitalCameraManufacturer>
44+
<digitalCameraModelName toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">NIKON D500</digitalCameraModelName>
45+
<scanningSoftwareName toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Ver.1.13</scanningSoftwareName>
46+
<fNumber toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">3.5</fNumber>
47+
<exposureTime toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0.002</exposureTime>
48+
<exposureProgram toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Manual</exposureProgram>
49+
<isoSpeedRating toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">6400</isoSpeedRating>
50+
<exposureBiasValue toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0</exposureBiasValue>
51+
<maxApertureValue toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">2.8</maxApertureValue>
52+
<meteringMode toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Pattern</meteringMode>
53+
<lightSource toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">unknown</lightSource>
54+
<flash toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Flash did not fire</flash>
55+
<focalLength toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">200.0</focalLength>
56+
<sensingMethod toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">One-chip color area sensor</sensingMethod>
57+
<cfaPattern toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">[Red,Green][Green,Blue]</cfaPattern>
58+
<cfaPattern2 toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0 1 1 2</cfaPattern2>
59+
<gpsVersionID toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">2.3.0.0</gpsVersionID>
60+
</image>
61+
</metadata>
62+
<statistics fitsExecutionTime="8234">
63+
<tool toolname="MediaInfo" toolversion="23.09" status="did not run" />
64+
<tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
65+
<tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
66+
<tool toolname="VTT Tool" toolversion="0.1" status="did not run" />
67+
<tool toolname="Droid" toolversion="6.5.2" executionTime="5814" />
68+
<tool toolname="jpylyzer" toolversion="2.1.0" status="did not run" />
69+
<tool toolname="embARC" toolversion="0.2" status="did not run" />
70+
<tool toolname="file utility" toolversion="5.11" executionTime="8031" />
71+
<tool toolname="Exiftool" toolversion="12.50" executionTime="8053" />
72+
<tool toolname="NLNZ Metadata Extractor" toolversion="3.6GA" status="did not run" />
73+
<tool toolname="OIS File Information" toolversion="1.0" executionTime="5717" />
74+
<tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
75+
<tool toolname="Tika" toolversion="2.6.0" executionTime="7168" />
76+
</statistics>
77+
</fits>

Diff for: deposit-app/src/test/resources/fitsReports/conflictTypeReport.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<tool toolname="Exiftool" toolversion="10.00" />
99
<externalIdentifier toolname="Droid" toolversion="6.1.5" type="puid">fmt/141</externalIdentifier>
1010
</identity>
11-
<identity format="WAVE" mimetype="audio/x-wave" toolname="FITS" toolversion="1.0.5">
11+
<identity format="WAVERLY" mimetype="audio/x-waverly" toolname="FITS" toolversion="1.0.5">
1212
<tool toolname="Jhove" toolversion="1.11" />
1313
</identity>
1414
</identification>

0 commit comments

Comments
 (0)