Skip to content

Commit 1b9fee9

Browse files
committed
Update FITS job to always use a symlink before processing, and use the provided label for the filename if present. Files with unicode characters can be processed by the FITS web app now since the paths get sanitized there now too
1 parent f933e7c commit 1b9fee9

File tree

2 files changed

+106
-66
lines changed

2 files changed

+106
-66
lines changed

deposit-app/src/main/java/edu/unc/lib/boxc/deposit/validate/ExtractTechnicalMetadataJob.java

Lines changed: 57 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import edu.unc.lib.boxc.deposit.work.JobFailedException;
88
import edu.unc.lib.boxc.deposit.work.JobInterruptedException;
99
import edu.unc.lib.boxc.model.api.ids.PID;
10+
import edu.unc.lib.boxc.model.api.rdf.CdrDeposit;
1011
import edu.unc.lib.boxc.model.fcrepo.ids.DatastreamPids;
1112
import edu.unc.lib.boxc.model.fcrepo.ids.PIDs;
1213
import org.apache.commons.io.FilenameUtils;
@@ -124,8 +125,10 @@ public void runJob() {
124125
String stagedPath = stagedPair.getValue();
125126
PID originalPid = DatastreamPids.getOriginalFilePid(objPid);
126127
final String providedMimetype = getProvidedMimetype(originalPid, model);
128+
final String providedLabel = getProvidedLabel(objPid, model);
127129

128-
submitTask(new ExtractTechnicalMetadataRunnable(objPid, originalPid, stagedPath, providedMimetype));
130+
submitTask(new ExtractTechnicalMetadataRunnable(objPid, originalPid, stagedPath,
131+
providedMimetype, providedLabel));
129132
}
130133

131134
waitForCompletion();
@@ -144,6 +147,16 @@ private String getProvidedMimetype(PID originalPid, Model model) {
144147
}
145148
}
146149

150+
private String getProvidedLabel(PID filePid, Model model) {
151+
Resource fileResc = model.getResource(filePid.getRepositoryPath());
152+
Statement labelStmt = fileResc.getProperty(CdrDeposit.label);
153+
if (labelStmt != null) {
154+
return labelStmt.getString();
155+
} else {
156+
return null;
157+
}
158+
}
159+
147160
@Override
148161
protected void registrationAction() {
149162
List<Object> results = new ArrayList<>();
@@ -177,14 +190,16 @@ private class ExtractTechnicalMetadataRunnable implements Runnable {
177190
private PID originalPid;
178191
private String stagedPath;
179192
private String providedMimetype;
193+
private String providedLabel;
180194
private ExtractTechnicalMetadataResult result = new ExtractTechnicalMetadataResult();
181195

182196
public ExtractTechnicalMetadataRunnable(PID objPid, PID originalPid, String stagedPath,
183-
String providedMimetype) {
197+
String providedMimetype, String providedLabel) {
184198
this.objPid = objPid;
185199
this.originalPid = originalPid;
186200
this.stagedPath = stagedPath;
187201
this.providedMimetype = providedMimetype;
202+
this.providedLabel = providedLabel;
188203
result.objPid = objPid;
189204
result.originalPid = originalPid;
190205
result.hasProvidedMimetype = providedMimetype != null;
@@ -198,8 +213,10 @@ public void run() {
198213

199214
interruptJobIfStopped();
200215

216+
// Symlink the file before processing
217+
Path linkPath = makeSymlinkForStagedPath(stagedPath, providedLabel);
201218
// Generate the FITS report as a document
202-
Document fitsDoc = getFitsDocument(objPid, stagedPath);
219+
Document fitsDoc = getFitsDocument(objPid, linkPath);
203220

204221
try {
205222
// Create the PREMIS report wrapper for the FITS results
@@ -222,6 +239,13 @@ public void run() {
222239
} catch (Exception e) {
223240
failJob(e, "Failed to extract FITS details for file '{0}' with id {1} from document:\n{2}",
224241
stagedPath, objPid.getId(), getXMLOutputter().outputString(fitsDoc));
242+
} finally {
243+
try {
244+
Files.delete(linkPath);
245+
Files.delete(linkPath.getParent());
246+
} catch (IOException e) {
247+
log.warn("Failed to delete symlink", e);
248+
}
225249
}
226250
}
227251

@@ -261,7 +285,6 @@ private void addFileIdentification(Document fitsDoc, Element premisObjCharsEl) {
261285
* Overrides the mimetype for this object in the deposit model when the FITS
262286
* generated value is preferred.
263287
*
264-
* @param objResc
265288
* @param fitsExtractMimetype
266289
*/
267290
private void overrideDepositMimetype(String fitsExtractMimetype) {
@@ -301,31 +324,48 @@ private void overrideDepositMimetype(String fitsExtractMimetype) {
301324
* XML document
302325
*
303326
* @param objPid
327+
* @param filePath
328+
* @return
329+
*/
330+
private Document getFitsDocument(PID objPid, Path filePath) {
331+
if (shouldProcessWithWebService(filePath)) {
332+
return extractUsingWebService(objPid, filePath);
333+
} else {
334+
return extractUsingCLI(objPid, filePath);
335+
}
336+
}
337+
338+
/**
339+
* Creates a symlink to the provided stagedUri, where the symlink is sanitized of problematic characters
340+
* and uses the label as the filename to ensure the original file extensions is present, if available.
341+
* @param objPid
304342
* @param stagedUriString
343+
* @param label
305344
* @return
306345
*/
307-
private Document getFitsDocument(PID objPid, String stagedUriString) {
346+
protected Path makeSymlinkForStagedPath(String stagedUriString, String label) {
347+
// Resolve the path from a URI and make it absolute
308348
URI stagedUri = URI.create(stagedUriString);
309349
Path stagedPath;
310350
if (!stagedUri.isAbsolute()) {
311351
stagedPath = Paths.get(getDepositDirectory().toString(), stagedUriString);
312352
} else {
313353
stagedPath = Paths.get(stagedUri);
314354
}
315-
316-
if (shouldProcessWithWebService(stagedPath)) {
317-
return extractUsingWebService(objPid, stagedPath);
318-
} else {
319-
return extractUsingCLI(objPid, stagedPath);
355+
try {
356+
// Create a symlink to the file to make use of the original filename and avoid issues with non-ascii characters
357+
var parentDir = TMP_PATH.resolve(Long.toString(System.nanoTime()));
358+
Files.createDirectories(parentDir);
359+
String symlinkName = label != null ? label : stagedPath.getFileName().toString();
360+
var linkPath = sanitizeCliPath(parentDir.resolve(symlinkName));
361+
Files.createSymbolicLink(linkPath, stagedPath);
362+
return linkPath;
363+
} catch (IOException e) {
364+
throw new JobFailedException("Failed to create symlink for file " + stagedPath, e);
320365
}
321366
}
322367

323368
private boolean shouldProcessWithWebService(Path path) {
324-
// FITS cannot currently handle file paths that contain unicode characters
325-
if (!CharMatcher.ascii().matchesAllOf(path.toString())) {
326-
log.debug("File {} not applicable for web service due to unacceptable characters", path);
327-
return false;
328-
}
329369
String filename = path.getFileName().toString();
330370
String extension = FilenameUtils.getExtension(filename).toLowerCase();
331371
if (FILE_EXTS_FOR_CLI.contains(extension)) {
@@ -392,35 +432,9 @@ protected Path sanitizeCliPath(Path stagedPath) {
392432
return Paths.get(path);
393433
}
394434

395-
/**
396-
* Symlink the provided file into the temp directory, inside of parent directory based on the id of the object.
397-
* Filename of the symlink is based off the sanitized form of the path.
398-
* @param objPid
399-
* @param sanitizedPath Sanitized version of the staging path, does not need to resolve to a file
400-
* @param stagedPath Original staging path of the file, must resolve to the file being linked
401-
* @return Symlink path
402-
* @throws IOException
403-
*/
404-
protected Path symlinkFile(PID objPid, Path sanitizedPath, Path stagedPath) throws IOException {
405-
var parentDir = TMP_PATH.resolve(objPid.getId());
406-
Files.createDirectories(parentDir);
407-
var linkPath = parentDir.resolve(sanitizedPath.getFileName());
408-
Files.createSymbolicLink(linkPath, stagedPath);
409-
return linkPath;
410-
}
411-
412-
private Document extractUsingCLI(PID objPid, Path stagedPath) {
435+
private Document extractUsingCLI(PID objPid, Path targetPath) {
413436
String stdout = null;
414-
Path fileLink = null;
415437
try {
416-
Path targetPath = stagedPath;
417-
var sanitizedPath = sanitizeCliPath(stagedPath);
418-
// Create a symlink to the file to avoid problems with non-ascii characters and reserved linux characters
419-
// otherwise there will be encoding mismatches between boxc, the terminal, and FITS.
420-
if (!stagedPath.equals(sanitizedPath)) {
421-
fileLink = symlinkFile(objPid, sanitizedPath, stagedPath);
422-
targetPath = fileLink;
423-
}
424438
String[] command = new String[] { fitsCommandPath.toString(), "-i", targetPath.toString() };
425439
Process process = Runtime.getRuntime().exec(command);
426440
int exitCode = process.waitFor();
@@ -434,17 +448,7 @@ private Document extractUsingCLI(PID objPid, Path stagedPath) {
434448
return createSAXBuilder().build(new ByteArrayInputStream(stdout.getBytes(UTF_8)));
435449
} catch (IOException | JDOMException | InterruptedException e) {
436450
failJob(e, "Failed to generate report for file {0} with id {1}, output was:\n{2}",
437-
stagedPath, objPid.getId(), stdout);
438-
} finally {
439-
// Cleanup symlink and parent directory containing symlink, if a symlink was used
440-
if (fileLink != null) {
441-
try {
442-
Files.delete(fileLink);
443-
Files.delete(fileLink.getParent());
444-
} catch (IOException e) {
445-
log.warn("Failed to cleanup symlink", e);
446-
}
447-
}
451+
targetPath, objPid.getId(), stdout);
448452
}
449453
return null;
450454
}

deposit-app/src/test/java/edu/unc/lib/boxc/deposit/validate/ExtractTechnicalMetadataJobTest.java

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ public class ExtractTechnicalMetadataJobTest extends AbstractDepositJobTest {
9898
private final static String UNKNOWN_MD5 = "2748ba561254b629c2103cb2e1be3fc2";
9999
private final static String UNKNOWN_FORMAT = "Unknown";
100100

101+
private static final Path TMP_PATH = Paths.get(System.getProperty("java.io.tmpdir"));
102+
101103
@Mock
102104
private CloseableHttpClient httpClient;
103105
@Mock
@@ -429,14 +431,39 @@ public void unicodeFilenameTest() throws Exception {
429431
Bag workBag = model.createBag(workPid.getRepositoryPath());
430432
workBag.addProperty(RDF.type, Cdr.Work);
431433
depositBag.add(workBag);
432-
String filename = "weird\uD83D\uDC7D.txt";
433-
File sourceFile = tmpFolder.resolve(filename).toFile();
434-
PID filePid = addFileObject(workBag, sourceFile.getAbsolutePath(), IMAGE_MIMETYPE, IMAGE_MD5);
434+
String filename = "weird\uD83D\uDC7D.mov";
435+
Path sourcePath = tmpFolder.resolve(filename);
436+
Files.createFile(sourcePath);
437+
PID filePid = addFileObject(workBag, sourcePath.toFile().toURI().toString(), IMAGE_MIMETYPE, IMAGE_MD5);
438+
439+
job.closeModel();
440+
441+
job.run();
442+
443+
verifyFileResults(filePid, IMAGE_MIMETYPE, IMAGE_FORMAT, IMAGE_MD5, 1);
444+
}
445+
446+
@Test
447+
public void filenameFromLabelTest() throws Exception {
448+
respondWithFile("/fitsReports/imageReport.xml");
449+
450+
// Create the work object which nests the file
451+
PID workPid = makePid(RepositoryPathConstants.CONTENT_BASE);
452+
Bag workBag = model.createBag(workPid.getRepositoryPath());
453+
workBag.addProperty(RDF.type, Cdr.Work);
454+
depositBag.add(workBag);
455+
String filename = "ambiguous_file_name";
456+
Path sourcePath = tmpFolder.resolve(filename);
457+
Files.createFile(sourcePath);
458+
PID filePid = addFileObject(workBag, sourcePath.toFile().toURI().toString(), IMAGE_MIMETYPE, IMAGE_MD5);
459+
Resource fileResc = model.getResource(filePid.getRepositoryPath());
460+
fileResc.addProperty(CdrDeposit.label, "boxys_favorite_file.nef");
435461

436462
job.closeModel();
437463

438464
job.run();
439465

466+
verifyRequestParameters("boxys_favorite_file.nef");
440467
verifyFileResults(filePid, IMAGE_MIMETYPE, IMAGE_FORMAT, IMAGE_MD5, 1);
441468
}
442469

@@ -503,15 +530,23 @@ public void sanitizeCliPathReservedTest() throws Exception {
503530
}
504531

505532
@Test
506-
public void symlinkFileTest() throws Exception {
507-
PID pid = makePid();
508-
var originalPath = tmpFolder.resolve("file.txt");
509-
var sanitizedPath = tmpFolder.resolve("fi_l_e.txt");
510-
var result = job.symlinkFile(pid, sanitizedPath, originalPath);
511-
512-
assertEquals("fi_l_e.txt", result.getFileName().toString());
513-
assertEquals(pid.getId(), result.getParent().getFileName().toString());
533+
public void makeSymlinkForStagedPathProblemCharactersTest() throws Exception {
534+
var originalPath = tmpFolder.resolve("fil£.txt");
535+
var result = job.makeSymlinkForStagedPath(originalPath.toUri().toString(), null);
536+
537+
assertEquals("fil_.txt", result.getFileName().toString());
538+
assertTrue(Files.isSymbolicLink(result));
539+
assertEquals(originalPath, Files.readSymbolicLink(result));
540+
}
541+
542+
@Test
543+
public void makeSymlinkForStagedPathWithLabelTest() throws Exception {
544+
var originalPath = tmpFolder.resolve("file");
545+
var result = job.makeSymlinkForStagedPath(originalPath.toUri().toString(), "boxys_favorite_file.nef");
546+
547+
assertEquals("boxys_favorite_file.nef", result.getFileName().toString());
514548
assertTrue(Files.isSymbolicLink(result));
549+
assertEquals(originalPath, Files.readSymbolicLink(result));
515550
}
516551

517552
private HttpUriRequest getRequest() throws Exception {
@@ -529,11 +564,12 @@ private String getSubmittedFilePath(HttpUriRequest request) {
529564
}
530565

531566
private void verifyRequestParameters(String expectedFilepath) throws Exception {
532-
String absFilePath = Paths.get(depositDir.getAbsolutePath(), expectedFilepath).toString();
533567
HttpUriRequest request = getRequest();
534568
String submittedPath = getSubmittedFilePath(request);
535569

536-
assertEquals(absFilePath.replace("/", "%2F"), submittedPath, "FITS service not called with the expected path");
570+
String failMessage = "FITS service called with wrong path. Expected " + expectedFilepath + " but got " + submittedPath;
571+
assertTrue(submittedPath.startsWith(TMP_PATH.toString().replace("/", "%2F")), failMessage);
572+
assertTrue(submittedPath.endsWith("%2F" + Paths.get(expectedFilepath).getFileName()), failMessage);
537573
}
538574

539575
private void verifyFileResults(PID filePid, String expectedMimetype, String expectedFormat,

0 commit comments

Comments
 (0)