Skip to content

Commit 5de6bd7

Browse files
authored
Merge pull request #1743 from UNC-Libraries/bxc-4604
BXC-4604 - Use label as filename for TechMD extract
2 parents 676d710 + bf05512 commit 5de6bd7

File tree

4 files changed

+168
-67
lines changed

4 files changed

+168
-67
lines changed

common-utils/src/main/java/edu/unc/lib/boxc/common/http/MimetypeHelpers.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ private MimetypeHelpers() {
2121
* @return true if the given mimetype appears to be a valid mimetype
2222
*/
2323
public static boolean isValidMimetype(String mimetype) {
24-
if (StringUtils.isBlank(mimetype)) {
24+
if (StringUtils.isBlank(mimetype) || mimetype.contains("symlink")) {
2525
return false;
2626
}
2727
return VALID_MIMETYPE_PATTERN.matcher(mimetype).matches();
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package edu.unc.lib.boxc.common.http;
2+
3+
import org.junit.jupiter.api.Test;
4+
5+
import static org.junit.jupiter.api.Assertions.assertEquals;
6+
import static org.junit.jupiter.api.Assertions.assertFalse;
7+
import static org.junit.jupiter.api.Assertions.assertNull;
8+
import static org.junit.jupiter.api.Assertions.assertTrue;
9+
10+
/**
11+
* @author bbpennel
12+
*/
13+
public class MimetypeHelpersTest {
14+
@Test
15+
public void isValidMimetypeNullTest() {
16+
String mimetype = null;
17+
assertFalse(MimetypeHelpers.isValidMimetype(mimetype));
18+
}
19+
20+
@Test
21+
public void isValidMimetypeBlankTest() {
22+
String mimetype = "";
23+
assertFalse(MimetypeHelpers.isValidMimetype(mimetype));
24+
}
25+
26+
@Test
27+
public void isValidMimetypeSymlinkTest() {
28+
String mimetype = "inode/symlink";
29+
assertFalse(MimetypeHelpers.isValidMimetype(mimetype));
30+
}
31+
32+
@Test
33+
public void isValidMimetypeInvalidFormatTest() {
34+
String mimetype = "invalid format";
35+
assertFalse(MimetypeHelpers.isValidMimetype(mimetype));
36+
}
37+
38+
@Test
39+
public void isValidMimetypeNefTest() {
40+
String mimetype = "image/x-nikon-nef";
41+
assertTrue(MimetypeHelpers.isValidMimetype(mimetype));
42+
}
43+
44+
@Test
45+
public void formatMimetypeNullTest() {
46+
String mimetype = null;
47+
assertNull(MimetypeHelpers.formatMimetype(mimetype));
48+
}
49+
50+
@Test
51+
public void formatMimetypeWithParamsTest() {
52+
String mimetype = "text/plain; charset=UTF-8";
53+
assertEquals("text/plain", MimetypeHelpers.formatMimetype(mimetype));
54+
}
55+
56+
@Test
57+
public void formatMimetypeRegularTest() {
58+
String mimetype = "text/plain";
59+
assertEquals("text/plain", MimetypeHelpers.formatMimetype(mimetype));
60+
}
61+
}

deposit-app/src/main/java/edu/unc/lib/boxc/deposit/validate/ExtractTechnicalMetadataJob.java

Lines changed: 57 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import edu.unc.lib.boxc.deposit.work.JobFailedException;
88
import edu.unc.lib.boxc.deposit.work.JobInterruptedException;
99
import edu.unc.lib.boxc.model.api.ids.PID;
10+
import edu.unc.lib.boxc.model.api.rdf.CdrDeposit;
1011
import edu.unc.lib.boxc.model.fcrepo.ids.DatastreamPids;
1112
import edu.unc.lib.boxc.model.fcrepo.ids.PIDs;
1213
import org.apache.commons.io.FilenameUtils;
@@ -124,8 +125,10 @@ public void runJob() {
124125
String stagedPath = stagedPair.getValue();
125126
PID originalPid = DatastreamPids.getOriginalFilePid(objPid);
126127
final String providedMimetype = getProvidedMimetype(originalPid, model);
128+
final String providedLabel = getProvidedLabel(objPid, model);
127129

128-
submitTask(new ExtractTechnicalMetadataRunnable(objPid, originalPid, stagedPath, providedMimetype));
130+
submitTask(new ExtractTechnicalMetadataRunnable(objPid, originalPid, stagedPath,
131+
providedMimetype, providedLabel));
129132
}
130133

131134
waitForCompletion();
@@ -144,6 +147,16 @@ private String getProvidedMimetype(PID originalPid, Model model) {
144147
}
145148
}
146149

150+
private String getProvidedLabel(PID filePid, Model model) {
151+
Resource fileResc = model.getResource(filePid.getRepositoryPath());
152+
Statement labelStmt = fileResc.getProperty(CdrDeposit.label);
153+
if (labelStmt != null) {
154+
return labelStmt.getString();
155+
} else {
156+
return null;
157+
}
158+
}
159+
147160
@Override
148161
protected void registrationAction() {
149162
List<Object> results = new ArrayList<>();
@@ -177,14 +190,16 @@ private class ExtractTechnicalMetadataRunnable implements Runnable {
177190
private PID originalPid;
178191
private String stagedPath;
179192
private String providedMimetype;
193+
private String providedLabel;
180194
private ExtractTechnicalMetadataResult result = new ExtractTechnicalMetadataResult();
181195

182196
public ExtractTechnicalMetadataRunnable(PID objPid, PID originalPid, String stagedPath,
183-
String providedMimetype) {
197+
String providedMimetype, String providedLabel) {
184198
this.objPid = objPid;
185199
this.originalPid = originalPid;
186200
this.stagedPath = stagedPath;
187201
this.providedMimetype = providedMimetype;
202+
this.providedLabel = providedLabel;
188203
result.objPid = objPid;
189204
result.originalPid = originalPid;
190205
result.hasProvidedMimetype = providedMimetype != null;
@@ -198,8 +213,10 @@ public void run() {
198213

199214
interruptJobIfStopped();
200215

216+
// Symlink the file before processing
217+
Path linkPath = makeSymlinkForStagedPath(stagedPath, providedLabel);
201218
// Generate the FITS report as a document
202-
Document fitsDoc = getFitsDocument(objPid, stagedPath);
219+
Document fitsDoc = getFitsDocument(objPid, linkPath);
203220

204221
try {
205222
// Create the PREMIS report wrapper for the FITS results
@@ -222,6 +239,13 @@ public void run() {
222239
} catch (Exception e) {
223240
failJob(e, "Failed to extract FITS details for file '{0}' with id {1} from document:\n{2}",
224241
stagedPath, objPid.getId(), getXMLOutputter().outputString(fitsDoc));
242+
} finally {
243+
try {
244+
Files.delete(linkPath);
245+
Files.delete(linkPath.getParent());
246+
} catch (IOException e) {
247+
log.warn("Failed to delete symlink", e);
248+
}
225249
}
226250
}
227251

@@ -261,7 +285,6 @@ private void addFileIdentification(Document fitsDoc, Element premisObjCharsEl) {
261285
* Overrides the mimetype for this object in the deposit model when the FITS
262286
* generated value is preferred.
263287
*
264-
* @param objResc
265288
* @param fitsExtractMimetype
266289
*/
267290
private void overrideDepositMimetype(String fitsExtractMimetype) {
@@ -301,31 +324,48 @@ private void overrideDepositMimetype(String fitsExtractMimetype) {
301324
* XML document
302325
*
303326
* @param objPid
327+
* @param filePath
328+
* @return
329+
*/
330+
private Document getFitsDocument(PID objPid, Path filePath) {
331+
if (shouldProcessWithWebService(filePath)) {
332+
return extractUsingWebService(objPid, filePath);
333+
} else {
334+
return extractUsingCLI(objPid, filePath);
335+
}
336+
}
337+
338+
/**
339+
* Creates a symlink to the provided stagedUri, where the symlink is sanitized of problematic characters
340+
* and uses the label as the filename to ensure the original file extension is present, if available.
341+
* @param objPid
304342
* @param stagedUriString
343+
* @param label
305344
* @return
306345
*/
307-
private Document getFitsDocument(PID objPid, String stagedUriString) {
346+
protected Path makeSymlinkForStagedPath(String stagedUriString, String label) {
347+
// Resolve the path from a URI and make it absolute
308348
URI stagedUri = URI.create(stagedUriString);
309349
Path stagedPath;
310350
if (!stagedUri.isAbsolute()) {
311351
stagedPath = Paths.get(getDepositDirectory().toString(), stagedUriString);
312352
} else {
313353
stagedPath = Paths.get(stagedUri);
314354
}
315-
316-
if (shouldProcessWithWebService(stagedPath)) {
317-
return extractUsingWebService(objPid, stagedPath);
318-
} else {
319-
return extractUsingCLI(objPid, stagedPath);
355+
try {
356+
// Create a symlink to the file to make use of the original filename and avoid issues with non-ascii characters
357+
var parentDir = TMP_PATH.resolve(Long.toString(System.nanoTime()));
358+
Files.createDirectories(parentDir);
359+
String symlinkName = label != null ? label : stagedPath.getFileName().toString();
360+
var linkPath = sanitizeCliPath(parentDir.resolve(symlinkName));
361+
Files.createSymbolicLink(linkPath, stagedPath);
362+
return linkPath;
363+
} catch (IOException e) {
364+
throw new JobFailedException("Failed to create symlink for file " + stagedPath, e);
320365
}
321366
}
322367

323368
private boolean shouldProcessWithWebService(Path path) {
324-
// FITS cannot currently handle file paths that contain unicode characters
325-
if (!CharMatcher.ascii().matchesAllOf(path.toString())) {
326-
log.debug("File {} not applicable for web service due to unacceptable characters", path);
327-
return false;
328-
}
329369
String filename = path.getFileName().toString();
330370
String extension = FilenameUtils.getExtension(filename).toLowerCase();
331371
if (FILE_EXTS_FOR_CLI.contains(extension)) {
@@ -392,35 +432,9 @@ protected Path sanitizeCliPath(Path stagedPath) {
392432
return Paths.get(path);
393433
}
394434

395-
/**
396-
* Symlink the provided file into the temp directory, inside of parent directory based on the id of the object.
397-
* Filename of the symlink is based off the sanitized form of the path.
398-
* @param objPid
399-
* @param sanitizedPath Sanitized version of the staging path, does not need to resolve to a file
400-
* @param stagedPath Original staging path of the file, must resolve to the file being linked
401-
* @return Symlink path
402-
* @throws IOException
403-
*/
404-
protected Path symlinkFile(PID objPid, Path sanitizedPath, Path stagedPath) throws IOException {
405-
var parentDir = TMP_PATH.resolve(objPid.getId());
406-
Files.createDirectories(parentDir);
407-
var linkPath = parentDir.resolve(sanitizedPath.getFileName());
408-
Files.createSymbolicLink(linkPath, stagedPath);
409-
return linkPath;
410-
}
411-
412-
private Document extractUsingCLI(PID objPid, Path stagedPath) {
435+
private Document extractUsingCLI(PID objPid, Path targetPath) {
413436
String stdout = null;
414-
Path fileLink = null;
415437
try {
416-
Path targetPath = stagedPath;
417-
var sanitizedPath = sanitizeCliPath(stagedPath);
418-
// Create a symlink to the file to avoid problems with non-ascii characters and reserved linux characters
419-
// otherwise there will be encoding mismatches between boxc, the terminal, and FITS.
420-
if (!stagedPath.equals(sanitizedPath)) {
421-
fileLink = symlinkFile(objPid, sanitizedPath, stagedPath);
422-
targetPath = fileLink;
423-
}
424438
String[] command = new String[] { fitsCommandPath.toString(), "-i", targetPath.toString() };
425439
Process process = Runtime.getRuntime().exec(command);
426440
int exitCode = process.waitFor();
@@ -434,17 +448,7 @@ private Document extractUsingCLI(PID objPid, Path stagedPath) {
434448
return createSAXBuilder().build(new ByteArrayInputStream(stdout.getBytes(UTF_8)));
435449
} catch (IOException | JDOMException | InterruptedException e) {
436450
failJob(e, "Failed to generate report for file {0} with id {1}, output was:\n{2}",
437-
stagedPath, objPid.getId(), stdout);
438-
} finally {
439-
// Cleanup symlink and parent directory containing symlink, if a symlink was used
440-
if (fileLink != null) {
441-
try {
442-
Files.delete(fileLink);
443-
Files.delete(fileLink.getParent());
444-
} catch (IOException e) {
445-
log.warn("Failed to cleanup symlink", e);
446-
}
447-
}
451+
targetPath, objPid.getId(), stdout);
448452
}
449453
return null;
450454
}

deposit-app/src/test/java/edu/unc/lib/boxc/deposit/validate/ExtractTechnicalMetadataJobTest.java

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ public class ExtractTechnicalMetadataJobTest extends AbstractDepositJobTest {
9898
private final static String UNKNOWN_MD5 = "2748ba561254b629c2103cb2e1be3fc2";
9999
private final static String UNKNOWN_FORMAT = "Unknown";
100100

101+
private static final Path TMP_PATH = Paths.get(System.getProperty("java.io.tmpdir"));
102+
101103
@Mock
102104
private CloseableHttpClient httpClient;
103105
@Mock
@@ -429,14 +431,39 @@ public void unicodeFilenameTest() throws Exception {
429431
Bag workBag = model.createBag(workPid.getRepositoryPath());
430432
workBag.addProperty(RDF.type, Cdr.Work);
431433
depositBag.add(workBag);
432-
String filename = "weird\uD83D\uDC7D.txt";
433-
File sourceFile = tmpFolder.resolve(filename).toFile();
434-
PID filePid = addFileObject(workBag, sourceFile.getAbsolutePath(), IMAGE_MIMETYPE, IMAGE_MD5);
434+
String filename = "weird\uD83D\uDC7D.mov";
435+
Path sourcePath = tmpFolder.resolve(filename);
436+
Files.createFile(sourcePath);
437+
PID filePid = addFileObject(workBag, sourcePath.toFile().toURI().toString(), IMAGE_MIMETYPE, IMAGE_MD5);
438+
439+
job.closeModel();
440+
441+
job.run();
442+
443+
verifyFileResults(filePid, IMAGE_MIMETYPE, IMAGE_FORMAT, IMAGE_MD5, 1);
444+
}
445+
446+
@Test
447+
public void filenameFromLabelTest() throws Exception {
448+
respondWithFile("/fitsReports/imageReport.xml");
449+
450+
// Create the work object which nests the file
451+
PID workPid = makePid(RepositoryPathConstants.CONTENT_BASE);
452+
Bag workBag = model.createBag(workPid.getRepositoryPath());
453+
workBag.addProperty(RDF.type, Cdr.Work);
454+
depositBag.add(workBag);
455+
String filename = "ambiguous_file_name";
456+
Path sourcePath = tmpFolder.resolve(filename);
457+
Files.createFile(sourcePath);
458+
PID filePid = addFileObject(workBag, sourcePath.toFile().toURI().toString(), IMAGE_MIMETYPE, IMAGE_MD5);
459+
Resource fileResc = model.getResource(filePid.getRepositoryPath());
460+
fileResc.addProperty(CdrDeposit.label, "boxys_favorite_file.nef");
435461

436462
job.closeModel();
437463

438464
job.run();
439465

466+
verifyRequestParameters("boxys_favorite_file.nef");
440467
verifyFileResults(filePid, IMAGE_MIMETYPE, IMAGE_FORMAT, IMAGE_MD5, 1);
441468
}
442469

@@ -503,15 +530,23 @@ public void sanitizeCliPathReservedTest() throws Exception {
503530
}
504531

505532
@Test
506-
public void symlinkFileTest() throws Exception {
507-
PID pid = makePid();
508-
var originalPath = tmpFolder.resolve("file.txt");
509-
var sanitizedPath = tmpFolder.resolve("fi_l_e.txt");
510-
var result = job.symlinkFile(pid, sanitizedPath, originalPath);
511-
512-
assertEquals("fi_l_e.txt", result.getFileName().toString());
513-
assertEquals(pid.getId(), result.getParent().getFileName().toString());
533+
public void makeSymlinkForStagedPathProblemCharactersTest() throws Exception {
534+
var originalPath = tmpFolder.resolve("fil£.txt");
535+
var result = job.makeSymlinkForStagedPath(originalPath.toUri().toString(), null);
536+
537+
assertEquals("fil_.txt", result.getFileName().toString());
538+
assertTrue(Files.isSymbolicLink(result));
539+
assertEquals(originalPath, Files.readSymbolicLink(result));
540+
}
541+
542+
@Test
543+
public void makeSymlinkForStagedPathWithLabelTest() throws Exception {
544+
var originalPath = tmpFolder.resolve("file");
545+
var result = job.makeSymlinkForStagedPath(originalPath.toUri().toString(), "boxys_favorite_file.nef");
546+
547+
assertEquals("boxys_favorite_file.nef", result.getFileName().toString());
514548
assertTrue(Files.isSymbolicLink(result));
549+
assertEquals(originalPath, Files.readSymbolicLink(result));
515550
}
516551

517552
private HttpUriRequest getRequest() throws Exception {
@@ -529,11 +564,12 @@ private String getSubmittedFilePath(HttpUriRequest request) {
529564
}
530565

531566
private void verifyRequestParameters(String expectedFilepath) throws Exception {
532-
String absFilePath = Paths.get(depositDir.getAbsolutePath(), expectedFilepath).toString();
533567
HttpUriRequest request = getRequest();
534568
String submittedPath = getSubmittedFilePath(request);
535569

536-
assertEquals(absFilePath.replace("/", "%2F"), submittedPath, "FITS service not called with the expected path");
570+
String failMessage = "FITS service called with wrong path. Expected " + expectedFilepath + " but got " + submittedPath;
571+
assertTrue(submittedPath.startsWith(TMP_PATH.toString().replace("/", "%2F")), failMessage);
572+
assertTrue(submittedPath.endsWith("%2F" + Paths.get(expectedFilepath).getFileName()), failMessage);
537573
}
538574

539575
private void verifyFileResults(PID filePid, String expectedMimetype, String expectedFormat,

0 commit comments

Comments
 (0)