diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java index e3aae85..439606b 100644 --- a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java +++ b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentPlugin.java @@ -3,15 +3,16 @@ import java.util.List; import java.util.logging.Logger; -import jakarta.inject.Inject; - import org.eclipse.microprofile.config.inject.ConfigProperty; import org.imixs.archive.core.SnapshotService; import org.imixs.workflow.ItemCollection; import org.imixs.workflow.WorkflowContext; import org.imixs.workflow.engine.plugins.AbstractPlugin; +import org.imixs.workflow.exceptions.AdapterException; import org.imixs.workflow.exceptions.PluginException; +import jakarta.inject.Inject; + /** * The TikaPlugin extracts the textual information from document attachments. * The plug-in sends each new attached document to an instance of an Apache Tika @@ -51,6 +52,7 @@ public void init(WorkflowContext actx) throws PluginException { * * * @throws PluginException + * @throws AdapterException */ @SuppressWarnings("unchecked") @Override @@ -58,7 +60,7 @@ public ItemCollection run(ItemCollection document, ItemCollection event) throws if ("model".equalsIgnoreCase(serviceMode)) { List tikaOptions = null; String filePattern = null; - int maxPdfPages=0; + int maxPdfPages = 0; // read optional tika options ItemCollection evalItemCollection = this.getWorkflowService().evalWorkflowResult(event, "tika", document, false); @@ -69,7 +71,12 @@ public ItemCollection run(ItemCollection document, ItemCollection event) throws } // update the dms meta data - ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern,maxPdfPages); + try { + ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions, filePattern, + maxPdfPages); + } catch (AdapterException e) { + throw new PluginException(e); + } } else { logger.warning("unexpected TIKA_SERVICE_MODE=" + serviceMode + " - running the OCRDocumentAdapter the env TIKA_SERVICE_MODE should be set to 'model'. Plugin will be ignored!"); diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentService.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentService.java index 50ed94e..849a3ac 100644 --- a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentService.java +++ b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/OCRDocumentService.java @@ -3,16 +3,17 @@ import java.util.Optional; import java.util.logging.Logger; -import jakarta.ejb.Stateless; -import jakarta.enterprise.event.Observes; -import jakarta.inject.Inject; - import org.eclipse.microprofile.config.inject.ConfigProperty; import org.imixs.archive.core.SnapshotService; import org.imixs.workflow.ItemCollection; import org.imixs.workflow.engine.ProcessingEvent; +import org.imixs.workflow.exceptions.AdapterException; import org.imixs.workflow.exceptions.PluginException; +import jakarta.ejb.Stateless; +import jakarta.enterprise.event.Observes; +import jakarta.inject.Inject; + /** * The TikaDocumentService extracts the textual information from document * attachments. The CDI bean runs on the ProcessingEvent BEFORE_PROCESS. The @@ -48,12 +49,9 @@ public class OCRDocumentService { @ConfigProperty(name = TikaService.ENV_OCR_SERVICE_MODE, defaultValue = "auto") String serviceMode; - - @Inject SnapshotService snapshotService; - - + @Inject TikaService ocrService; @@ -64,21 +62,22 @@ public class OCRDocumentService { * @throws PluginException */ public void onBeforeProcess(@Observes ProcessingEvent processingEvent) throws PluginException { - + if (!serviceEndpoint.isPresent() || serviceEndpoint.get().isEmpty()) { return; } - + // Service only runs if the Tika Service mode is set to 'auto' if ("auto".equalsIgnoreCase(serviceMode)) { if (processingEvent.getEventType() == ProcessingEvent.BEFORE_PROCESS) { - ItemCollection workitem=processingEvent.getDocument(); - ocrService.extractText(workitem, snapshotService.findSnapshot(workitem)); - // extractText(processingEvent.getDocument()); + ItemCollection workitem = processingEvent.getDocument(); + try { + ocrService.extractText(workitem, snapshotService.findSnapshot(workitem)); + } catch (AdapterException e) { + throw new PluginException(e); + } } } } - - } \ No newline at end of file diff --git a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java index ba82311..101fd05 100644 --- a/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java +++ b/imixs-archive-documents/src/main/java/org/imixs/archive/documents/TikaService.java @@ -18,15 +18,16 @@ import java.util.logging.Logger; import java.util.regex.Pattern; -import jakarta.ejb.Stateless; -import jakarta.inject.Inject; - import org.apache.pdfbox.pdmodel.PDDocument; import org.eclipse.microprofile.config.inject.ConfigProperty; import org.imixs.workflow.FileData; import org.imixs.workflow.ItemCollection; +import org.imixs.workflow.exceptions.AdapterException; import org.imixs.workflow.exceptions.PluginException; +import jakarta.ejb.Stateless; +import jakarta.inject.Inject; + /** * The OCRService extracts the textual information from document attachments of * a workitem and stores the data into the $file attribute 'text'. @@ -62,7 +63,9 @@ public class TikaService { public static final String FILE_ATTRIBUTE_TEXT = "text"; public static final String DEFAULT_ENCODING = "UTF-8"; - public static final String PLUGIN_ERROR = "PLUGIN_ERROR"; + // public static final String PLUGIN_ERROR = "PLUGIN_ERROR"; + public static final String API_ERROR = "API_ERROR"; + public static final String DOCUMENT_ERROR = "DOCUMENT_ERROR"; public static final String ENV_OCR_SERVICE_ENDPOINT = "ocr.service.endpoint"; public static final String ENV_OCR_SERVICE_MODE = "ocr.service.mode"; public static final String ENV_OCR_SERVICE_MAXFILESIZE = "ocr.service.maxfilesize"; @@ -102,8 +105,9 @@ public class TikaService { * * @param workitem * @throws PluginException + * @throws AdapterException */ - public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException { + public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException, AdapterException { extractText(workitem, snapshot, ocrStategy, null, null, 0); } @@ -133,9 +137,10 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws * @param options - optional tika header params * @param filePatternRegex - optional regular expression to match files * @throws PluginException + * @throws AdapterException */ public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrStategy, List options, - String filePatternRegex, int maxPdfPages) throws PluginException { + String filePatternRegex, int maxPdfPages) throws PluginException, AdapterException { boolean debug = logger.isLoggable(Level.FINE); Pattern filePattern = null; @@ -150,7 +155,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String // validate OCR MODE.... if ("AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION".indexOf(ocrStategy) == -1) { - throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR, + throw new PluginException(TikaService.class.getSimpleName(), API_ERROR, "Invalid TIKA_OCR_MODE - expected one of the following options: NO_OCR | OCR_ONLY | OCR_AND_TEXT_EXTRACTION"); } @@ -165,12 +170,12 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String // print tika options... if (debug) { - logger.info("...... filepattern = "+filePatternRegex); + logger.info("...... filepattern = " + filePatternRegex); for (String opt : options) { logger.info("...... Tika Option = " + opt); } } - + // do we have a file pattern? if (filePatternRegex != null && !filePatternRegex.isEmpty()) { filePattern = Pattern.compile(filePatternRegex); @@ -179,17 +184,17 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String long l = System.currentTimeMillis(); // List currentDmsList = DMSHandler.getDmsList(workitem); List files = workitem.getFileData(); - + if (debug) { - logger.info("... found " + files.size() +" files"); + logger.info("... found " + files.size() + " files"); } for (FileData fileData : files) { - logger.fine("... processing file: "+fileData.getName()); + logger.fine("... processing file: " + fileData.getName()); // do we have an optional file pattern? if (filePattern != null && !filePattern.matcher(fileData.getName()).find()) { // the file did not match the given pattern! - logger.info("... filename does not match given pattern!"); + logger.info("... filename does not match given pattern!"); continue; } @@ -208,12 +213,14 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String // we ignore the file! if (originFileData.getContent() != null && originFileData.getContent().length > ocrMaxFileSize) { - logger.warning("The file size '" + fileData.getName() + "' excided the allowed max size of " - + ocrMaxFileSize + " bytes (file size=" + originFileData.getContent().length + ")"); - continue; + throw new AdapterException(TikaService.class.getSimpleName(), DOCUMENT_ERROR, + "The file '" + fileData.getName() + "' exceed the allowed max size of " + + ocrMaxFileSize + " bytes (file size=" + originFileData.getContent().length + + ")"); } if (debug) { - logger.info("...text extraction '" + originFileData.getName() + "' content size=" +originFileData.getContent().length +" ..."); + logger.info("...text extraction '" + originFileData.getName() + "' content size=" + + originFileData.getContent().length + " ..."); } textContent = doORCProcessing(originFileData, options, maxPdfPages); @@ -221,14 +228,14 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String if (textContent == null) { logger.warning("Unable to extract text-content for '" + fileData.getName() + "'"); textContent = ""; - } + } // store the ocrContent.... List list = new ArrayList(); list.add(textContent); fileData.setAttribute(FILE_ATTRIBUTE_TEXT, list); } catch (IOException e) { - throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR, + throw new PluginException(TikaService.class.getSimpleName(), API_ERROR, "Unable to scan attached document '" + fileData.getName() + "'", e); } } @@ -265,7 +272,7 @@ public String doORCProcessing(FileData fileData, List options, int maxPd } if (debug) { - logger.info("...ocr scanning of document " + fileData.getName() +" ...."); + logger.info("...ocr scanning of document " + fileData.getName() + " ...."); } // adapt ContentType String contentType = adaptContentType(fileData); @@ -334,7 +341,7 @@ public String doORCProcessing(FileData fileData, List options, int maxPd } // compute length - + urlConnection.setRequestProperty("Content-Length", "" + Integer.valueOf(fileData.getContent().length)); OutputStream output = urlConnection.getOutputStream(); writer = new PrintWriter(new OutputStreamWriter(output, DEFAULT_ENCODING), true); @@ -347,8 +354,8 @@ public String doORCProcessing(FileData fileData, List options, int maxPd } if (resposeCode >= 200 && resposeCode <= 299) { logger.info("...call readResponse...."); - return readResponse(urlConnection, DEFAULT_ENCODING,debug); - } + return readResponse(urlConnection, DEFAULT_ENCODING, debug); + } logger.warning("... no data!"); // no data! @@ -376,17 +383,17 @@ private boolean hasOCRContent(FileData fileData) { List ocrContentList = (List) fileData.getAttribute(FILE_ATTRIBUTE_TEXT); // do we have a value list at all? // Issue #166 - if (ocrContentList==null || ocrContentList.size()==0) { + if (ocrContentList == null || ocrContentList.size() == 0) { // no attribute found return false; } - + // test the text value .... - String textValue=ocrContentList.get(0) ; - if (textValue==null || textValue.isEmpty()) { + String textValue = ocrContentList.get(0); + if (textValue == null || textValue.isEmpty()) { return false; } - + // else we do have a content! return true; } @@ -442,7 +449,7 @@ private FileData fetchOriginFileData(FileData fileData, ItemCollection snapshot) * @throws IOException */ private String readResponse(URLConnection urlConnection, String encoding, boolean debug) throws IOException { - + // get content of result if (debug) { logger.info("......readResponse...."); @@ -461,7 +468,7 @@ private String readResponse(URLConnection urlConnection, String encoding, boolea if (debug) { logger.info("......ContentEncoding=" + sContentEncoding); } - + // if an encoding is provided read stream with encoding..... if (sContentEncoding != null && !sContentEncoding.isEmpty()) in = new BufferedReader(new InputStreamReader(urlConnection.getInputStream(), sContentEncoding));