Skip to content

Commit

Permalink
fixed
Browse files Browse the repository at this point in the history
issue #212
  • Loading branch information
rsoika committed Aug 13, 2024
1 parent 4308cc5 commit ebd3fbe
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
import java.util.List;
import java.util.logging.Logger;

import jakarta.inject.Inject;

import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.archive.core.SnapshotService;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.WorkflowContext;
import org.imixs.workflow.engine.plugins.AbstractPlugin;
import org.imixs.workflow.exceptions.AdapterException;
import org.imixs.workflow.exceptions.PluginException;

import jakarta.inject.Inject;

/**
* The TikaPlugin extracts the textual information from document attachments.
* The plug-in sends each new attached document to an instance of an Apache Tika
Expand Down Expand Up @@ -51,14 +52,15 @@ public void init(WorkflowContext actx) throws PluginException {
*
*
* @throws PluginException
* @throws AdapterException
*/
@SuppressWarnings("unchecked")
@Override
public ItemCollection run(ItemCollection document, ItemCollection event) throws PluginException {
if ("model".equalsIgnoreCase(serviceMode)) {
List<String> tikaOptions = null;
String filePattern = null;
int maxPdfPages=0;
int maxPdfPages = 0;
// read optional tika options
ItemCollection evalItemCollection = this.getWorkflowService().evalWorkflowResult(event, "tika", document,
false);
Expand All @@ -69,7 +71,12 @@ public ItemCollection run(ItemCollection document, ItemCollection event) throws
}

// update the dms meta data
ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions,filePattern,maxPdfPages);
try {
ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions, filePattern,
maxPdfPages);
} catch (AdapterException e) {
throw new PluginException(e);
}
} else {
logger.warning("unexpected TIKA_SERVICE_MODE=" + serviceMode
+ " - running the OCRDocumentAdapter the env TIKA_SERVICE_MODE should be set to 'model'. Plugin will be ignored!");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
import java.util.Optional;
import java.util.logging.Logger;

import jakarta.ejb.Stateless;
import jakarta.enterprise.event.Observes;
import jakarta.inject.Inject;

import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.archive.core.SnapshotService;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.engine.ProcessingEvent;
import org.imixs.workflow.exceptions.AdapterException;
import org.imixs.workflow.exceptions.PluginException;

import jakarta.ejb.Stateless;
import jakarta.enterprise.event.Observes;
import jakarta.inject.Inject;

/**
* The OCRDocumentService extracts the textual information from document
* attachments. The CDI bean runs on the ProcessingEvent BEFORE_PROCESS. The
Expand Down Expand Up @@ -48,12 +49,9 @@ public class OCRDocumentService {
@ConfigProperty(name = TikaService.ENV_OCR_SERVICE_MODE, defaultValue = "auto")
String serviceMode;



@Inject
SnapshotService snapshotService;



@Inject
TikaService ocrService;

Expand All @@ -64,21 +62,22 @@ public class OCRDocumentService {
* @throws PluginException
*/
public void onBeforeProcess(@Observes ProcessingEvent processingEvent) throws PluginException {

if (!serviceEndpoint.isPresent() || serviceEndpoint.get().isEmpty()) {
return;
}

// Service only runs if the Tika Service mode is set to 'auto'
if ("auto".equalsIgnoreCase(serviceMode)) {
if (processingEvent.getEventType() == ProcessingEvent.BEFORE_PROCESS) {
ItemCollection workitem=processingEvent.getDocument();
ocrService.extractText(workitem, snapshotService.findSnapshot(workitem));
// extractText(processingEvent.getDocument());
ItemCollection workitem = processingEvent.getDocument();
try {
ocrService.extractText(workitem, snapshotService.findSnapshot(workitem));
} catch (AdapterException e) {
throw new PluginException(e);
}
}
}
}



}
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@
import java.util.logging.Logger;
import java.util.regex.Pattern;

import jakarta.ejb.Stateless;
import jakarta.inject.Inject;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.exceptions.AdapterException;
import org.imixs.workflow.exceptions.PluginException;

import jakarta.ejb.Stateless;
import jakarta.inject.Inject;

/**
* The TikaService extracts the textual information from document attachments of
* a workitem and stores the data into the $file attribute 'text'.
Expand Down Expand Up @@ -62,7 +63,9 @@ public class TikaService {

public static final String FILE_ATTRIBUTE_TEXT = "text";
public static final String DEFAULT_ENCODING = "UTF-8";
public static final String PLUGIN_ERROR = "PLUGIN_ERROR";
// public static final String PLUGIN_ERROR = "PLUGIN_ERROR";
public static final String API_ERROR = "API_ERROR";
public static final String DOCUMENT_ERROR = "DOCUMENT_ERROR";
public static final String ENV_OCR_SERVICE_ENDPOINT = "ocr.service.endpoint";
public static final String ENV_OCR_SERVICE_MODE = "ocr.service.mode";
public static final String ENV_OCR_SERVICE_MAXFILESIZE = "ocr.service.maxfilesize";
Expand Down Expand Up @@ -102,8 +105,9 @@ public class TikaService {
*
* @param workitem
* @throws PluginException
* @throws AdapterException
*/
public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException {
public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException, AdapterException {
extractText(workitem, snapshot, ocrStategy, null, null, 0);
}

Expand Down Expand Up @@ -133,9 +137,10 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot) throws
* @param options - optional tika header params
* @param filePatternRegex - optional regular expression to match files
* @throws PluginException
* @throws AdapterException
*/
public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrStategy, List<String> options,
String filePatternRegex, int maxPdfPages) throws PluginException {
String filePatternRegex, int maxPdfPages) throws PluginException, AdapterException {
boolean debug = logger.isLoggable(Level.FINE);
Pattern filePattern = null;

Expand All @@ -150,7 +155,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String

// validate OCR MODE....
if ("AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION".indexOf(ocrStategy) == -1) {
throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR,
throw new PluginException(TikaService.class.getSimpleName(), API_ERROR,
"Invalid TIKA_OCR_MODE - expected one of the following options: NO_OCR | OCR_ONLY | OCR_AND_TEXT_EXTRACTION");
}

Expand All @@ -165,12 +170,12 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String

// print tika options...
if (debug) {
logger.info("...... filepattern = "+filePatternRegex);
logger.info("...... filepattern = " + filePatternRegex);
for (String opt : options) {
logger.info("...... Tika Option = " + opt);
}
}

// do we have a file pattern?
if (filePatternRegex != null && !filePatternRegex.isEmpty()) {
filePattern = Pattern.compile(filePatternRegex);
Expand All @@ -179,17 +184,17 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String
long l = System.currentTimeMillis();
// List<ItemCollection> currentDmsList = DMSHandler.getDmsList(workitem);
List<FileData> files = workitem.getFileData();

if (debug) {
logger.info("... found " + files.size() +" files");
logger.info("... found " + files.size() + " files");
}

for (FileData fileData : files) {
logger.fine("... processing file: "+fileData.getName());
logger.fine("... processing file: " + fileData.getName());
// do we have an optional file pattern?
if (filePattern != null && !filePattern.matcher(fileData.getName()).find()) {
// the file did not match the given pattern!
logger.info("... filename does not match given pattern!");
logger.info("... filename does not match given pattern!");
continue;
}

Expand All @@ -208,27 +213,29 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String
// we reject the file with an AdapterException!
if (originFileData.getContent() != null
&& originFileData.getContent().length > ocrMaxFileSize) {
logger.warning("The file size '" + fileData.getName() + "' excided the allowed max size of "
+ ocrMaxFileSize + " bytes (file size=" + originFileData.getContent().length + ")");
continue;
throw new AdapterException(TikaService.class.getSimpleName(), DOCUMENT_ERROR,
"The file '" + fileData.getName() + "' exceed the allowed max size of "
+ ocrMaxFileSize + " bytes (file size=" + originFileData.getContent().length
+ ")");
}
if (debug) {
logger.info("...text extraction '" + originFileData.getName() + "' content size=" +originFileData.getContent().length +" ...");
logger.info("...text extraction '" + originFileData.getName() + "' content size="
+ originFileData.getContent().length + " ...");
}

textContent = doORCProcessing(originFileData, options, maxPdfPages);

if (textContent == null) {
logger.warning("Unable to extract text-content for '" + fileData.getName() + "'");
textContent = "";
}
}
// store the ocrContent....
List<Object> list = new ArrayList<Object>();
list.add(textContent);
fileData.setAttribute(FILE_ATTRIBUTE_TEXT, list);

} catch (IOException e) {
throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR,
throw new PluginException(TikaService.class.getSimpleName(), API_ERROR,
"Unable to scan attached document '" + fileData.getName() + "'", e);
}
}
Expand Down Expand Up @@ -265,7 +272,7 @@ public String doORCProcessing(FileData fileData, List<String> options, int maxPd
}

if (debug) {
logger.info("...ocr scanning of document " + fileData.getName() +" ....");
logger.info("...ocr scanning of document " + fileData.getName() + " ....");
}
// adapt ContentType
String contentType = adaptContentType(fileData);
Expand Down Expand Up @@ -334,7 +341,7 @@ public String doORCProcessing(FileData fileData, List<String> options, int maxPd
}

// compute length

urlConnection.setRequestProperty("Content-Length", "" + Integer.valueOf(fileData.getContent().length));
OutputStream output = urlConnection.getOutputStream();
writer = new PrintWriter(new OutputStreamWriter(output, DEFAULT_ENCODING), true);
Expand All @@ -347,8 +354,8 @@ public String doORCProcessing(FileData fileData, List<String> options, int maxPd
}
if (resposeCode >= 200 && resposeCode <= 299) {
logger.info("...call readResponse....");
return readResponse(urlConnection, DEFAULT_ENCODING,debug);
}
return readResponse(urlConnection, DEFAULT_ENCODING, debug);
}

logger.warning("... no data!");
// no data!
Expand Down Expand Up @@ -376,17 +383,17 @@ private boolean hasOCRContent(FileData fileData) {
List<String> ocrContentList = (List<String>) fileData.getAttribute(FILE_ATTRIBUTE_TEXT);
// do we have a value list at all?
// Issue #166
if (ocrContentList==null || ocrContentList.size()==0) {
if (ocrContentList == null || ocrContentList.size() == 0) {
// no attribute found
return false;
}

// test the text value ....
String textValue=ocrContentList.get(0) ;
if (textValue==null || textValue.isEmpty()) {
String textValue = ocrContentList.get(0);
if (textValue == null || textValue.isEmpty()) {
return false;
}

// else we do have a content!
return true;
}
Expand Down Expand Up @@ -442,7 +449,7 @@ private FileData fetchOriginFileData(FileData fileData, ItemCollection snapshot)
* @throws IOException
*/
private String readResponse(URLConnection urlConnection, String encoding, boolean debug) throws IOException {

// get content of result
if (debug) {
logger.info("......readResponse....");
Expand All @@ -461,7 +468,7 @@ private String readResponse(URLConnection urlConnection, String encoding, boolea
if (debug) {
logger.info("......ContentEncoding=" + sContentEncoding);
}

// if an encoding is provided read stream with encoding.....
if (sContentEncoding != null && !sContentEncoding.isEmpty())
in = new BufferedReader(new InputStreamReader(urlConnection.getInputStream(), sContentEncoding));
Expand Down

0 comments on commit ebd3fbe

Please sign in to comment.