diff --git a/.gitignore b/.gitignore
index 2df6ebfd..730a692e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,12 @@
**/__pycache__/**
.vscode
+.idea
build/
**/build
**/build/**
+
+target/
+**/target
+**/target/**
\ No newline at end of file
diff --git a/models/object_tracking_vittrack/README.md b/models/object_tracking_vittrack/README.md
index ad3f0a3e..c99e8c97 100644
--- a/models/object_tracking_vittrack/README.md
+++ b/models/object_tracking_vittrack/README.md
@@ -40,6 +40,28 @@ cmake --build build
./build/opencv_zoo_object_tracking_vittrack -h
```
+## Java
+
+Install Maven to get started.
+
+```shell
+# tracking on camera input
+mvn compile exec:java -q
+
+# tracking on video
+mvn compile exec:java -q -Dexec.args="-i /path/to/video"
+
+# get help messages
+mvn compile exec:java -q -Dexec.args="-h"
+```
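+
+The `-bt` flag picks one of the backend-target pairs listed in `demo.java` (0: OpenCV + CPU, the default; 1-2: CUDA; 3-4: NPU). For example, if your machine has a CUDA-capable GPU:
+
+```shell
+# tracking on video with the CUDA backend
+mvn compile exec:java -q -Dexec.args="-i /path/to/video -bt 1"
+```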
+
# Example outputs
diff --git a/models/object_tracking_vittrack/demo.java b/models/object_tracking_vittrack/demo.java
new file mode 100644
index 00000000..353c3f84
--- /dev/null
+++ b/models/object_tracking_vittrack/demo.java
@@ -0,0 +1,209 @@
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.UnixStyleUsageFormatter;
+import org.bytedeco.javacpp.BytePointer;
+import org.bytedeco.opencv.global.opencv_dnn;
+import org.bytedeco.opencv.opencv_core.*;
+import org.bytedeco.opencv.opencv_video.TrackerVit;
+import org.bytedeco.opencv.opencv_videoio.VideoCapture;
+import org.bytedeco.opencv.opencv_videoio.VideoWriter;
+
+import static org.bytedeco.opencv.global.opencv_highgui.*;
+import static org.bytedeco.opencv.global.opencv_imgproc.*;
+import static org.bytedeco.opencv.global.opencv_videoio.CAP_PROP_FPS;
+
+public class demo {
+
+ // Valid combinations of backends and targets
+ static int[][] backendTargetPairs = {
+ {opencv_dnn.DNN_BACKEND_OPENCV, opencv_dnn.DNN_TARGET_CPU},
+ {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA},
+ {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA_FP16},
+ {opencv_dnn.DNN_BACKEND_TIMVX, opencv_dnn.DNN_TARGET_NPU},
+ {opencv_dnn.DNN_BACKEND_CANN, opencv_dnn.DNN_TARGET_NPU}
+ };
+
+ static class Args {
+ @Parameter(names = {"--help", "-h"}, order = 0, help = true,
+ description = "Print help message.")
+ boolean help;
+ @Parameter(names = {"--input", "-i"}, order = 1,
+ description = "Set path to the input video. Omit for using default camera.")
+ String input;
+ @Parameter(names = {"--model_path", "-m"}, order = 2,
+ description = "Set model path.")
+ String modelPath = "object_tracking_vittrack_2023sep.onnx";
+ @Parameter(names = {"--backend_target", "-bt"}, order = 3,
+                description = "Choose one of the backend-target pairs to run this demo:" +
+ " 0: OpenCV implementation + CPU," +
+ " 1: CUDA + GPU (CUDA), " +
+ " 2: CUDA + GPU (CUDA FP16)," +
+ " 3: TIM-VX + NPU," +
+ " 4: CANN + NPU")
+ int backendTarget = 0;
+ @Parameter(names = {"--save", "-s"}, order = 4,
+ description = "Specify to save a file with results.")
+ boolean save;
+ @Parameter(names = {"--vis", "-v"}, order = 5, arity = 1,
+ description = "Specify to open a new window to show results.")
+ boolean vis = true;
+ }
+
+ static class TrackingResult {
+ boolean isLocated;
+ Rect bbox;
+ float score;
+ }
+
+ static class VitTrack {
+ private final TrackerVit model;
+
+ VitTrack(String modelPath, int backendId, int targetId) {
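+            // Configure the tracker: model path plus the preferred DNN backend and target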
+ final TrackerVit.Params params = new TrackerVit.Params();
+ params.net(new BytePointer(modelPath))
+ .backend(backendId)
+ .target(targetId);
+ model = TrackerVit.create(params);
+ }
+
+ void init(Mat image, Rect roi) {
+ model.init(image, roi);
+ }
+
+ TrackingResult infer(Mat image) {
+ final TrackingResult result = new TrackingResult();
+ result.bbox = new Rect();
+ result.isLocated = model.update(image, result.bbox);
+ result.score = model.getTrackingScore();
+ return result;
+ }
+ }
+
+ static Mat visualize(Mat image, Rect bbox, float score, boolean isLocated, double fps, Scalar boxColor,
+ Scalar textColor, double fontScale, int fontSize) {
+ final Mat output = image.clone();
+ final int h = output.rows();
+ final int w = output.cols();
+ if (fps >= 0) {
+ putText(output, String.format("FPS: %.2f", fps), new Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale,
+ textColor);
+ }
+
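+        // Draw the box only when the tracker reports a confident score (>= 0.3); otherwise flag the target as lost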
+ if (isLocated && score >= 0.3) {
+ rectangle(output, bbox, boxColor, 2, LINE_8, 0);
+ putText(output, String.format("%.2f", score), new Point(bbox.x(), bbox.y() + 25),
+ FONT_HERSHEY_DUPLEX, fontScale, textColor, fontSize, LINE_8, false);
+ } else {
+ final Size textSize = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, new int[]{0});
+ final int textX = (w - textSize.width()) / 2;
+ final int textY = (h - textSize.height()) / 2;
+ putText(output, "Target lost!", new Point(textX, textY), FONT_HERSHEY_DUPLEX,
+ fontScale, new Scalar(0, 0, 255, 0), fontSize, LINE_8, false);
+ }
+
+ return output;
+ }
+
+ /**
+ * Execute: mvn compile exec:java -q -Dexec.args=""
+ */
+ public static void main(String[] argv) {
+ final Args args = new Args();
+ final JCommander jc = JCommander.newBuilder()
+ .addObject(args)
+ .build();
+ jc.setUsageFormatter(new UnixStyleUsageFormatter(jc));
+ jc.parse(argv);
+ if (args.help) {
+ jc.usage();
+ return;
+ }
+ final int backendId = backendTargetPairs[args.backendTarget][0];
+ final int targetId = backendTargetPairs[args.backendTarget][1];
+ VitTrack tracker = new VitTrack(args.modelPath, backendId, targetId);
+
+ final VideoCapture video = new VideoCapture();
+ if (args.input == null) {
+ video.open(0);
+ } else {
+ video.open(args.input);
+ }
+ if (!video.isOpened()) {
+ System.err.println("Error: Could not open video source");
+ return;
+ }
+
+ Mat firstFrame = new Mat();
+ video.read(firstFrame);
+
+ if (firstFrame.empty()) {
+ System.err.println("No frames grabbed!");
+ return;
+ }
+
+ Mat firstFrameCopy = firstFrame.clone();
+ putText(firstFrameCopy, "1. Drag a bounding box to track.", new Point(0, 25), FONT_HERSHEY_SIMPLEX, 1, new Scalar(0, 255, 0, 0));
+ putText(firstFrameCopy, "2. Press ENTER to confirm", new Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, new Scalar(0, 255, 0, 0));
+ final Rect roi = selectROI("VitTrack Demo", firstFrameCopy);
+
+ if (roi.area() == 0) {
+ System.err.println("No ROI is selected! Exiting...");
+ return;
+ } else {
+ System.out.printf("Selected ROI: (x: %d, y: %d, width: %d, height: %d)%n", roi.x(), roi.y(), roi.width(),
+ roi.height());
+ }
+
+ // Create VideoWriter if save option is specified
+ final VideoWriter outputVideo = new VideoWriter();
+ if (args.save) {
+ final Size frameSize = firstFrame.size();
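+            // Write output.mp4 with the mp4v codec, keeping the source FPS and frame size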
+ outputVideo.open("output.mp4", VideoWriter.fourcc((byte) 'm', (byte) 'p', (byte) '4', (byte) 'v'),
+ video.get(CAP_PROP_FPS), frameSize);
+ if (!outputVideo.isOpened()) {
+ System.err.println("Error: Could not create output video stream");
+ return;
+ }
+ }
+
+ // Initialize tracker with ROI
+ tracker.init(firstFrame, roi);
+
+ // Track frame by frame
+ final TickMeter tm = new TickMeter();
+ while (waitKey(1) < 0) {
+ video.read(firstFrame);
+ if (firstFrame.empty()) {
+ System.out.println("End of video");
+ break;
+ }
+
+ // Inference
+ tm.start();
+ final TrackingResult result = tracker.infer(firstFrame);
+ tm.stop();
+
+ // Visualize
+ Mat frame = firstFrame.clone();
+ frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS(),
+ new Scalar(0, 255, 0, 0), new Scalar(0, 255, 0, 0), 1.0, 1);
+
+ if (args.save) {
+ outputVideo.write(frame);
+ }
+ if (args.vis) {
+ imshow("VitTrack Demo", frame);
+ }
+ tm.reset();
+ }
+ if (args.save) {
+ outputVideo.release();
+ }
+
+ video.release();
+ }
+
+}
diff --git a/models/object_tracking_vittrack/pom.xml b/models/object_tracking_vittrack/pom.xml
new file mode 100644
index 00000000..6b58bac1
--- /dev/null
+++ b/models/object_tracking_vittrack/pom.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>opencv_zoo</groupId>
+        <artifactId>demo</artifactId>
+        <version>1.0.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>object_tracking_vittrack</artifactId>
+
+    <build>
+        <directory>${project.basedir}</directory>
+        <plugins>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>exec-maven-plugin</artifactId>
+                <version>3.3.0</version>
+                <configuration>
+                    <executable>java</executable>
+                    <mainClass>demo</mainClass>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>
\ No newline at end of file
diff --git a/models/pom.xml b/models/pom.xml
new file mode 100644
index 00000000..a38928f3
--- /dev/null
+++ b/models/pom.xml
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>opencv_zoo</groupId>
+    <artifactId>demo</artifactId>
+    <version>1.0.0-SNAPSHOT</version>
+    <description>OpenCV Zoo demo application</description>
+    <packaging>pom</packaging>
+
+    <build>
+        <directory>${project.basedir}</directory>
+        <plugins>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>exec-maven-plugin</artifactId>
+                <version>3.3.0</version>
+                <configuration>
+                    <executable>java</executable>
+                    <mainClass>demo</mainClass>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+    <modules>
+        <module>object_tracking_vittrack</module>
+        <module>text_detection_ppocr</module>
+    </modules>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.bytedeco</groupId>
+            <artifactId>javacv-platform</artifactId>
+            <version>1.5.10</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>flycapture-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>libdc1394-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>libfreenect-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>libfreenect2-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>librealsense-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>librealsense2-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>videoinput-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>artoolkitplus-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>leptonica-platform</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.bytedeco</groupId>
+                    <artifactId>tesseract-platform</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.bytedeco</groupId>
+            <artifactId>opencv-platform-gpu</artifactId>
+            <version>4.9.0-1.5.10</version>
+        </dependency>
+        <dependency>
+            <groupId>org.bytedeco</groupId>
+            <artifactId>cuda-platform-redist</artifactId>
+            <version>12.3-8.9-1.5.10</version>
+        </dependency>
+        <dependency>
+            <groupId>com.beust</groupId>
+            <artifactId>jcommander</artifactId>
+            <version>1.82</version>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/models/text_detection_ppocr/README.md b/models/text_detection_ppocr/README.md
index 1a875d1c..21367f77 100644
--- a/models/text_detection_ppocr/README.md
+++ b/models/text_detection_ppocr/README.md
@@ -43,6 +43,25 @@ cmake --build build
./build/opencv_zoo_text_detection_ppocr -h
```
+### Java
+
+Install Maven to get started:
+
+```shell
+# detect on camera input
+mvn compile exec:java -q
+# detect on an image
+mvn compile exec:java -q -Dexec.args="--input /path/to/image -v"
+# get help messages
+mvn compile exec:java -q -Dexec.args="--help"
+```
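+
+`--width` and `--height` must be multiples of 32, as noted in the option help. For example, to detect on a wider 1280x736 resize:
+
+```shell
+mvn compile exec:java -q -Dexec.args="--input /path/to/image --width 1280 --height 736 -v"
+```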
+
### Example outputs

diff --git a/models/text_detection_ppocr/demo.java b/models/text_detection_ppocr/demo.java
new file mode 100644
index 00000000..b65a02a5
--- /dev/null
+++ b/models/text_detection_ppocr/demo.java
@@ -0,0 +1,225 @@
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.UnixStyleUsageFormatter;
+import org.bytedeco.javacpp.FloatPointer;
+import org.bytedeco.javacv.CanvasFrame;
+import org.bytedeco.javacv.OpenCVFrameConverter;
+import org.bytedeco.opencv.global.opencv_dnn;
+import org.bytedeco.opencv.opencv_core.*;
+import org.bytedeco.opencv.opencv_dnn.TextDetectionModel_DB;
+import org.bytedeco.opencv.opencv_videoio.VideoCapture;
+
+import java.util.AbstractMap;
+import java.util.Map;
+
+import static org.bytedeco.opencv.global.opencv_imgcodecs.imwrite;
+import static org.bytedeco.opencv.global.opencv_imgproc.*;
+
+public class demo {
+
+ // Valid combinations of backends and targets
+ static int[][] backendTargetPairs = {
+ {opencv_dnn.DNN_BACKEND_OPENCV, opencv_dnn.DNN_TARGET_CPU},
+ {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA},
+ {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA_FP16},
+ {opencv_dnn.DNN_BACKEND_TIMVX, opencv_dnn.DNN_TARGET_NPU},
+ {opencv_dnn.DNN_BACKEND_CANN, opencv_dnn.DNN_TARGET_NPU}
+ };
+
+ static class Args {
+ @Parameter(names = {"--help", "-h"}, order = 0, help = true,
+ description = "Print help message.")
+ boolean help;
+ @Parameter(names = {"--model", "-m"}, order = 1,
+                description = "Set model path.")
+ String model = "text_detection_en_ppocrv3_2023may.onnx";
+ @Parameter(names = {"--input", "-i"}, order = 2,
+ description = "Path to input image or video file. Skip this argument to capture frames from a camera.")
+ String input;
+        @Parameter(names = "--width", order = 3,
+                description = "Resize input image to a certain width. It should be a multiple of 32.")
+        int width = 736;
+        @Parameter(names = "--height", order = 4,
+                description = "Resize input image to a certain height. It should be a multiple of 32.")
+        int height = 736;
+ @Parameter(names = "--binary_threshold", order = 5,
+ description = "Threshold of the binary map.")
+ float binaryThreshold = 0.3f;
+ @Parameter(names = "--polygon_threshold", order = 6,
+ description = "Threshold of polygons.")
+ float polygonThreshold = 0.5f;
+ @Parameter(names = "--max_candidates", order = 7,
+ description = "Set maximum number of polygon candidates.")
+ int maxCandidates = 200;
+ @Parameter(names = "--unclip_ratio", order = 8,
+ description = "The unclip ratio of the detected text region, which determines the output size.")
+ double unclipRatio = 2.0;
+ @Parameter(names = {"--save", "-s"}, order = 9,
+                description = "Specify to save a file with results (i.e. bounding box, confidence level). Invalid in case of camera input.")
+ boolean save;
+ @Parameter(names = {"--viz", "-v"}, order = 10,
+ description = "Specify to open a new window to show results. Invalid in case of camera input.")
+ boolean viz;
+ @Parameter(names = {"--backend", "-bt"}, order = 11,
+                description = "Choose one of the computation backends:" +
+ " 0: OpenCV implementation + CPU," +
+ " 1: CUDA + GPU (CUDA), " +
+ " 2: CUDA + GPU (CUDA FP16)," +
+ " 3: TIM-VX + NPU," +
+ " 4: CANN + NPU")
+ int backend = 0;
+ }
+
+ static class PPOCRDet {
+ private final TextDetectionModel_DB model;
+ private final Size inputSize;
+
+ public PPOCRDet(String modelPath, Size inputSize,
+ float binaryThreshold, float polygonThreshold, int maxCandidates, double unclipRatio,
+ int backendId, int targetId) {
+ this.inputSize = inputSize;
+
+ model = new TextDetectionModel_DB(modelPath);
+ model.setPreferableBackend(backendId);
+ model.setPreferableTarget(targetId);
+
+ model.setBinaryThreshold(binaryThreshold);
+ model.setPolygonThreshold(polygonThreshold);
+ model.setUnclipRatio(unclipRatio);
+ model.setMaxCandidates(maxCandidates);
+
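+            // Preprocessing: normalize with a 1/255 scale factor and per-channel mean subtraction, swapping BGR to RGB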
+ model.setInputParams(1.0 / 255.0, inputSize,
+ new Scalar(122.67891434, 116.66876762, 104.00698793, 0), true, false);
+ }
+
+        public Map.Entry<PointVectorVector, FloatPointer> infer(Mat image) {
+ if (image.rows() != inputSize.height()) {
+ throw new IllegalArgumentException("height of input image != net input size");
+ }
+ if (image.cols() != inputSize.width()) {
+ throw new IllegalArgumentException("width of input image != net input size");
+ }
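+            // detect() fills one polygon per text region (4-point quadrangles for DB) plus a confidence per region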
+ final PointVectorVector pt = new PointVectorVector();
+ final FloatPointer confidences = new FloatPointer();
+ model.detect(image, pt, confidences);
+ return new AbstractMap.SimpleEntry<>(pt, confidences);
+ }
+ }
+
+    static Mat visualize(Mat image, Map.Entry<PointVectorVector, FloatPointer> results, double fps, Scalar boxColor,
+ Scalar textColor, boolean isClosed, int thickness) {
+ final Mat output = new Mat();
+ image.copyTo(output);
+ if (fps > 0) {
+ putText(output, String.format("FPS: %.2f", fps), new Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
+ }
+ final PointVectorVector pvv = results.getKey();
+ final MatVector matVector = new MatVector();
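+        // Repack each PointVector into a contiguous Point buffer wrapped in a Mat so polylines() can draw it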
+ for (int i = 0; i < pvv.size(); i++) {
+ final PointVector pv = pvv.get(i);
+ final Point pts = new Point(pv.size());
+ for (int j = 0; j < pv.size(); j++) {
+ pts.position(j).x(pv.get(j).x()).y(pv.get(j).y());
+ }
+ matVector.push_back(new Mat(pts.position(0)));
+ }
+ polylines(output, matVector, isClosed, boxColor, thickness, LINE_AA, 0);
+ matVector.close();
+ return output;
+ }
+
+ /**
+ * Execute: mvn compile exec:java -q -Dexec.args=""
+ */
+ public static void main(String[] argv) {
+ final Args args = new Args();
+ final JCommander jc = JCommander.newBuilder()
+ .addObject(args)
+ .build();
+ jc.setUsageFormatter(new UnixStyleUsageFormatter(jc));
+ jc.parse(argv);
+ if (args.help) {
+ jc.usage();
+ return;
+ }
+ final int[] backendTargetPair = backendTargetPairs[args.backend];
+ if (args.model == null || args.model.isEmpty()) {
+ throw new IllegalArgumentException("Model name is empty");
+ }
+ final Size inpSize = new Size(args.width, args.height);
+
+ final PPOCRDet model = new PPOCRDet(args.model, inpSize,
+ args.binaryThreshold, args.polygonThreshold, args.maxCandidates, args.unclipRatio,
+ backendTargetPair[0], backendTargetPair[1]);
+
+ final VideoCapture cap = new VideoCapture();
+ if (args.input != null) {
+ cap.open(args.input);
+ } else {
+ cap.open(0);
+ }
+ if (!cap.isOpened()) {
+ throw new IllegalArgumentException("Cannot open video or file");
+ }
+ Mat originalImage = new Mat();
+
+ final OpenCVFrameConverter.ToMat converter = new OpenCVFrameConverter.ToMat();
+ CanvasFrame mainframe = null;
+ if (args.input == null || args.viz) {
+ mainframe = new CanvasFrame(args.model + " Demo", CanvasFrame.getDefaultGamma() / 2.2);
+ mainframe.setDefaultCloseOperation(javax.swing.JFrame.EXIT_ON_CLOSE);
+ mainframe.setVisible(true);
+ }
+
+ final Scalar boxColor = new Scalar(0, 255, 0, 0);
+ final Scalar textColor = new Scalar(0, 0, 255, 0);
+ final TickMeter tm = new TickMeter();
+
+ while (cap.read(originalImage)) {
+ final int originalW = originalImage.cols();
+ final int originalH = originalImage.rows();
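+            // Detections are produced at the network input size; these factors map them back to the original frame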
+ final double scaleHeight = originalH / (double) inpSize.height();
+ final double scaleWidth = originalW / (double) inpSize.width();
+ final Mat image = new Mat();
+ resize(originalImage, image, inpSize);
+
+ // inference
+ tm.start();
+            final Map.Entry<PointVectorVector, FloatPointer> results = model.infer(image);
+ tm.stop();
+ // Scale the results bounding box
+ final PointVectorVector pvv = results.getKey();
+ for (int i = 0; i < pvv.size(); i++) {
+ final PointVector pts = pvv.get(i);
+ for (int j = 0; j < pts.size(); j++) {
+ pts.get(j).x((int) (pts.get(j).x() * scaleWidth));
+ pts.get(j).y((int) (pts.get(j).y() * scaleHeight));
+ }
+ }
+
+ originalImage = visualize(originalImage, results, tm.getFPS(), boxColor, textColor, true, 2);
+ tm.reset();
+ if (args.input != null) {
+ if (args.save) {
+ System.out.println("Result image saved to result.jpg");
+ imwrite("result.jpg", originalImage);
+ }
+ if (args.viz) {
+ mainframe.showImage(converter.convert(originalImage));
+ }
+ } else {
+ mainframe.showImage(converter.convert(originalImage));
+ }
+
+ // clear
+ pvv.close();
+ image.close();
+ }
+ tm.close();
+ }
+
+}
diff --git a/models/text_detection_ppocr/pom.xml b/models/text_detection_ppocr/pom.xml
new file mode 100644
index 00000000..8571a0dd
--- /dev/null
+++ b/models/text_detection_ppocr/pom.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>opencv_zoo</groupId>
+        <artifactId>demo</artifactId>
+        <version>1.0.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>text_detection_ppocr</artifactId>
+
+</project>
\ No newline at end of file