Commit 385bd6f

Merge pull request #3891 from CodeLinaro:3rdPost
FastCV extension 3rd Post #3891

Adding FastCV extensions for the merge, split, gemm and arithm (add, subtract) APIs.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
1 parent bdc0518 commit 385bd6f

File tree

10 files changed: +530 -12 lines changed

modules/fastcv/include/opencv2/fastcv.hpp

+2-1
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */

@@ -11,6 +11,7 @@
 #include "opencv2/fastcv/arithm.hpp"
 #include "opencv2/fastcv/bilateralFilter.hpp"
 #include "opencv2/fastcv/blur.hpp"
+#include "opencv2/fastcv/channel.hpp"
 #include "opencv2/fastcv/cluster.hpp"
 #include "opencv2/fastcv/draw.hpp"
 #include "opencv2/fastcv/edges.hpp"

modules/fastcv/include/opencv2/fastcv/arithm.hpp

+40-1
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */

@@ -8,6 +8,10 @@

 #include <opencv2/core.hpp>

+#define FCV_CMP_EQ(val1,val2) (fabs(val1 - val2) < FLT_EPSILON)
+
+#define FCV_OPTYPE(depth,op) ((depth<<3) + op)
+
 namespace cv {
 namespace fastcv {

@@ -26,6 +30,41 @@ CV_EXPORTS_W void matmuls8s32(InputArray src1, InputArray src2, OutputArray dst)

 //! @}

+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Arithmetic add and subtract operations for two matrices
+ *        It is optimized for Qualcomm's processors
+ * @param src1 First source matrix, can be of type CV_8U, CV_16S, CV_32F.
+ *             Note: CV_32F not supported for subtract
+ * @param src2 Second source matrix of same type and size as src1
+ * @param dst Resulting matrix of type as src mats
+ * @param op type of operation - 0 for add and 1 for subtract
+ */
+CV_EXPORTS_W void arithmetic_op(InputArray src1, InputArray src2, OutputArray dst, int op);
+
+//! @}
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Matrix multiplication of two float type matrices
+ *        R = a*A*B + b*C where A,B,C,R are matrices and a,b are constants
+ *        It is optimized for Qualcomm's processors
+ * @param src1 First source matrix of type CV_32F
+ * @param src2 Second source matrix of type CV_32F with same rows as src1 cols
+ * @param dst Resulting matrix of type CV_32F
+ * @param alpha multiplying factor for src1 and src2
+ * @param src3 Optional third matrix of type CV_32F to be added to matrix product
+ * @param beta multiplying factor for src3
+ */
+CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, OutputArray dst, float alpha = 1.0,
+                       InputArray src3 = noArray(), float beta = 0.0);
+
+//! @}
+
 } // fastcv::
 } // cv::

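Below is a brief usage sketch for the two declarations added in this header. It is not part of the commit: the matrix sizes and fill values are illustrative, and it assumes OpenCV was built with the fastcv extra module (FastCV-capable Qualcomm hardware at runtime).

```cpp
#include <opencv2/fastcv.hpp>

int main()
{
    // Element-wise add / subtract: op = 0 -> add, op = 1 -> subtract (CV_32F is supported only for add).
    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(100));
    cv::Mat b(480, 640, CV_8UC1, cv::Scalar(27));
    cv::Mat sum, diff;
    cv::fastcv::arithmetic_op(a, b, sum, 0);
    cv::fastcv::arithmetic_op(a, b, diff, 1);

    // R = alpha*A*B + beta*C on CV_32F matrices.
    cv::Mat A(64, 32, CV_32FC1), B(32, 48, CV_32FC1), C(64, 48, CV_32FC1), R;
    cv::randu(A, cv::Scalar::all(-1), cv::Scalar::all(1));
    cv::randu(B, cv::Scalar::all(-1), cv::Scalar::all(1));
    cv::randu(C, cv::Scalar::all(-1), cv::Scalar::all(1));
    cv::fastcv::gemm(A, B, R, 0.5f, C, 2.0f);
    return 0;
}
```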

modules/fastcv/include/opencv2/fastcv/channel.hpp

+45
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef OPENCV_FASTCV_CHANNEL_HPP
+#define OPENCV_FASTCV_CHANNEL_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace fastcv {
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Creates one multi-channel mat out of several single-channel CV_8U mats.
+ *        Optimized for Qualcomm's processors
+ * @param mv input vector of matrices to be merged; all the matrices in mv must be of CV_8UC1 and have the same size
+ *           Note: numbers of mats can be 2, 3 or 4.
+ * @param dst output array of depth CV_8U and same size as mv[0]; the number of channels
+ *            will be the total number of matrices in the matrix array
+ */
+CV_EXPORTS_W void merge(InputArrayOfArrays mv, OutputArray dst);
+
+//! @}
+
+//! @addtogroup fastcv
+//! @{
+
+/**
+ * @brief Splits a CV_8U multi-channel mat into several CV_8UC1 mats
+ *        Optimized for Qualcomm's processors
+ * @param src input 2, 3 or 4 channel mat of depth CV_8U
+ * @param mv output vector of size src.channels() of CV_8UC1 mats
+ */
+CV_EXPORTS_W void split(InputArray src, OutputArrayOfArrays mv);
+
+//! @}
+
+} // fastcv::
+} // cv::
+
+#endif // OPENCV_FASTCV_CHANNEL_HPP
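A short usage sketch for the new merge/split declarations (again, not part of the commit; sizes are illustrative and the fastcv extra module is assumed to be available):

```cpp
#include <opencv2/fastcv.hpp>
#include <vector>

int main()
{
    // Three CV_8UC1 planes of identical size (2, 3 or 4 planes are supported).
    std::vector<cv::Mat> planes = {
        cv::Mat(240, 320, CV_8UC1, cv::Scalar(10)),
        cv::Mat(240, 320, CV_8UC1, cv::Scalar(20)),
        cv::Mat(240, 320, CV_8UC1, cv::Scalar(30))
    };

    cv::Mat interleaved;
    cv::fastcv::merge(planes, interleaved);   // -> one CV_8UC3 mat

    std::vector<cv::Mat> channels;
    cv::fastcv::split(interleaved, channels); // -> 3 x CV_8UC1
    return 0;
}
```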

modules/fastcv/include/opencv2/fastcv/pyramid.hpp

+2-2
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */

@@ -16,7 +16,7 @@ namespace fastcv {

 /**
  * @brief Creates a gradient pyramid from an image pyramid
- *
+ *        Note: The borders are ignored during gradient calculation.
  * @param pyr Input pyramid of 1-channel 8-bit images. Only continuous images are supported.
  * @param dx Horizontal Sobel gradient pyramid of the same size as pyr
  * @param dy Verical Sobel gradient pyramid of the same size as pyr

modules/fastcv/perf/perf_matmul.cpp

+34-1
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */

@@ -10,6 +10,9 @@ namespace opencv_test {
 typedef std::tuple<int /*rows1*/, int /*cols1*/, int /*cols2*/> MatMulPerfParams;
 typedef perf::TestBaseWithParam<MatMulPerfParams> MatMulPerfTest;

+typedef std::tuple<int /*rows1*/, int /*cols1*/, int /*cols2*/, float> MatMulGemmPerfParams;
+typedef perf::TestBaseWithParam<MatMulGemmPerfParams> MatMulGemmPerfTest;
+
 PERF_TEST_P(MatMulPerfTest, run,
             ::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1
                                ::testing::Values(8, 16, 128, 256), // cols1
@@ -37,4 +40,34 @@ PERF_TEST_P(MatMulPerfTest, run,
     SANITY_CHECK_NOTHING();
 }

+PERF_TEST_P(MatMulGemmPerfTest, run,
+            ::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1
+                               ::testing::Values(8, 16, 128, 256), // cols1
+                               ::testing::Values(8, 16, 128, 256), // cols2
+                               ::testing::Values(2.5, 5.8)) // alpha
+           )
+{
+    auto p = GetParam();
+    int rows1 = std::get<0>(p);
+    int cols1 = std::get<1>(p);
+    int cols2 = std::get<2>(p);
+    float alpha = std::get<3>(p);
+
+    RNG& rng = cv::theRNG();
+    Mat src1(rows1, cols1, CV_32FC1), src2(cols1, cols2, CV_32FC1);
+    cvtest::randUni(rng, src1, Scalar::all(-128.0), Scalar::all(128.0));
+    cvtest::randUni(rng, src2, Scalar::all(-128.0), Scalar::all(128.0));
+
+    Mat dst;
+
+    while (next())
+    {
+        startTimer();
+        cv::fastcv::gemm(src1, src2, dst, alpha, noArray(), 0);
+        stopTimer();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace

modules/fastcv/src/arithm.cpp

+149-1
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */

@@ -32,5 +32,153 @@ void matmuls8s32(InputArray _src1, InputArray _src2, OutputArray _dst)
                      (int32_t*)dst.data, dst.step);
 }

+void arithmetic_op(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
+{
+    CV_Assert(!_src1.empty() && (_src1.depth() == CV_8U || _src1.depth() == CV_16S || _src1.depth() == CV_32F));
+    CV_Assert(!_src2.empty() && _src2.type() == _src1.type());
+    CV_Assert(_src2.size() == _src1.size());
+
+    Mat src1 = _src1.getMat();
+    Mat src2 = _src2.getMat();
+
+    _dst.create(_src1.rows(), _src1.cols(), _src1.type());
+    Mat dst = _dst.getMat();
+
+    INITIALIZATION_CHECK;
+
+    fcvConvertPolicy policy = FASTCV_CONVERT_POLICY_SATURATE;
+
+    int nStripes = cv::getNumThreads();
+
+    int func = FCV_OPTYPE(_src1.depth(), op);
+    switch(func)
+    {
+        case FCV_OPTYPE(CV_8U, 0):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                int rangeHeight = range.end - range.start;
+                const uchar* yS1 = src1.data + static_cast<size_t>(range.start)*src1.step[0];
+                const uchar* yS2 = src2.data + static_cast<size_t>(range.start)*src2.step[0];
+                uchar* yD = dst.data + static_cast<size_t>(range.start)*dst.step[0];
+                fcvAddu8(yS1, src1.cols, rangeHeight, src1.step[0],
+                         yS2, src2.step[0], policy, yD, dst.step[0]);
+            }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_16S, 0):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                int rangeHeight = range.end - range.start;
+                const short* yS1 = (short*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(short));
+                const short* yS2 = (short*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(short));
+                short* yD = (short*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(short));
+                fcvAdds16_v2(yS1, src1.cols, rangeHeight, src1.step[0],
+                             yS2, src2.step[0], policy, yD, dst.step[0]);
+            }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_32F, 0):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                int rangeHeight = range.end - range.start;
+                const float* yS1 = (float*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(float));
+                const float* yS2 = (float*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(float));
+                float* yD = (float*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(float));
+                fcvAddf32(yS1, src1.cols, rangeHeight, src1.step[0],
+                          yS2, src2.step[0], yD, dst.step[0]);
+            }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_8U, 1):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                int rangeHeight = range.end - range.start;
+                const uchar* yS1 = src1.data + static_cast<size_t>(range.start)*src1.step[0];
+                const uchar* yS2 = src2.data + static_cast<size_t>(range.start)*src2.step[0];
+                uchar* yD = dst.data + static_cast<size_t>(range.start)*dst.step[0];
+                fcvSubtractu8(yS1, src1.cols, rangeHeight, src1.step[0],
+                              yS2, src2.step[0], policy, yD, dst.step[0]);
+            }, nStripes);
+            break;
+        case FCV_OPTYPE(CV_16S, 1):
+            cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
+                int rangeHeight = range.end - range.start;
+                const short* yS1 = (short*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(short));
+                const short* yS2 = (short*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(short));
+                short* yD = (short*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(short));
+                fcvSubtracts16(yS1, src1.cols, rangeHeight, src1.step[0],
+                               yS2, src2.step[0], policy, yD, dst.step[0]);
+            }, nStripes);
+            break;
+        default:
+            CV_Error(cv::Error::StsBadArg, cv::format("op type is not supported"));
+            break;
+    }
+}
+
+
+void gemm(InputArray _src1, InputArray _src2, OutputArray _dst, float alpha, InputArray _src3, float beta)
+{
+    CV_Assert(!_src1.empty() && _src1.type() == CV_32FC1);
+    CV_Assert(_src1.cols() == _src2.rows());
+    Mat src1 = _src1.getMat();
+
+    CV_Assert(!_src2.empty() && _src2.type() == CV_32FC1);
+    Mat src2 = _src2.getMat();
+
+    bool isSrc3 = !_src3.empty();
+
+    Mat src3 = _src3.getMat();
+
+    _dst.create(_src1.rows(), _src2.cols(), CV_32FC1);
+
+    Mat dst = _dst.getMat();
+
+    CV_Assert(!FCV_CMP_EQ(alpha,0));
+
+    cv::Mat dst_temp1, dst_temp2;
+    float *dstp = NULL;
+    bool inplace = false;
+    size_t dst_stride;
+    fcvStatus status = FASTCV_SUCCESS;
+
+    int n = src1.cols, m = src1.rows, k = src2.cols;
+
+    INITIALIZATION_CHECK;
+
+    if(src1.data == dst.data || src2.data == dst.data || (isSrc3 && (src3.data == dst.data)))
+    {
+        dst_temp1 = cv::Mat(m, k, CV_32FC1);
+        dstp = dst_temp1.ptr<float>();
+        inplace = true;
+        dst_stride = dst_temp1.step[0];
+    }
+    else
+    {
+        dstp = (float32_t*)dst.data;
+        dst_stride = dst.step[0];
+    }
+    float32_t *dstp1 = dstp;
+    status = fcvMatrixMultiplyf32_v2((float32_t*)src1.data, n, m, src1.step[0], (float32_t*)src2.data, k,
+                                     src2.step[0], dstp, dst_stride);
+
+    bool isAlpha = !(FCV_CMP_EQ(alpha,0) || FCV_CMP_EQ(alpha,1));
+    if(isAlpha && status == FASTCV_SUCCESS)
+    {
+        status = fcvMultiplyScalarf32(dstp, k, m, dst_stride, alpha, dstp1, dst_stride);
+    }
+
+    if(isSrc3 && (!FCV_CMP_EQ(beta,0)) && status == FASTCV_SUCCESS)
+    {
+        cv::Mat dst3 = cv::Mat(m, k, CV_32FC1);
+        if(!FCV_CMP_EQ(beta,1))
+        {
+            status = fcvMultiplyScalarf32((float32_t*)src3.data, k, m, src3.step[0], beta, (float32_t*)dst3.data, dst3.step[0]);
+            if(status == FASTCV_SUCCESS)
+                fcvAddf32_v2(dstp, k, m, dst_stride, (float32_t*)dst3.data, dst3.step[0], dstp1, dst_stride);
+        }
+        else
+            fcvAddf32_v2(dstp, k, m, dst_stride, (float32_t*)src3.data, src3.step[0], dstp1, dst_stride);
+    }
+
+    if(inplace == true)
+    {
+        dst_temp1(cv::Rect(0, 0, k, m)).copyTo(dst(cv::Rect(0, 0, k, m)));
+    }
+}
+
 } // fastcv::
 } // cv::
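For readers following the switch in arithmetic_op: FCV_OPTYPE packs the OpenCV depth constant and the requested operation into a single integer key, so one switch statement selects the matching FastCV kernel. A standalone illustration (not part of the commit) of how the keys resolve, using OpenCV's depth constants (CV_8U = 0, CV_16S = 3, CV_32F = 5):

```cpp
#include <opencv2/core.hpp>
#include <cstdio>

// Same macro as in arithm.hpp: depth in the upper bits, op (0 = add, 1 = subtract) in the low bits.
#define FCV_OPTYPE(depth,op) ((depth<<3) + op)

int main()
{
    std::printf("CV_8U  add -> %d (handled by fcvAddu8)\n",       FCV_OPTYPE(CV_8U, 0));   // 0*8 + 0 = 0
    std::printf("CV_16S add -> %d (handled by fcvAdds16_v2)\n",   FCV_OPTYPE(CV_16S, 0));  // 3*8 + 0 = 24
    std::printf("CV_32F add -> %d (handled by fcvAddf32)\n",      FCV_OPTYPE(CV_32F, 0));  // 5*8 + 0 = 40
    std::printf("CV_8U  sub -> %d (handled by fcvSubtractu8)\n",  FCV_OPTYPE(CV_8U, 1));   // 0*8 + 1 = 1
    std::printf("CV_16S sub -> %d (handled by fcvSubtracts16)\n", FCV_OPTYPE(CV_16S, 1));  // 3*8 + 1 = 25
    return 0;
}
```

Any other combination, for example CV_32F with subtract, falls through to the default case and raises StsBadArg, matching the note in arithm.hpp that CV_32F is not supported for subtract. The gemm implementation above also guards in-place calls: when dst aliases one of the inputs it writes into a temporary CV_32FC1 buffer and copies the result back at the end.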
