微信公众号:OpenCV学堂
关注获取更多计算机视觉与深度学习知识
OpenCV支持的并行框架
1. Intel TBB (第三方库,需显式启用)
2. C=并行C/C++编程语言扩展 (第三方库,需显式启用)
3. OpenMP (编译器集成, 需显式启用)
4. APPLE GCD (苹果系统自动使用)
5. Windows RT并发(Windows RT自动使用)
6. Windows并发(运行时部分, Windows,MSVC++ >= 10自动使用)
7. Pthreads
在VS IDE中开启OpenMP,只需要右键点击项目,从属性中
这样就可以开启并行加速。
卷积并行实现与时间比较
parallel_for_
ParallelLoopBody
start = (double)cv::getTickCount();
for (int row = 0; row < rows; row++) {
for (int col = 1; col < cols - 1; col++)
{
int sum = src.at
(row, col) + src.at (row - 1, col) + src.at (row + 1, col) + src.at
(row, col - 1) + src.at (row - 1, col - 1) + src.at (row + 1, col - 1) + src.at
(row, col + 1) + src.at (row - 1, col + 1) + src.at (row + 1, col + 1); int pv = sum / 9;
dst.at
(row, col) = pv; }
}
double start = (double)cv::getTickCount();
parallel_for_(Range(0, rows * cols), [&](const Range &range)
{
for (int r = range.start; r < range.end; r++)
{
int i = r / cols, j = r % cols;
double value = 0;
for (int k = -sz; k <= sz; k++)
{
uchar *sptr = src.ptr(i + sz + k);
for (int l = -sz; l <= sz; l++)
{
value += kernel.ptr<double>(k + sz)[l + sz] * sptr[j + sz + l];
}
}
dst.ptr(i)[j] = saturate_cast
(value); }
});
double time = (((double)cv::getTickCount() - start)) / cv::getTickFrequency();
std::cout << "parallel_for_conv3x3 execute time: " << time * 1000 << " ms" << std::endl;
class parallelConvolution : public ParallelLoopBody
{
private:
Mat m_src, &m_dst;
Mat m_kernel;
int sz;
public:
parallelConvolution(Mat src, Mat &dst, Mat kernel)
: m_src(src), m_dst(dst), m_kernel(kernel)
{
sz = kernel.rows / 2;
}
virtual void operator()(const Range &range) const CV_OVERRIDE
{
for (int r = range.start; r < range.end; r++)
{
int i = r / m_src.cols, j = r % m_src.cols;
double value = 0;
for (int k = -sz; k <= sz; k++)
{
const uchar *sptr = m_src.ptr(i + sz + k);
for (int l = -sz; l <= sz; l++)
{
value += m_kernel.ptr<double>(k + sz)[l + sz] * sptr[j + sz + l];
}
}
m_dst.ptr(i)[j] = saturate_cast
(value); }
}
};
调用方式如下:
start = (double)cv::getTickCount();
parallelConvolution obj(src, dst, kernel);
parallel_for_(Range(0, rows * cols), obj);
time = (((double)cv::getTickCount() - start)) / cv::getTickFrequency();
std::cout << "parallelConvolution conv3x3 execute time: " << time * 1000 << " ms" << std::endl;
运行结果如下:
对此,我自己也有一些原因分析,但是更希望大家留言分析一下相关原因,为什么没有加速效果??
推荐阅读
OpenCV4.8+YOLOv8对象检测C++推理演示
总结 | OpenCV4 Mat操作全接触
三行代码实现 TensorRT8.6 C++ 深度学习模型部署
实战 | YOLOv8+OpenCV 实现DM码定位检测与解析
对象检测边界框损失 – 从IOU到ProbIOU
YOLOv8 OBB实现自定义旋转对象检测
初学者必看 | 学习深度学习的五个误区
YOLOv8自定义数据集训练实现安全帽检测