在32核的服务器上,运行一小段图像卷积的操作。输入是按列优先存储的图像矩阵,以及一个y
方向上的卷积kernel。kernel只保存非零系数,step是相邻非零系数之间的间隔。举例来说,
如果kernel是[-0.5 0 0 0 1 0 0 0 0.5],那么下面程序中的变量分别为
kernel: [-0.5 1 0.5]
szKernel: 3
step: 4
这段程序中使用了OpenMP对for循环执行并行操作。当使用>4个核时,程序运行速度反
而下降。这段程序要运行很多次,是整个程序的性能瓶颈,请大牛们指点一下该如何改
进。
double s = kernel[(szKernel-1)/2];
/*
 * Circular convolution of every image column with a sparse kernel.
 *
 * The data is addressed as v[x * m + y] with 0 <= x < n and 0 <= y < m.
 * kernel[] holds szKernel taps; tap k sits (k - half) * step samples away
 * from the centre tap, and samples that fall off one end of a column wrap
 * around to the other end of the same column.
 *
 * The whole computation runs in a single parallel loop over columns instead
 * of one sweep over the full image per tap.  Each thread finishes a column
 * (centre tap, then the remaining taps in ascending k) before moving on, so
 * there is one fork/join in total and a column is re-used right after it was
 * touched rather than re-read after the rest of the image has been swept.
 * The per-element order of additions is the same as in the multi-sweep
 * formulation (centre first, then taps in ascending k).
 *
 * NOTE(review): assumes v_in and v_out do not overlap -- confirm at the caller.
 */
#pragma omp parallel for
for (int x = 0; x < n; x++)
{
    const int half = (szKernel - 1) / 2;        /* index of the centre tap */
    const long long rows = m;

    if (rows <= 0)
        continue;                               /* empty columns: nothing to do, avoids % 0 */

    /* Widen before multiplying so large images do not overflow the index. */
    const long long col = (long long)x * rows;

    /* The centre tap initialises the output column. */
    for (long long y = 0; y < rows; y++)
        v_out[col + y] = v_in[col + y] * s;

    /* Every other tap accumulates a circularly shifted copy of the column. */
    for (int k = 0; k < szKernel; k++)
    {
        if (k == half)
            continue;

        const double w = kernel[k];

        /* Signed shift of this tap, reduced to the range [0, rows). */
        long long shift = ((long long)(k - half) * step) % rows;
        if (shift < 0)
            shift += rows;

        /* out[y] += in[(y + shift) mod rows], split at the wrap point. */
        const long long split = rows - shift;
        for (long long y = 0; y < split; y++)
            v_out[col + y] += v_in[col + y + shift] * w;
        for (long long y = split; y < rows; y++)
            v_out[col + y] += v_in[col + y - split] * w;
    }
}