Okay, 没代码没真相 ("no code, no truth").
Here is my rough code, which I want to share with you all.
The output and analysis will be posted soon.
//file name matrix_op.c
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <emmintrin.h>
/*
 * _MX_INIT(type_t, row, col)
 *
 * Allocates a matrix as an array of `row` pointers, each pointing to its own
 * block of `col` elements of type_t, and yields it as a `type_t **`.
 * Element values are left uninitialized.
 *
 * `row` and `col` are evaluated exactly once. If any allocation fails, the
 * rows allocated so far are released and the macro yields NULL.
 *
 * Uses a GNU C statement expression; the internal names carry a trailing
 * underscore so they cannot capture identifiers used in the arguments.
 */
#define _MX_INIT(type_t, row, col) ({ \
    int mx_init_rows_ = (row); \
    int mx_init_cols_ = (col); \
    int mx_init_i_; \
    type_t **mx_init_ret_ = malloc((size_t) mx_init_rows_ * sizeof *mx_init_ret_); \
    for (mx_init_i_ = 0; \
         mx_init_ret_ != NULL && mx_init_i_ < mx_init_rows_; \
         ++mx_init_i_) { \
        mx_init_ret_[mx_init_i_] = \
            malloc((size_t) mx_init_cols_ * sizeof **mx_init_ret_); \
        if (mx_init_ret_[mx_init_i_] == NULL) { \
            while (mx_init_i_ > 0) \
                free(mx_init_ret_[--mx_init_i_]); \
            free(mx_init_ret_); \
            mx_init_ret_ = NULL; \
        } \
    } \
    mx_init_ret_; \
})
/*
 * _MX_FREE(mx, row)
 *
 * Releases a matrix laid out as an array of `row` separately allocated rows:
 * frees each of the `row` row blocks, then the pointer array itself.
 *
 * `mx` and `row` are evaluated exactly once, and a NULL `mx` is ignored.
 * The macro does not modify the caller's pointer variable.
 */
#define _MX_FREE(mx, row) ({ \
    __typeof__(mx) mx_free_m_ = (mx); \
    int mx_free_rows_ = (row); \
    int mx_free_i_; \
    if (mx_free_m_) { \
        for (mx_free_i_ = 0; mx_free_i_ < mx_free_rows_; ++mx_free_i_) \
            free(mx_free_m_[mx_free_i_]); \
        free(mx_free_m_); \
    } \
})
/*
 * _MX_GEN(type_t, row, col, randfunc)
 *
 * Allocates a row x col matrix with _MX_INIT and fills every element by
 * evaluating the expression `randfunc`, yielding the matrix as `type_t **`.
 *
 * `row` and `col` are evaluated once; `randfunc` is deliberately re-evaluated
 * for each element, in row-major order, so that e.g. `rand()` produces a
 * fresh value per cell. The fill is skipped when _MX_INIT yields NULL, and
 * that NULL is passed through to the caller.
 */
#define _MX_GEN(type_t, row, col, randfunc) ({ \
    int mx_gen_rows_ = (row); \
    int mx_gen_cols_ = (col); \
    int mx_gen_i_, mx_gen_j_; \
    type_t **mx_gen_ret_ = _MX_INIT(type_t, mx_gen_rows_, mx_gen_cols_); \
    for (mx_gen_i_ = 0; mx_gen_ret_ && mx_gen_i_ < mx_gen_rows_; ++mx_gen_i_) \
        for (mx_gen_j_ = 0; mx_gen_j_ < mx_gen_cols_; ++mx_gen_j_) \
            mx_gen_ret_[mx_gen_i_][mx_gen_j_] = (randfunc); \
    mx_gen_ret_; \
})
/*
 * Baseline element-wise matrix addition using plain double indexing.
 *
 * row, col: dimensions shared by both inputs.
 * a, b:     input matrices, each an array of `row` pointers to `col` ints.
 *
 * Returns a newly allocated matrix (obtained from _MX_INIT) holding
 * a[r][c] + b[r][c] in every cell; the caller releases it with _MX_FREE.
 */
int **mx_add_naive(int row, int col, int *const *a, int *const *b)
{
    int **sum = _MX_INIT(int, row, col);

    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            sum[r][c] = a[r][c] + b[r][c];
        }
    }

    return sum;
}
/*
 * Element-wise matrix addition that hoists the three row pointers out of the
 * inner loop, so the inner loop is a plain one-dimensional add.
 *
 * row, col: dimensions shared by both inputs.
 * a, b:     input matrices, each an array of `row` pointers to `col` ints;
 *           they are only read.
 *
 * Returns a newly allocated matrix (obtained from _MX_INIT) that the caller
 * releases with _MX_FREE, or NULL when _MX_INIT yields NULL.
 */
int **mx_add_better(int row, int col, int *const *a, int *const *b)
{
    int i, j;
    int **m = _MX_INIT(int, row, col);

    if (m == NULL)
        return NULL;

    for (i = 0; i < row; ++i) {
        /*
         * Cache the row pointers once per row. The output row was just
         * allocated, so it cannot overlap either input row; `restrict`
         * tells the compiler that, letting it keep values in registers
         * and vectorize the loop.
         */
        const int *ap = a[i], *bp = b[i];
        int *restrict mp = m[i];

        for (j = 0; j < col; ++j)
            mp[j] = ap[j] + bp[j];
    }
    return m;
}
/*
 * Element-wise matrix addition using SSE2, four ints per instruction.
 *
 * row, col: dimensions shared by both inputs.
 * a, b:     input matrices, each an array of `row` pointers to `col` ints;
 *           they are only read.
 *
 * Returns a newly allocated matrix (obtained from _MX_INIT) that the caller
 * releases with _MX_FREE, or NULL when _MX_INIT yields NULL.
 *
 * Assumes `int` is 32 bits wide, so that one 128-bit register holds exactly
 * four elements and _mm_add_epi32 adds element-by-element. (A 64-bit lane
 * add such as _mm_add_epi64 would let the carry out of one int spill into
 * its neighbour.) The vector add wraps around on overflow.
 */
int **mx_add_sse2(int row, int col, int *const *a, int *const *b)
{
    enum { LANES = 4 };         /* 32-bit ints per 128-bit SSE2 register */
    /* Largest multiple of LANES that does not exceed col. */
    const int vec_end = LANES * (col / LANES);
    int i, j;
    int **m = _MX_INIT(int, row, col);

    if (m == NULL)
        return NULL;

    for (i = 0; i < row; ++i) {
        const int *ap = a[i], *bp = b[i];
        int *mp = m[i];

        /*
         * Vector part. Rows come from malloc with no 16-byte alignment
         * guarantee, so use the unaligned load/store forms and write all
         * four results straight into the output row.
         */
        for (j = 0; j < vec_end; j += LANES) {
            __m128i va = _mm_loadu_si128((const __m128i *) (ap + j));
            __m128i vb = _mm_loadu_si128((const __m128i *) (bp + j));
            _mm_storeu_si128((__m128i *) (mp + j), _mm_add_epi32(va, vb));
        }

        /* Scalar tail: the last col % LANES elements of the row. */
        for (j = vec_end; j < col; ++j)
            mp[j] = ap[j] + bp[j];
    }
    return m;
}
/*
 * Benchmark driver.
 *
 * Usage: prog [size]
 *   size: optional positive integer; both matrix dimensions are set to it
 *         (default 100 x 100).
 *
 * Fills two size x size int matrices with random values, then runs
 * mx_add_naive, mx_add_better and mx_add_sse2 on them in turn. For each one
 * it prints to stderr the CPU time taken (which includes the allocation of
 * the result matrix done inside the add function) and the centre element of
 * the result, so the three outputs can be compared.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on a bad argument or when a matrix
 * could not be allocated.
 */
int main(int argc, char *argv[])
{
    clock_t t;
    double secs;
    int col = 100, row = 100;
    int **a = NULL, **b = NULL, **m = NULL;
    int status = EXIT_FAILURE;

    if (argc > 1) {
        char *end;
        long size;

        /* strtol instead of atoi: rejects garbage, zero, negative and
         * out-of-range sizes, which would otherwise index out of bounds. */
        errno = 0;
        size = strtol(argv[1], &end, 10);
        if (errno != 0 || end == argv[1] || *end != '\0'
            || size < 1 || size > INT_MAX) {
            fprintf(stderr, "usage: %s [size]  (size: integer in 1..%d)\n",
                    argv[0], INT_MAX);
            return EXIT_FAILURE;
        }
        row = col = (int) size;
    }

    srand((unsigned) time(NULL));
    /* Halve each random value so that the sum of two of them never exceeds
     * RAND_MAX and therefore cannot overflow an int. */
    a = _MX_GEN(int, row, col, rand() / 2);
    b = _MX_GEN(int, row, col, rand() / 2);
    if (a == NULL || b == NULL)
        goto out;

    t = clock();
    m = mx_add_naive(row, col, a, b);
    secs = (double) (clock() - t) / CLOCKS_PER_SEC;
    if (m == NULL)
        goto out;
    fprintf(stderr, "naive add: %lf second; m[%d][%d]=%d \n",
            secs, row / 2, col / 2, m[row / 2][col / 2]);
    _MX_FREE(m, row);

    t = clock();
    m = mx_add_better(row, col, a, b);
    secs = (double) (clock() - t) / CLOCKS_PER_SEC;
    if (m == NULL)
        goto out;
    fprintf(stderr, "better add: %lf second; m[%d][%d]=%d \n",
            secs, row / 2, col / 2, m[row / 2][col / 2]);
    _MX_FREE(m, row);

    t = clock();
    m = mx_add_sse2(row, col, a, b);
    secs = (double) (clock() - t) / CLOCKS_PER_SEC;
    if (m == NULL)
        goto out;
    fprintf(stderr, "sse2 add: %lf second; m[%d][%d]=%d \n",
            secs, row / 2, col / 2, m[row / 2][col / 2]);
    _MX_FREE(m, row);

    status = EXIT_SUCCESS;

out:
    if (status != EXIT_SUCCESS)
        fprintf(stderr, "matrix allocation failed\n");
    /* Guarded so that a matrix that was never allocated is not freed. */
    if (a != NULL)
        _MX_FREE(a, row);
    if (b != NULL)
        _MX_FREE(b, row);
    return status;
}