写了一个简单的在 global memory 上做矩阵相乘的代码,代码和 GPU 配置如下。问题是:代码
跑 1024x1024 的矩阵没有问题,但是跑到 2048x2048 时结果就是错的(全是 0)。请问是我 GPU
的硬件问题吗?还是 kernel 里面 index 的问题?我跟踪了 cudaError,提示出错的是下面这
一句:
cudaMemcpy(c, d_c, M * N * sizeof(T), cudaMemcpyDeviceToHost);
可是我怀疑是 kernel 里面 index 的问题,因为如果我注释掉 kernel 里面下面这个循环
for (int k = 0; k < S; ++k)
val += a[row * S + k] * b[k * N + col];
则无论输入矩阵有多大都没有问题。但是我现在找不出代码中的错误,请大家帮我看看
。感激不尽!
// Naive global-memory matrix multiply: c = a * b.
//
// All three matrices are dense and row-major, as fixed by the indexing below:
//   a is M x S  (element (r, k) at a[r * S + k])
//   b is S x N  (element (k, j) at b[k * N + j])
//   c is M x N  (element (r, j) at c[r * N + j])
//
// Launch layout: 1-D grid of 1-D blocks. Each thread produces one or more
// output elements through a grid-stride loop, so any positive grid/block size
// yields a complete result (the launch does not have to cover M * N exactly).
// No shared memory is used.
//
// NOTE(review): the template parameter list was absent in the text as found
// (apparently stripped as markup); `typename T` is restored here.
template <typename T>
__global__ void ker_matmul(T *a, T *b, T *c, int M, int S, int N) {
    // 64-bit indices: M * N and row * S can exceed the int range for large inputs.
    const size_t total  = (size_t)M * (size_t)N;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total; idx += stride) {
        // c has N columns, so the flat index must be split by N (not M);
        // splitting by M is only correct for square outputs.
        const size_t row = idx / N;
        const size_t col = idx % N;

        T val = 0;
        for (int k = 0; k < S; ++k)
            val += a[row * S + k] * b[(size_t)k * N + col];
        c[idx] = val;
    }
}
// Host wrapper for ker_matmul: computes c = a * b on the GPU.
//
//   a       host pointer, dims_a[0] x dims_a[1] elements, row-major
//   dims_a  {rows, cols} of a
//   b       host pointer, dims_b[0] x dims_b[1] elements, row-major
//   dims_b  {rows, cols} of b
//   c       host pointer receiving dims_a[0] x dims_b[1] elements
//
// If the inner dimensions disagree, the function returns without touching c.
// Otherwise c is zero-filled first; should any CUDA call fail, the remaining
// GPU work is skipped, the device buffers are released, and c keeps whatever
// it held at that point (zeros unless the final copy itself failed).
//
// NOTE(review): the template parameter list and the <<<grid, block>>> launch
// configuration were absent in the text as found (apparently stripped as
// markup) and are restored here. If cuArr is itself a class template the
// qualifier must read cuArr<T>:: - confirm against the class declaration.
template <typename T>
void cuArr::cu_matmul(T *a, int *dims_a, T *b, int *dims_b, T *c)
{
    // The function returns void, so a bare return (not `return 0`) is required.
    if (dims_a[1] != dims_b[0]) return;

    const int M = dims_a[0], S = dims_a[1], N = dims_b[1];
    if (M <= 0 || N <= 0) return;           // empty result: nothing to write

    // Element counts in size_t so the products cannot overflow int.
    const size_t elemsA = (size_t)M * (size_t)S;
    const size_t elemsB = (size_t)S * (size_t)N;
    const size_t elemsC = (size_t)M * (size_t)N;

    fill_n(c, elemsC, 0);
    if (S <= 0) return;                     // empty inner dimension: product is all zeros

    T *d_a = 0, *d_b = 0, *d_c = 0;

    // Each step runs only if every previous one succeeded, so a failure is
    // never masked by a later call overwriting the status.
    cudaError_t cudaStatus = cudaMalloc((void **)&d_a, elemsA * sizeof(T));
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMalloc((void **)&d_b, elemsB * sizeof(T));
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMalloc((void **)&d_c, elemsC * sizeof(T));
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(d_a, a, elemsA * sizeof(T), cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(d_b, b, elemsB * sizeof(T), cudaMemcpyHostToDevice);

    if (cudaStatus == cudaSuccess) {
        const int threadsPerBlk = 1024;
        // Round up so a partial last block is still launched; truncating
        // division drops the tail and yields 0 blocks when M * N < 1024.
        const unsigned int numBlks =
            (unsigned int)((elemsC + threadsPerBlk - 1) / threadsPerBlk);

        ker_matmul<<<numBlks, threadsPerBlk>>>(d_a, d_b, d_c, M, S, N);

        // Launch-configuration errors are reported here ...
        cudaStatus = cudaGetLastError();
        // ... while faults during kernel execution only surface at the next
        // synchronizing call. Synchronize explicitly so they are attributed to
        // the kernel rather than to the cudaMemcpy that follows.
        if (cudaStatus == cudaSuccess)
            cudaStatus = cudaDeviceSynchronize();
    }

    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(c, d_c, elemsC * sizeof(T), cudaMemcpyDeviceToHost);

    // Final status cannot be returned through the void interface.
    (void)cudaStatus;

    // Pointers that were never allocated are still null, which cudaFree accepts.
    cudaFree(d_c);
    cudaFree(d_b);
    cudaFree(d_a);
}
GeForce GT 720, Kepler, 0.967 GHz
--------------------------------------
SM | 2
SP (SP per SM) | 384(192)
global mem | 1 GB
const mem | 64 KB
warp size | 32
regs/blk | 65536
threads/blk | 1024
shared mem/blk | 48 KB
block dims | 1024 x 1024 x 64
grid dims | 2^21 x 2^11 x 2^11
--------------------------------------