I am trying to allocate and copy memory of a flattened 2D array on to the device using cudaMalloc3D to test the performance of cudaMalloc3D. But when I try to write to the array from the kernel it throws 'an illegal memory access was encountered' exception. The program runs fine if I am just reading from the array but when I try to write to it, there is an error. Any help on this will be greatly appreciated. Below is my code and the syntax for compiling the code.
Compile using
nvcc -O2 -arch sm_20 test.cu
Code: test.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define NZ 1 /* needed for cudaMalloc3D */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
// Macro for checking CUDA errors following a CUDA launch or API call
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError();\
if( e != cudaSuccess ) {\
printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
char *unewPtr = (char *) unew.ptr;
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 10;
// pointers for the host side
REAL *unew, *u, *uold, *uFinal, *x, *y;
// allocate memory on the host
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
// pointer for the device side
size_t pitch = NX * sizeof(REAL);
cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ);
// allocate 3D memory on the device
cudaMalloc3D( &d_u, myExtent ); cudaCheckError();
cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();
// initialize grid and wave
meshGrid( x, y );
initWave( u, uold, x, y );
// copy host memory to 3D device memory
cudaMemcpy3DParms cpy3D = { 0 };
cpy3D.kind = cudaMemcpyHostToDevice;
// copying u to d_u
cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY);
cpy3D.dstPtr = d_u;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// copying uold to d_uold
cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY);
cpy3D.dstPtr = d_uold;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
for ( INT n = 1; n < nTimeSteps + 1; n++ ) {
solveWaveGPU <<< dimGrid, dimBlock >>> ( d_uold, d_u, d_unew );
cudaThreadSynchronize();
cudaCheckError();
d_tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = d_tmp;
}
// copy the memory back to host
cpy3D.kind = cudaMemcpyDeviceToHost;
// copying d_unew to uFinal
cpy3D.srcPtr = d_unew;
cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY);
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
free(u); cudaFree(d_u.ptr);
free(unew); cudaFree(d_unew.ptr);
free(uold); cudaFree(d_uold.ptr);
free(uFinal); free(x); free(y);
return EXIT_SUCCESS;
}