I want to know the impact on performance when using cudaMalloc or cudaMalloc3D when allocating, copying and accessing memory for a 2D array. I have code that I tried to test the run time on where on one I use cudaMalloc and on the other cudaMalloc3D. I have included the code and is also hosted on here github-repo. An explanation on how the performance is impacted by either api would be much appreciated.

cudaMalloc code:

```
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError() ; \
if( e != cudaSuccess ) {\
printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
__global__ void solveWaveGPU ( REAL *uold, REAL *u, REAL *unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
unew[IC] = 2.0*u[IC] - uold[IC] + FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 100;
REAL *unew, *u, *uold, *uFinal, *x, *y; //pointers for the host side
REAL *d_unew, *d_u, *d_uold, *tmp; //pointers for the device
// variable declaration for timing
cudaEvent_t timeStart, timeStop;
cudaEventCreate(&timeStart);
cudaEventCreate(&timeStop);
float elapsedTime_gpu;
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
// create device copies of the variables
cudaMalloc( (void**) &d_unew, NX*NY*sizeof(REAL) ); cudaCheckError();
cudaMalloc( (void**) &d_u, NX*NY*sizeof(REAL) ); cudaCheckError();
cudaMalloc( (void**) &d_uold, NX*NY*sizeof(REAL) ); cudaCheckError();
meshGrid( x, y );
initWave( unew, u, uold, x, y );
// start timing the GPU
cudaMemcpy( d_u, u, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
cudaMemcpy( d_uold, uold, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
cudaMemcpy( d_unew, unew, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
// launch the GPU kernel
cudaEventRecord(timeStart, 0);
for (INT n=1; n<nTimeSteps+1; n++) {
solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
cudaDeviceSynchronize();
cudaCheckError();
tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = tmp;
}
cudaEventRecord(timeStop, 0);
cudaEventSynchronize(timeStop);
cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);
cudaMemcpy( uFinal, d_u, NX*NY*sizeof(REAL), cudaMemcpyDeviceToHost ); cudaCheckError();
printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);
free(unew); free(u); free(uold);
cudaFree(d_unew); cudaFree(d_u); cudaFree(d_uold);
free(uFinal); free(x); free(y);
cudaEventDestroy(timeStart);
cudaEventDestroy(timeStop);
return (0);
}
```

cudaMalloc3D code:

```
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define NZ 1 /* needed for cudaMalloc3D */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError() ; \
if( e != cudaSuccess ) {\
printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
char *d_u = (char *) u.ptr;
char *d_uold = (char *) uold.ptr;
char *d_unew = (char *) unew.ptr;
REAL *u_row = (REAL *)(d_u + j * u.pitch);
REAL u_IP1 = ( (REAL *)(d_u + (j+1) * u.pitch) )[i];
REAL u_IM1 = ( (REAL *)(d_u + (j-1) * u.pitch) )[i];
REAL u_JP1 = u_row[i+1];
REAL u_JM1 = u_row[i-1];
REAL u_IC = u_row[i];
REAL uold_IC = ( (REAL *)(d_uold + j * uold.pitch) )[i];
REAL *unew_row = (REAL *)(d_unew + j * unew.pitch);
unew_row[i] = 2.0 * u_IC - uold_IC + FACTOR * ( u_IP1 + u_IM1 + u_JP1 + u_JM1 - 4.0 * u_IC );
}
}
void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 100;
REAL *unew, *u, *uold, *uFinal, *x, *y; //pointers for the host side
// variable declaration for timing
cudaEvent_t timeStart, timeStop;
cudaEventCreate(&timeStart);
cudaEventCreate(&timeStop);
float elapsedTime_gpu;
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
cudaExtent myExtent = make_cudaExtent(NX * sizeof(REAL), NY, NZ);
cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
// create device copies of the variables
cudaMalloc3D( &d_u , myExtent ); cudaCheckError();
cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();
meshGrid( x, y );
initWave( unew, u, uold, x, y );
cudaMemcpy3DParms cpy3D = { 0 };
cpy3D.extent = myExtent;
cpy3D.kind = cudaMemcpyHostToDevice;
// copy 3D from u to d_u
cpy3D.srcPtr = make_cudaPitchedPtr(u, NX*sizeof(REAL), NX, NY);
cpy3D.dstPtr = d_u;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// copy 3D from uold to d_uold
cpy3D.srcPtr = make_cudaPitchedPtr(uold, NX*sizeof(REAL), NX, NY);
cpy3D.dstPtr = d_uold;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
// launch the GPU kernel
// start timing the GPU
cudaEventRecord(timeStart, 0);
for (INT n=1; n<nTimeSteps+1; n++) {
solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
cudaDeviceSynchronize();
cudaCheckError();
d_tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = d_tmp;
}
cudaEventRecord(timeStop, 0);
cudaEventSynchronize(timeStop);
cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);
// copy 3D from d_u to uFinal
cpy3D.kind = cudaMemcpyDeviceToHost;
cpy3D.srcPtr = d_u;
cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, NX*sizeof(REAL), NX, NY);
cudaMemcpy3D( &cpy3D ); cudaCheckError();
printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);
free(u); cudaFree(d_unew.ptr);
free(uold); cudaFree(d_u.ptr);
free(unew); cudaFree(d_uold.ptr);
free(uFinal); free(x); free(y);
cudaEventDestroy(timeStart);
cudaEventDestroy(timeStop);
return (0);
}
```

Timing:

```
cudaMalloc3D: 1.192510 s
cudaMalloc: 0.960322 s
```

Machine specification:

```
GNU/Linux x86_64
NVIDIA GeForce GTX Titan CC: 3.5
CUDA ver 7.0
```