Here I want to calculate the distance of each two points, and decide if they are neighbours. here is my simple code in cuda.

```
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
```

The DataPoint is a struct is

```
typedef struct DataPoint {
float pfDimens[3];
} DataPoint;
```

so here i want to reduce the time, How can i do? I have tried to use memory coalesing and share memory, but i didn't get a good speed up?

===============use share memory==============

```
__global__ void calcNeighbors2(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[threadsPerBlock];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
```

Here i changed the neighbors[tid*N+i] to neighbors[i*N+tid], it give me amlost 8x speed up on Tesla K10.G2.8GB. But when i use share memory to store some points, it is no use?

# Best How To :

There are at least 4 ideas, some of which have already been stated in the comments:

Transform your point distance storage from AoS format:

```
struct DataPoint {
float pfDimens[3];
};
```

to SoA format:

```
struct DataPoint {
float pfDimens_x[NPTS];
float pfDimens_y[NPTS];
float pfDimens_z[NPTS];
};
```

this will enable full coalescing on loading of the data. In fact, to help with point 4 below, I would just switch to using 3 bare arrays, rather than a structure.

reduce the computation to (slightly less than) half:

```
for (int i=N-1; i>tid; i--) {
```

then, either in the thread code itself, or in the host, you can populate the other "half" of the output matrix by copying data.

Transpose the storage in your output matrix, so that you can write a storage operation like this:

```
neighbors[i*N+tid] = true;
```

which will nicely coalesce, as opposed to this:

```
neighbors[tid*N+i] = true;
```

which will not.

Since your input point data is read only, mark the kernel parameter appropriately:

```
const float * __restrict__ points_x, const float * __restrict__ points_y, const float * __restrict__ points_z
```

in some cases, and on some GPUs, this will often lead to a speed-up due to use of the read-only cache. If you really want to get aggressive with caching, and your data array is small enough (4K or less `float`

points), you could put a copy of the point data in global memory as well as a copy in `__constant__`

memory, and load the "uniform" load you are doing here through constant memory:

```
DataPoint p2 = c_points[i];
```

thus you could perform the coalesced load through the read-only cache, the uniform load through the constant cache, and the coalesced store going to ordinary global memory.

On a K40c, on linux/CUDA 7, for `N`

= 4096, the net effect of these changes appears to be about a 3.5x speedup, at the kernel level:

```
$ cat t749.cu
#include <stdio.h>
#define N 4096
// if N is 16K/3 or less, we can use constant
#define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 0.004903s, t2: 0.001395s
$
```

In the case of K40c, the limited number of blocks being launched above (16) is a significant impediment to performance, due to latency. If we comment out the `USE_CONSTANT`

define, and change `N`

to 16384, we observe an even higher speedup with the improved kernel:

```
$ ./t749
t1: 0.267107s, t2: 0.008209s
$
```

the resultant ~48 blocks being enough to approximately "fill" the K40c which has 15 SMs.

**EDIT:** now that you've posted a shared memory kernel, I added it to my test case as `calcNeighbors3`

and compared it's timing performance (as `t3`

). It is almost as fast as my kernel, and it seems to provide the correct result (matches your original kernel) so I'm not sure what your concerns are.

Here's the updated code and test case:

```
$ cat t749.cu
#include <stdio.h>
#include <math.h>
#define imin(X,Y) ((X)<(Y))?(X):(Y)
#define N 32768
// if N is 16K/3 or less, we can use constant
// #define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
__global__ void calcNeighbors3(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[nTPB];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2, *hn3;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
hn3=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
#ifdef USE_CONSTANT
cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
#endif
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t3 = dtime_usec(0);
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 3 error");
t3 = dtime_usec(t3);
cudaMemcpy(hn3, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC, t3/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("1:2 mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
for (int i = 0; i < N*N; i++)
if (hn1[i] != hn3[i]) {printf("1:3 mismatch at %d, was: %d, should be: %d\n", i, hn1[i], hn3[i]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 1.260010s, t2: 0.022661s, t3: 0.029632s
$
```

For this test, I have changed the data set size to 32768 since that is closer to the range you care about. Your shared memory kernel shows about a 42x speedup over your original kernel, and my kernel shows about a 55x speedup, on my K40c.