#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

/*
 * Demonstrates that f2, f3 and f4 below are interchangeable when operating
 * on x_s.
 *
 * In a function signature the parameter spellings
 *     float x_s[nthreads]      float *x_s      float x_s[]
 * all declare the same type: an array parameter is adjusted to a pointer to
 * its element type. Consequently:
 *   1) An array argument decays to a pointer to its first element when it is
 *      passed to a function (the array itself is not a pointer, but the
 *      callee only ever sees one).
 *   2) The extent written in the parameter list is ignored and no bounds
 *      check is performed on the index.
 */

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

/*
 * Writes 100 + idx into x_s[idx].
 * The extent in the parameter declaration is purely decorative: the
 * parameter type is float*. The caller must ensure idx is in range.
 */
template <int nthreads>
__device__ void f2(int idx, float x_s[nthreads])
{
    x_s[idx] = 100.f + idx;
}

/* Writes 200 + idx into x_s[idx]; explicit pointer spelling. */
__device__ void f3(int idx, float *x_s)
{
    x_s[idx] = 200.f + idx;
}

/* Writes 300 + idx into x_s[idx]; unsized-array spelling (also float*). */
__device__ void f4(int idx, float x_s[])
{
    x_s[idx] = 300.f + idx;
}

/*
 * Copies one element per thread from x_d into a shared array, prints it,
 * overwrites it through f2/f3/f4 in turn, and prints it again.
 *
 * Launch layout: 1D grid of 1D blocks. Threads with threadIdx.x >= nthreads
 * do not touch shared or global memory but still reach every barrier.
 * Shared memory: nthreads floats, statically allocated per block.
 * Precondition: x_d holds at least gridDim.x * blockDim.x floats.
 */
template <int nthreads>
__global__ void f1(float *x_d)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // index into x_d
    const int tid = threadIdx.x;                            // index into x_s

    __shared__ float x_s[nthreads];

    // The shared array is per block, so it must be indexed with the
    // block-local thread id; the guard keeps oversized blocks in bounds.
    const bool active = tid < nthreads;

    if (active) {
        x_s[tid] = x_d[idx];
    }
    __syncthreads();

    if (active) {
        printf("%i %.2e %.2e\n", idx, x_d[idx], x_s[tid]);

        // These all do the same: each call receives a float* to x_s.
        f2<nthreads>(tid, x_s);
        f3(tid, x_s);
        f4(tid, x_s);
    }
    // Barriers stay outside the guard so every thread of the block hits them.
    __syncthreads();

    if (idx == 0) {
        printf("\n");
    }
    __syncthreads();

    if (active) {
        printf("%i %.2e %.2e\n", idx, x_d[idx], x_s[tid]);
    }
}

int main()
{
    float *x_h = nullptr;
    float *x_d = nullptr;
    constexpr int nx = 4;
    const size_t bytes = sizeof(float) * nx;

    // allocate, copy
    CUDA_CHECK(cudaSetDevice(0));

    x_h = (float *) malloc(bytes);
    if (x_h == nullptr) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }
    // NOTE(review): initial values are placeholders; confirm against the
    // intended input data.
    for (int ii = 0; ii < nx; ii++) {
        x_h[ii] = (float) ii;
    }

    CUDA_CHECK(cudaMalloc((void **) &x_d, bytes));
    CUDA_CHECK(cudaMemcpy(x_d, x_h, bytes, cudaMemcpyHostToDevice));

    // call kernel: one block of nx threads, shared array sized to match
    f1<nx> <<< 1, nx >>> (x_d);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // execution errors; flushes device printf

    // free
    CUDA_CHECK(cudaFree(x_d));
    free(x_h);

    return EXIT_SUCCESS;
}