// NOTE(review): the header names after the three original `#include`
// directives were not recoverable; the headers below are what this code
// needs (printf/fprintf, malloc/free/exit, device-side assert, CUDA runtime).
// Confirm against the original file.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Threads per block. Also used as the template argument that sizes the
// shared arrays, so the two cannot drift apart.
constexpr int kThreads = 4;

/*
 * Copies x_s into a second shared array y_s and prints, for each thread,
 * its own element and (for idx > 0) its left neighbour from both arrays.
 *
 * idx : index into x_s / y_s; must satisfy 0 <= idx < nthreads.
 * x_s : array of nthreads floats (the caller in this file passes shared memory).
 *
 * Contains a __syncthreads(), so every thread of the block must call it.
 */
template <int nthreads>
__device__ void f2(int idx, float x_s[nthreads])
{
    __shared__ float y_s[nthreads];
    int kk = 1;

    y_s[idx] = x_s[idx];
    // Barrier: y_s[idx-1] below is written by a different thread.
    __syncthreads();

    if (idx > 0)
    {
        printf("%i %i %.2e %.2e %.2e %.2e\n", kk, idx, x_s[idx], x_s[idx-1], y_s[idx], y_s[idx-1]);
    }
    else
    {
        printf("%i %i %.2e %.2e\n", kk, idx, x_s[idx], y_s[idx]);
    }
}

/*
 * Stages one float per thread from x_d into shared memory and hands the
 * shared array to f2 for printing.
 *
 * Launch requirements:
 *   - blockDim.x == nthreads (checked with a device-side assert);
 *   - x_d holds at least gridDim.x * blockDim.x floats.
 * Uses 2 * nthreads floats of static shared memory (x_s here, y_s in f2).
 */
template <int nthreads>
__global__ void f1(float *x_d)
{
    assert(blockDim.x == nthreads);

    // Global index selects the input element; the block-local index selects
    // the shared-memory slot. Indexing x_s with the global index would run
    // past the array for every block except block 0.
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    __shared__ float x_s[nthreads];

    x_s[tid] = x_d[gid];
    // Barrier: f2 reads x_s[tid-1], which another thread wrote.
    __syncthreads();

    //printf("%i %.2e %.2e\n", gid, x_d[gid], x_s[tid]);
    f2<nthreads>(tid, x_s);
}

/*
 * Host driver: fills a small host array, copies it to the device, launches
 * f1 with a single block of kThreads threads, and waits for completion so
 * the device-side printf output is flushed before the program exits.
 */
int main()
{
    float *x_h, *x_d;
    int nx = kThreads;

    // allocate, copy
    CUDA_CHECK(cudaSetDevice(0));
    x_h = (float *) malloc(sizeof(float)*nx);
    if (x_h == NULL)
    {
        fprintf(stderr, "host allocation of %d floats failed\n", nx);
        return EXIT_FAILURE;
    }
    // NOTE(review): the original initialiser was not recoverable from the
    // damaged source; (float) ii is a placeholder that gives each element a
    // distinct value. Confirm against the original file.
    for (int ii = 0; ii < nx; ii++)
    {
        x_h[ii] = (float) ii;
    }
    CUDA_CHECK(cudaMalloc((void **) &x_d, sizeof(float)*nx));
    CUDA_CHECK(cudaMemcpy(x_d, x_h, sizeof(float)*nx, cudaMemcpyHostToDevice));

    // run kernel
    f1<kThreads> <<< 1, kThreads >>> (x_d);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // execution errors + flush device printf

    // free
    CUDA_CHECK(cudaFree(x_d));
    free(x_h);
    return 0;
}