diff options
-rw-r--r-- | Makefile | 7 | ||||
-rw-r--r-- | pointerTest.cu | 118 |
2 files changed, 123 insertions, 2 deletions
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SM=35
 ARCH= -arch sm_${SM}
 
-all: advection diff2d diff2d_old functionTest incrementArrays hydro1d moveArrays
+all: advection diff2d diff2d_old functionTest incrementArrays hydro1d moveArrays pointerTest
 
 advection:
 	nvcc $(ARCH) -o advection advection.cu
@@ -24,6 +24,9 @@ hydro1d:
 moveArrays:
 	nvcc $(ARCH) -o moveArrays moveArrays.cu
 
+pointerTest:
+	nvcc $(ARCH) -o pointerTest pointerTest.cu
+
 clean:
-	rm -rf advection diff2d diff2d_old functionTest incrementArrays hydro1d moveArrays
+	rm -rf advection diff2d diff2d_old functionTest incrementArrays hydro1d moveArrays pointerTest
 
diff --git a/pointerTest.cu b/pointerTest.cu
new file mode 100644
--- /dev/null
+++ b/pointerTest.cu
@@ -0,0 +1,118 @@
+#include <assert.h>
+#include <cuda.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * The functions f2, f3, and f4 operating on x_s are all the same.
+ * The reason is that, as function parameters, `float x_s[4]`, `float x_s[]`
+ * and `float *x_s` all declare a parameter of type `float *`.
+ * I.e.
+ * 1) An array argument decays to a pointer to its first element
+ *    (the array is not copied).
+ * 2) There is no bounds check on arrays: the size written in the
+ *    parameter declaration is ignored.
+ */
+
+/* Abort with file, line and error string if a CUDA runtime call fails. */
+#define CUDA_CHECK(call)                                                  \
+  do {                                                                    \
+    cudaError_t err_ = (call);                                            \
+    if (err_ != cudaSuccess) {                                            \
+      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
+              cudaGetErrorString(err_));                                  \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  } while (0)
+
+/* Array parameter with an explicit size; the size is not enforced. */
+template <int nthreads>
+__device__ void f2(int idx, float x_s[nthreads])
+{
+  x_s[idx] = 100.f + idx;
+}
+
+/* Plain pointer parameter. */
+__device__ void f3(int idx, float *x_s)
+{
+  x_s[idx] = 200.f + idx;
+}
+
+/* Array parameter without a size. */
+__device__ void f4(int idx, float x_s[])
+{
+  x_s[idx] = 300.f + idx;
+}
+
+/*
+ * Copies x_d into the shared array x_s and prints both, lets f2, f3 and f4
+ * overwrite x_s in turn, then prints again: the last writer (f4) wins.
+ *
+ * Launch with blockDim.x <= nthreads; x_d must hold at least
+ * gridDim.x * blockDim.x floats. x_d is only read.
+ */
+template <int nthreads>
+__global__ void f1(float *x_d)
+{
+  // x_s exists once per block, so it is indexed with the thread's position
+  // inside the block; only x_d uses the grid-wide index.
+  const int tid = threadIdx.x;
+  const int idx = blockIdx.x * blockDim.x + tid;
+
+  // A larger block would write past the end of x_s.
+  assert(blockDim.x <= (unsigned int) nthreads);
+
+  __shared__ float x_s[nthreads];
+  x_s[tid] = x_d[idx];
+  __syncthreads();
+  printf("%i %.2e %.2e\n", idx, x_d[idx], x_s[tid]);
+
+  // All three receive the same pointer and write the same element.
+  f2 <nthreads> (tid, x_s);
+  f3(tid, x_s);
+  f4(tid, x_s);
+
+  __syncthreads();
+
+  if(idx==0) {
+    printf("\n");
+  }
+  __syncthreads();
+
+  printf("%i %.2e %.2e\n", idx, x_d[idx], x_s[tid]);
+}
+
+int main()
+{
+  float *x_h, *x_d;
+  const int nx = 4;   // number of elements; also the block size in f1
+  const size_t nbytes = sizeof(float)*nx;
+
+  // allocate, copy
+  CUDA_CHECK(cudaSetDevice(0));
+  x_h = (float *) malloc(nbytes);
+  if(x_h == NULL) {
+    fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long) nbytes);
+    return EXIT_FAILURE;
+  }
+  for(int ii=0; ii<nx; ii++) {
+    x_h[ii] = ii;
+    printf("%i %.2e\n", ii, x_h[ii]);
+  }
+  printf("\n");
+  CUDA_CHECK(cudaMalloc((void**) &x_d, nbytes));
+  CUDA_CHECK(cudaMemcpy(x_d, x_h, nbytes, cudaMemcpyHostToDevice));
+
+  // run kernel
+  f1 <nx> <<< 1, nx >>> (x_d);
+  CUDA_CHECK(cudaGetLastError());  // reports an invalid launch configuration
+  // Device printf output is flushed at synchronisation points, and errors
+  // raised while the kernel runs are only reported there.
+  CUDA_CHECK(cudaDeviceSynchronize());
+
+  // free
+  CUDA_CHECK(cudaFree(x_d));
+  free(x_h);
+  return 0;
+}
+