diff options
author | Volker Hoffmann <volker@cheleb.net> | 2014-06-23 18:39:19 +0200 |
---|---|---|
committer | Volker Hoffmann <volker@cheleb.net> | 2014-06-23 18:39:19 +0200 |
commit | 9cb32ed15f59d1c942dd63cab00aca845cda2b8f (patch) | |
tree | 4dc307fd54fcd06ac310617542b72aa495ea68d8 | |
parent | d7b894f738547e3e385cf06e10b8347c747b0415 (diff) |
2d diffusion code, blocks cannot communicate
-rw-r--r-- | diff2d_old.cu | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/diff2d_old.cu b/diff2d_old.cu new file mode 100644 index 0000000..bac7124 --- /dev/null +++ b/diff2d_old.cu @@ -0,0 +1,102 @@ +#include <cuda.h> +#include <stdio.h> +#include <stdlib.h> + +template<int Nthreads> +__global__ void diffuse(float *uold_d, float *unew_d, int Ncells) +{ + // thread index, no block + int itx = threadIdx.x; + int ity = threadIdx.y; + + // total index (over all blocks) + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * blockDim.y + threadIdx.y; + + // allocate shared memory, copy over from global memory + // shared memory is faster than global memory + // this copying requires a thread to one a single r/w op + // the math later requires 5 reads to update one cell + // therefore, it's worth copying over first + __shared__ float u_s[Nthreads][Nthreads]; + u_s[ity][itx] = uold_d[idy*Ncells+idx]; + + // to make sure all neighbouring cells are updated, we wait for all threads + __syncthreads(); + + // math, watch out when we're on the boundary + if(itx>0 && itx<Nthreads-1 && ity>0 && ity<Nthreads-1) { + unew_d[idy*Ncells+idx] = (u_s[ity][itx] + u_s[ity-1][itx] + u_s[ity+1][itx] + + u_s[ity][itx-1] + u_s[ity][itx+1])/5.f; + } + + // debug + //printf("%i %i %.2e\n", idx, idy, unew_d[idy*N+idx]); +} + +__global__ void updateMem(float *uold_d, float *unew_d, int Ncells) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * blockDim.y + threadIdx.y; + uold_d[idy*Ncells+idx] = unew_d[idy*Ncells+idx]; +} + +int main() +{ + int Ncells = 64; // Number of cells ALONG ONE DIMENSION + // We have NxN total cells + + float *u_h, *uold_d, *unew_d; // Memory pointers + + // select a CUDA device + cudaSetDevice(0); + + // allocate host memory + u_h = (float *)malloc(sizeof(float)*Ncells*Ncells); + + // allocate device memory + cudaMalloc((void **) &uold_d, sizeof(float)*Ncells*Ncells); + cudaMalloc((void **) &unew_d, sizeof(float)*Ncells*Ncells); + + // set zero + cudaMemset(unew_d, 0, sizeof(float)*Ncells*Ncells); + + // fill array + for(int i=0; i<Ncells; i++) { + for(int j=0; j<Ncells; j++) { + u_h[j*Ncells+i] = 0.f; + } + } + + // put in some data + u_h[22*Ncells+22] = 1.f; + + // copy to device + cudaMemcpy(uold_d, u_h, sizeof(float)*Ncells*Ncells, cudaMemcpyHostToDevice); + + // this is how we define a 2D grid of threads and blocks + dim3 nThreads(32,32,1); + dim3 nBlocks(2,2,1); + + // main loop, let's do 10 timesteps + for(int i=0; i<120; i++) { + // run our physics + // <32> [32x32] is the biggest block size we can have! + // alt: diffuse <32> <<< (1,1,1), (32,32,1) >>> (...) + diffuse <32> <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells); + // set uold to unew before the next step + updateMem <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells); + } + + // copy back into host memory + cudaMemcpy(u_h, unew_d, sizeof(float)*Ncells*Ncells, cudaMemcpyDeviceToHost); + + // output array + for(int i=0; i<Ncells; i++) { + for(int j=0; j<Ncells; j++) { + printf("%i %i %.16f \n", i, j, u_h[j*Ncells+i]); + } + } + + return 0; +} |