diff options
Diffstat (limited to 'diff2d.cu')
-rw-r--r-- | diff2d.cu | 127 |
1 files changed, 127 insertions, 0 deletions
diff --git a/diff2d.cu b/diff2d.cu new file mode 100644 index 0000000..055520c --- /dev/null +++ b/diff2d.cu @@ -0,0 +1,127 @@ +#include <cuda.h> +#include <stdio.h> +#include <stdlib.h> + +template<int Nthreads, int Nhalo> +__global__ void diffuse(float *uold_d, float *unew_d, int Ncells) +{ + // thread index, no block + int itx = threadIdx.x; + int ity = threadIdx.y; + + // total index (over all blocks) + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * blockDim.y + threadIdx.y; + + // allocate shared memory, copy over from global memory + // shared memory is faster than global memory + // this copying requires a thread to one a single r/w op + // the math later requires 5 reads to update one cell + // therefore, it's worth copying over first + __shared__ float u_s[Nthreads+2*Nhalo][Nthreads+2*Nhalo]; + u_s[ity+Nhalo][itx+Nhalo] = uold_d[idy*Ncells+idx]; + + if(ity<Nhalo) { + u_s[ity][itx+Nhalo] = uold_d[(idy-Nhalo)*Ncells+idx]; + u_s[Nthreads+2*Nhalo-1-ity][itx+Nhalo] = uold_d[(idy+Nthreads)*Ncells+idx]; + if(ity==0 && itx==0) { + //printf("%i %i %i %i %i %.16f\n", blockIdx.x, blockIdx.y, Nt+2*Nhalo-1-ity, itx+Nhalo, (idy+Nt)*N+idx, u_s[Nt+2*Nhalo-1-ity][itx+Nhalo]); + } + } + + // fill in block boundaries + if(itx<Nhalo) { + u_s[ity+Nhalo][itx] = uold_d[idy*Ncells+idx-Nhalo]; + u_s[ity+Nhalo][Nthreads+2*Nhalo-1-itx] = uold_d[idy*Ncells+idx+Nthreads]; + } + + // to make sure all neighbouring cells are updated, we wait for all threads + __syncthreads(); + + // math, watch out when we're on the boundary + if(idx>0 && idx<Ncells-1 && idy>0 && idy<Ncells-1) { + unew_d[idy*Ncells+idx] = (u_s[ity+Nhalo][itx+Nhalo] + u_s[ity-1+Nhalo][itx+Nhalo] + u_s[ity+1+Nhalo][itx+Nhalo] + u_s[ity+Nhalo][itx-1+Nhalo] + u_s[ity+Nhalo][itx+1+Nhalo])/5.f; + } + + //printf("%i %i %.2e\n", idx, idy, unew_d[idy*N+idx]); + if(blockIdx.x == 1 && blockIdx.y == 0) { + if(itx==0 && ity==31) { + // printf("%.16f %.16f %.16f %.16f %.16f %.16f\n", u_s[ity+Nhalo][itx+Nhalo], u_s[ity-1+Nhalo][itx+Nhalo], u_s[ity+1+Nhalo][itx+Nhalo], u_s[ity+Nhalo][itx-1+Nhalo], u_s[ity+Nhalo][itx+1+Nhalo], unew_d[idy*Ncells+idx]); + } + //if(u_s[ity+Nhalo][itx+Nhalo] != 0.f) { + // printf("%i %i %.16f\n", itx, ity, u_s[ity+Nhalo][itx+Nhalo]); + //} + } + +} + +__global__ void updateMem(float *uold_d, float *unew_d, int Ncells) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * blockDim.y + threadIdx.y; + uold_d[idy*Ncells+idx] = unew_d[idy*Ncells+idx]; + if(idx==32 && idy==32) { + // printf("%.16e\n", uold_d[idy*Ncells+idx]); + } +} + +int main() +{ + int Ncells = 64; // Number of cells ALONE ONE DIMENSIONS + // We have NxN total cells + + float *u_h, *uold_d, *unew_d; // Pointer party + + // select a CUDA device + cudaSetDevice(0); + + // allocate host memory + u_h = (float *)malloc(sizeof(float)*Ncells*Ncells); + + // allocate device memory + cudaMalloc((void **) &uold_d, sizeof(float)*Ncells*Ncells); + cudaMalloc((void **) &unew_d, sizeof(float)*Ncells*Ncells); + + // set zero + cudaMemset(unew_d, 0, sizeof(float)*Ncells*Ncells); + + // fill array + for(int i=0; i<Ncells; i++) { + for(int j=0; j<Ncells; j++) { + u_h[j*Ncells+i] = 0.f; + } + } + + // set some inital conditions + u_h[16*Ncells+16] = 1.f; + u_h[32*Ncells+32] = 1.f; + + // copy to device + cudaMemcpy(uold_d, u_h, sizeof(float)*Ncells*Ncells, cudaMemcpyHostToDevice); + + // this is how we define a 2D grid of threads and blocks + dim3 nThreads(32,32,1); + dim3 nBlocks(2,2,1); + + for(int i=0; i<128; i++) { + // run our physics + // <32> [32x32] is the biggest block size we can have! + // alt: diffuse <32> <<< (1,1,1), (32,32,1) >>> (...) + diffuse <32,1> <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells); + // set uold to unew before the next step + updateMem <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells); + } + + // copy back into host memory + cudaMemcpy(u_h, unew_d, sizeof(float)*Ncells*Ncells, cudaMemcpyDeviceToHost); + + // output array + for(int i=0; i<Ncells; i++) { + for(int j=0; j<Ncells; j++) { + printf("%i %i %.16f \n", i, j, u_h[j*Ncells+i]); + } + } + + return 0; +} + |