summaryrefslogtreecommitdiffstats
path: root/diff2d.cu
diff options
context:
space:
mode:
Diffstat (limited to 'diff2d.cu')
-rw-r--r--diff2d.cu127
1 files changed, 127 insertions, 0 deletions
diff --git a/diff2d.cu b/diff2d.cu
new file mode 100644
index 0000000..055520c
--- /dev/null
+++ b/diff2d.cu
@@ -0,0 +1,127 @@
+#include <cuda.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+template<int Nthreads, int Nhalo>
+__global__ void diffuse(float *uold_d, float *unew_d, int Ncells)
+{
+ // thread index, no block
+ int itx = threadIdx.x;
+ int ity = threadIdx.y;
+
+ // total index (over all blocks)
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ int idy = blockIdx.y * blockDim.y + threadIdx.y;
+
+ // allocate shared memory, copy over from global memory
+ // shared memory is faster than global memory
+ // this copying requires a thread to one a single r/w op
+ // the math later requires 5 reads to update one cell
+ // therefore, it's worth copying over first
+ __shared__ float u_s[Nthreads+2*Nhalo][Nthreads+2*Nhalo];
+ u_s[ity+Nhalo][itx+Nhalo] = uold_d[idy*Ncells+idx];
+
+ if(ity<Nhalo) {
+ u_s[ity][itx+Nhalo] = uold_d[(idy-Nhalo)*Ncells+idx];
+ u_s[Nthreads+2*Nhalo-1-ity][itx+Nhalo] = uold_d[(idy+Nthreads)*Ncells+idx];
+ if(ity==0 && itx==0) {
+ //printf("%i %i %i %i %i %.16f\n", blockIdx.x, blockIdx.y, Nt+2*Nhalo-1-ity, itx+Nhalo, (idy+Nt)*N+idx, u_s[Nt+2*Nhalo-1-ity][itx+Nhalo]);
+ }
+ }
+
+ // fill in block boundaries
+ if(itx<Nhalo) {
+ u_s[ity+Nhalo][itx] = uold_d[idy*Ncells+idx-Nhalo];
+ u_s[ity+Nhalo][Nthreads+2*Nhalo-1-itx] = uold_d[idy*Ncells+idx+Nthreads];
+ }
+
+ // to make sure all neighbouring cells are updated, we wait for all threads
+ __syncthreads();
+
+ // math, watch out when we're on the boundary
+ if(idx>0 && idx<Ncells-1 && idy>0 && idy<Ncells-1) {
+ unew_d[idy*Ncells+idx] = (u_s[ity+Nhalo][itx+Nhalo] + u_s[ity-1+Nhalo][itx+Nhalo] + u_s[ity+1+Nhalo][itx+Nhalo] + u_s[ity+Nhalo][itx-1+Nhalo] + u_s[ity+Nhalo][itx+1+Nhalo])/5.f;
+ }
+
+ //printf("%i %i %.2e\n", idx, idy, unew_d[idy*N+idx]);
+ if(blockIdx.x == 1 && blockIdx.y == 0) {
+ if(itx==0 && ity==31) {
+ // printf("%.16f %.16f %.16f %.16f %.16f %.16f\n", u_s[ity+Nhalo][itx+Nhalo], u_s[ity-1+Nhalo][itx+Nhalo], u_s[ity+1+Nhalo][itx+Nhalo], u_s[ity+Nhalo][itx-1+Nhalo], u_s[ity+Nhalo][itx+1+Nhalo], unew_d[idy*Ncells+idx]);
+ }
+ //if(u_s[ity+Nhalo][itx+Nhalo] != 0.f) {
+ // printf("%i %i %.16f\n", itx, ity, u_s[ity+Nhalo][itx+Nhalo]);
+ //}
+ }
+
+}
+
+__global__ void updateMem(float *uold_d, float *unew_d, int Ncells)
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ int idy = blockIdx.y * blockDim.y + threadIdx.y;
+ uold_d[idy*Ncells+idx] = unew_d[idy*Ncells+idx];
+ if(idx==32 && idy==32) {
+ // printf("%.16e\n", uold_d[idy*Ncells+idx]);
+ }
+}
+
+int main()
+{
+ int Ncells = 64; // Number of cells ALONE ONE DIMENSIONS
+ // We have NxN total cells
+
+ float *u_h, *uold_d, *unew_d; // Pointer party
+
+ // select a CUDA device
+ cudaSetDevice(0);
+
+ // allocate host memory
+ u_h = (float *)malloc(sizeof(float)*Ncells*Ncells);
+
+ // allocate device memory
+ cudaMalloc((void **) &uold_d, sizeof(float)*Ncells*Ncells);
+ cudaMalloc((void **) &unew_d, sizeof(float)*Ncells*Ncells);
+
+ // set zero
+ cudaMemset(unew_d, 0, sizeof(float)*Ncells*Ncells);
+
+ // fill array
+ for(int i=0; i<Ncells; i++) {
+ for(int j=0; j<Ncells; j++) {
+ u_h[j*Ncells+i] = 0.f;
+ }
+ }
+
+ // set some inital conditions
+ u_h[16*Ncells+16] = 1.f;
+ u_h[32*Ncells+32] = 1.f;
+
+ // copy to device
+ cudaMemcpy(uold_d, u_h, sizeof(float)*Ncells*Ncells, cudaMemcpyHostToDevice);
+
+ // this is how we define a 2D grid of threads and blocks
+ dim3 nThreads(32,32,1);
+ dim3 nBlocks(2,2,1);
+
+ for(int i=0; i<128; i++) {
+ // run our physics
+ // <32> [32x32] is the biggest block size we can have!
+ // alt: diffuse <32> <<< (1,1,1), (32,32,1) >>> (...)
+ diffuse <32,1> <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+ // set uold to unew before the next step
+ updateMem <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+ }
+
+ // copy back into host memory
+ cudaMemcpy(u_h, unew_d, sizeof(float)*Ncells*Ncells, cudaMemcpyDeviceToHost);
+
+ // output array
+ for(int i=0; i<Ncells; i++) {
+ for(int j=0; j<Ncells; j++) {
+ printf("%i %i %.16f \n", i, j, u_h[j*Ncells+i]);
+ }
+ }
+
+ return 0;
+}
+