summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVolker Hoffmann <volker@cheleb.net>2014-06-23 18:39:19 +0200
committerVolker Hoffmann <volker@cheleb.net>2014-06-23 18:39:19 +0200
commit9cb32ed15f59d1c942dd63cab00aca845cda2b8f (patch)
tree4dc307fd54fcd06ac310617542b72aa495ea68d8
parentd7b894f738547e3e385cf06e10b8347c747b0415 (diff)
2d diffusion code, blocks cannot communicate
-rw-r--r--diff2d_old.cu102
1 files changed, 102 insertions, 0 deletions
diff --git a/diff2d_old.cu b/diff2d_old.cu
new file mode 100644
index 0000000..bac7124
--- /dev/null
+++ b/diff2d_old.cu
@@ -0,0 +1,102 @@
+#include <cuda.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+template<int Nthreads>
+__global__ void diffuse(float *uold_d, float *unew_d, int Ncells)
+{
+ // thread index, no block
+ int itx = threadIdx.x;
+ int ity = threadIdx.y;
+
+ // total index (over all blocks)
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ int idy = blockIdx.y * blockDim.y + threadIdx.y;
+
+ // allocate shared memory, copy over from global memory
+ // shared memory is faster than global memory
+ // this copying requires a thread to one a single r/w op
+ // the math later requires 5 reads to update one cell
+ // therefore, it's worth copying over first
+ __shared__ float u_s[Nthreads][Nthreads];
+ u_s[ity][itx] = uold_d[idy*Ncells+idx];
+
+ // to make sure all neighbouring cells are updated, we wait for all threads
+ __syncthreads();
+
+ // math, watch out when we're on the boundary
+ if(itx>0 && itx<Nthreads-1 && ity>0 && ity<Nthreads-1) {
+ unew_d[idy*Ncells+idx] = (u_s[ity][itx] + u_s[ity-1][itx] + u_s[ity+1][itx] +
+ u_s[ity][itx-1] + u_s[ity][itx+1])/5.f;
+ }
+
+ // debug
+ //printf("%i %i %.2e\n", idx, idy, unew_d[idy*N+idx]);
+}
+
+__global__ void updateMem(float *uold_d, float *unew_d, int Ncells)
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ int idy = blockIdx.y * blockDim.y + threadIdx.y;
+ uold_d[idy*Ncells+idx] = unew_d[idy*Ncells+idx];
+}
+
+int main()
+{
+ int Ncells = 64; // Number of cells ALONG ONE DIMENSION
+ // We have NxN total cells
+
+ float *u_h, *uold_d, *unew_d; // Memory pointers
+
+ // select a CUDA device
+ cudaSetDevice(0);
+
+ // allocate host memory
+ u_h = (float *)malloc(sizeof(float)*Ncells*Ncells);
+
+ // allocate device memory
+ cudaMalloc((void **) &uold_d, sizeof(float)*Ncells*Ncells);
+ cudaMalloc((void **) &unew_d, sizeof(float)*Ncells*Ncells);
+
+ // set zero
+ cudaMemset(unew_d, 0, sizeof(float)*Ncells*Ncells);
+
+ // fill array
+ for(int i=0; i<Ncells; i++) {
+ for(int j=0; j<Ncells; j++) {
+ u_h[j*Ncells+i] = 0.f;
+ }
+ }
+
+ // put in some data
+ u_h[22*Ncells+22] = 1.f;
+
+ // copy to device
+ cudaMemcpy(uold_d, u_h, sizeof(float)*Ncells*Ncells, cudaMemcpyHostToDevice);
+
+ // this is how we define a 2D grid of threads and blocks
+ dim3 nThreads(32,32,1);
+ dim3 nBlocks(2,2,1);
+
+ // main loop, let's do 10 timesteps
+ for(int i=0; i<120; i++) {
+ // run our physics
+ // <32> [32x32] is the biggest block size we can have!
+ // alt: diffuse <32> <<< (1,1,1), (32,32,1) >>> (...)
+ diffuse <32> <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+ // set uold to unew before the next step
+ updateMem <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+ }
+
+ // copy back into host memory
+ cudaMemcpy(u_h, unew_d, sizeof(float)*Ncells*Ncells, cudaMemcpyDeviceToHost);
+
+ // output array
+ for(int i=0; i<Ncells; i++) {
+ for(int j=0; j<Ncells; j++) {
+ printf("%i %i %.16f \n", i, j, u_h[j*Ncells+i]);
+ }
+ }
+
+ return 0;
+}