#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Check every CUDA runtime call; kernel launches are checked via
// cudaGetLastError() immediately after the launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// One explicit diffusion step on an Ncells x Ncells grid.
//
// Expected launch: 2D blocks of Nthreads x Nthreads threads, enough blocks
// to cover the grid. Nthreads is a compile-time template parameter so the
// shared-memory tile can be statically sized (32 is the maximum, since
// 32x32 = 1024 is the per-block thread limit).
//
// Shared memory: Nthreads x Nthreads floats (one tile, no halo).
// Because the tile carries NO halo cells, only the interior threads of each
// block (itx,ity in 1..Nthreads-2) can read all four neighbours from shared
// memory — block-edge cells are left untouched. This matches the original
// single-block usage (diffuse<32><<<(1,1,1),(32,32,1)>>>); for multi-block
// grids the block-boundary rows/columns are never updated.
//
// NOTE(review): the original stencil expression was lost in transit; a
// standard 5-point explicit diffusion update is reconstructed below —
// confirm coefficient against the original.
template <int Nthreads>
__global__ void diffuse(float *uold_d, float *unew_d, int Ncells)
{
    // thread index within the block
    int itx = threadIdx.x;
    int ity = threadIdx.y;
    // total index (over all blocks)
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;

    // Stage the tile in shared memory: each thread does one global read,
    // but the stencil below needs 5 reads per cell, so staging pays off.
    __shared__ float u_s[Nthreads][Nthreads];
    if (idx < Ncells && idy < Ncells) {
        u_s[ity][itx] = uold_d[idy * Ncells + idx];
    }
    // Barrier outside any divergent branch: all threads must reach it
    // before anyone reads a neighbour's tile entry.
    __syncthreads();

    // Update interior cells only: the guard keeps us off the tile edges
    // (no halo in shared memory) and off the physical grid boundary.
    if (itx > 0 && itx < Nthreads - 1 && ity > 0 && ity < Nthreads - 1 &&
        idx < Ncells - 1 && idy < Ncells - 1) {
        // D = alpha*dt/dx^2; must be <= 0.25f for explicit-scheme stability.
        const float D = 0.25f;
        unew_d[idy * Ncells + idx] =
            u_s[ity][itx] +
            D * (u_s[ity][itx + 1] + u_s[ity][itx - 1] +
                 u_s[ity + 1][itx] + u_s[ity - 1][itx] -
                 4.0f * u_s[ity][itx]);
    }
}

// Copy unew -> uold so the next step reads the freshly computed field.
// Same 2D launch configuration as diffuse; bounds-guarded for uneven grids.
__global__ void updateMem(float *uold_d, float *unew_d, int Ncells)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < Ncells && idy < Ncells) {
        uold_d[idy * Ncells + idx] = unew_d[idy * Ncells + idx];
    }
}

int main(void)
{
    // NOTE(review): the original problem size, step count, and initial
    // condition were lost in transit — values below are placeholders
    // consistent with the surviving single-block launch comment.
    const int Ncells = 32;   // grid is Ncells x Ncells
    const int Nsteps = 100;  // number of diffusion steps

    const size_t bytes = sizeof(float) * Ncells * Ncells;

    // host allocation + initial condition (single hot cell in the centre)
    float *u_h = (float *)malloc(bytes);
    if (u_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < Ncells * Ncells; i++) {
        u_h[i] = 0.0f;
    }
    u_h[(Ncells / 2) * Ncells + (Ncells / 2)] = 1.0f;

    // device allocation, both buffers initialised to the same field so the
    // un-updated boundary cells of unew_d hold valid values after step 1
    float *uold_d = NULL;
    float *unew_d = NULL;
    CUDA_CHECK(cudaMalloc(&uold_d, bytes));
    CUDA_CHECK(cudaMalloc(&unew_d, bytes));
    CUDA_CHECK(cudaMemcpy(uold_d, u_h, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(unew_d, u_h, bytes, cudaMemcpyHostToDevice));

    // 32x32 is the biggest block size we can have (1024-thread limit);
    // ceil-divide so any Ncells is covered.
    // alt: diffuse<32><<<(1,1,1),(32,32,1)>>>(...) for a single block
    dim3 nThreads(32, 32, 1);
    dim3 nBlocks((Ncells + 31) / 32, (Ncells + 31) / 32, 1);

    for (int step = 0; step < Nsteps; step++) {
        diffuse<32><<<nBlocks, nThreads>>>(uold_d, unew_d, Ncells);
        CUDA_CHECK(cudaGetLastError());
        // set uold to unew before the next step
        updateMem<<<nBlocks, nThreads>>>(uold_d, unew_d, Ncells);
        CUDA_CHECK(cudaGetLastError());
    }

    // copy back into host memory (cudaMemcpy blocks until kernels finish)
    CUDA_CHECK(cudaMemcpy(u_h, unew_d, bytes, cudaMemcpyDeviceToHost));

    // output array, one grid row per line
    for (int i = 0; i < Ncells; i++) {
        for (int j = 0; j < Ncells; j++) {
            printf("%f ", u_h[i * Ncells + j]);
        }
        printf("\n");
    }

    CUDA_CHECK(cudaFree(uold_d));
    CUDA_CHECK(cudaFree(unew_d));
    free(u_h);
    return EXIT_SUCCESS;
}