From cb4dabf1eb1c1ba77af4a98e2cab2149b7728a93 Mon Sep 17 00:00:00 2001
From: Volker Hoffmann
Date: Fri, 4 Jul 2014 17:18:51 +0200
Subject: [wip] just stashing my unfinished 1d hydro somewhere

---
 hydro1d.cu | 249 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 hydro1d.cu

diff --git a/hydro1d.cu b/hydro1d.cu
new file mode 100644
index 0000000..357af9e
--- /dev/null
+++ b/hydro1d.cu
@@ -0,0 +1,249 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <cuda.h>
+
+#define NSTEPS 128            // number of time steps
+#define NX 12                 // number of cells in computational domain (EXCLUDING BOUNDARY) (0..11 / 0..NX-1)
+#define NTHREADS_PER_BLOCK 4  // number of threads per block (0..3 / 0..NTHREADS_PER_BLOCK-1)
+#define NBOUNDARY 1           // boundary cells added on each side of the domain
+
+
+__device__ void syncBoundaries() {}
+
+
+// all arrays/pointers passed are nthreads (*float) wide
+template <int nthreads>
+__device__ void advectStep(float *ui, float *qold, float *qnew,
+                           int itx, float dx, float dt)
+{
+    __shared__ float flux_s[nthreads];
+
+    // Compute Flux
+    if(ui[itx] > 0.0) {
+        flux_s[itx] = qold[itx] * ui[itx];
+    } else {
+        flux_s[itx] = qold[itx+1] * ui[itx];
+    }
+    __syncthreads();
+
+    // Update Me
+    qnew[itx] = qold[itx] - dt / dx * ( flux_s[itx] - flux_s[itx-1] );
+}
+
+template <int nthreads, int ntotal>
+__global__ void hydroStep(float *rho_old_d, float *rhou_old_d,
+                          float *rho_new_d, float *rhou_new_d,
+                          float dt, float dx, int itx)
+{
+    // global index (itx, the index within the block, is passed in)
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // interface velocity
+    __shared__ float ui[nthreads];
+    ui[itx] = 0.5 * ( rhou_old_d[itx]/rho_old_d[itx] - rhou_old_d[itx+1]/rho_old_d[itx+1] );
+
+    // first hydro half-steps
+    advectStep<nthreads> (ui, rho_old_d, rho_new_d, itx, dx, dt);
+    advectStep<nthreads> (ui, rhou_old_d, rhou_new_d, itx, dx, dt);
+
+    // update old to new before we do the next
+    ...
+
+    // now do the pressure half step
+
+
+
+
+
+
+    // update ghost cells (periodic BCs)
+    rho_new_d[0] = rho_new_d[ntotal-1];
+    rho_new_d[ntotal] = rho_new_d[1];
+    rhou_new_d[0] = rhou_new_d[ntotal-1];
+    rhou_new_d[ntotal] = rhou_new_d[0];
+    __syncthreads();
+    }
+
+    // update old/new
+    rho_old_d[idx] = rho_new_d[idx];
+    rhou_old_d[idx] = rhou_new_d[idx];
+
+    // compute pressure
+    p[idx] = (gamma - 1.0) * rho_new_d[idx] * eint;
+
+    // hydro, second half-step
+    if(idx>0 && idx<ntotal-1) {
+    }
+}
+
+template <int nthreads>
+__global__ void hydroLoop(float *rho_d, float *rhou_d,
+                          float cfl, float dx)
+{
+    // global and local indices
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int itx = threadIdx.x;
+
+    // prepare shared memory
+    __shared__ float rho_new_s[nthreads];
+    __shared__ float rho_old_s[nthreads];
+    __shared__ float rhou_new_s[nthreads];
+    __shared__ float rhou_old_s[nthreads];
+
+    rho_old_s[itx] = rho_d[idx]; rhou_old_s[itx] = rhou_d[idx];
+    rho_new_s[itx] = 0.f; rhou_new_s[itx] = 0.f;
+
+    // other variables
+    float dt;
+
+    // main loop, be liberal with barriers
+    while(t < tmax) {
+        dt = 1.0;
+        t += dt;
+        hydroStep (rho_old_s, rhou_old_s, rho_new_s, rhou_new_s, dt, dx, itx);
+        __syncthreads();
+        updateBoundaries();
+        __syncthreads();
+        copyNewToOld();
+        __syncthreads();
+    }
+
+    // quit kernel, now we can write back into main memory, write outputs, etc
+    // when that's done, we can terminate (if we have reached tend)
+    // or jump back into the hydro loop here (if we have not reached the final time)
+
+}
+
+__global__ void dumpMem(float *x_d, float *rho_old_d, float *rhou_old_d, float *rho_new_d, float *rhou_new_d)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    //printf("%i %.4f %.4f %.4f %.4f %.4f\n", idx, x_d[idx], rho_old_d[idx], rhou_old_d[idx], rho_new_d[idx], rhou_new_d[idx]);
+    printf("%i %.4f %.4f %.4f\n", idx, x_d[idx], rho_old_d[idx], rhou_old_d[idx]);
+}
+
+int main()
+{
+    float *rho_h, *rhou_h;          // density and momentum on the host
+    float *rho_old_d, *rhou_old_d;  // density and momentum on the device
+    float *rho_new_d, *rhou_new_d;  // density and momentum on the device
+
+    float *x_h, *x_d;               // cell center grid
+
+    float xlo = -5.5;               // lower grid limit
+    float xhi = 5.5;                // upper grid limit
+    float dx;                       // grid spacing
+
+    // index transformations
+    const int NTOTAL = NX + 2 * NBOUNDARY;
+
+    // preamble
+    cudaSetDevice(0);
+
+    // set up grid
+    dx = (xhi - xlo) / float(NX-1);
+    printf("%.4f %.4f\n\n", dx, float(NX));
+    x_h = (float *) malloc(sizeof(float)*NTOTAL);
+    cudaMalloc((void**) &x_d, sizeof(float)*NTOTAL);
+
+    // Computational Domain
+    printf("X-Grid:\n");
+    for(int ii=NBOUNDARY; ii
+
+    dumpMem <<< 1, 14 >>> (x_d, rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+    cudaDeviceSynchronize();
+    printf("\n");
+
+    // compute gpu parameters
+    int nblocks = (NX + NTHREADS_PER_BLOCK - 1) / NTHREADS_PER_BLOCK;
+
+    // hydro step
+    hydroStep < 32, NTOTAL > <<< 1, 14 >>> (rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+    cudaDeviceSynchronize();
+    printf("\n");
+
+    // dump cpu memory
+    // dumpMem <<< 1, 14 >>> (x_d, rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+
+    // for(int istep=0; istep
+    //   [32x32] is the biggest block size we can have!
+    //   alt: diffuse <32> <<< (1,1,1), (32,32,1) >>> (...)
+    //   diffuse <32,1> <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+    //   set uold to unew before the next step
+    //   updateMem <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+    // }
+
+    // copy back into host memory
+    // cudaMemcpy(u_h, unew_d, sizeof(float)*Ncells*Ncells, cudaMemcpyDeviceToHost);
+
+    // output array
+    // for(int i=0; i
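
Reference sketch 1 (not part of hydro1d.cu): the donor-cell (upwind) advection update that advectStep is working towards can be exercised on its own. Everything below is an assumption made for illustration (the kernel name advect, the grid size, the square-pulse initial condition, the host-side periodic ghost-cell update); only the upwind flux selection and the flux-difference update mirror advectStep above.

    #include <stdio.h>
    #include <cuda_runtime.h>

    #define N      16                   // interior cells (assumed for this sketch)
    #define NGHOST 1                    // one ghost cell on each side
    #define NTOT   (N + 2*NGHOST)

    // Donor-cell (upwind) advection: the flux through each interface takes q from
    // the upwind cell; every interior cell is updated from its flux difference.
    __global__ void advect(const float *ui, const float *qold, float *qnew,
                           float dx, float dt)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x + NGHOST;   // interior cell index
        if (i >= N + NGHOST) return;

        // fluxes through the right (i+1/2) and left (i-1/2) interfaces of cell i
        float fr = (ui[i]   > 0.0f) ? qold[i]   * ui[i]   : qold[i+1] * ui[i];
        float fl = (ui[i-1] > 0.0f) ? qold[i-1] * ui[i-1] : qold[i]   * ui[i-1];

        qnew[i] = qold[i] - dt / dx * (fr - fl);
    }

    int main()
    {
        float q_h[NTOT], ui_h[NTOT];
        float dx = 1.0f, u = 1.0f;
        float dt = 0.5f * dx / u;                  // CFL-limited time step

        // square pulse advected to the right at constant velocity u
        for (int i = 0; i < NTOT; i++) {
            q_h[i]  = (i > 4 && i < 9) ? 1.0f : 0.0f;
            ui_h[i] = u;                           // velocity at interface i+1/2
        }

        float *ui_d, *qold_d, *qnew_d;
        cudaMalloc((void**) &ui_d,   sizeof(float)*NTOT);
        cudaMalloc((void**) &qold_d, sizeof(float)*NTOT);
        cudaMalloc((void**) &qnew_d, sizeof(float)*NTOT);
        cudaMemcpy(ui_d,   ui_h, sizeof(float)*NTOT, cudaMemcpyHostToDevice);
        cudaMemcpy(qold_d, q_h,  sizeof(float)*NTOT, cudaMemcpyHostToDevice);
        cudaMemcpy(qnew_d, q_h,  sizeof(float)*NTOT, cudaMemcpyHostToDevice);

        for (int step = 0; step < 8; step++) {
            advect <<< 1, NTOT >>> (ui_d, qold_d, qnew_d, dx, dt);
            cudaDeviceSynchronize();

            // periodic ghost cells and old <- new, done on the host for simplicity
            cudaMemcpy(q_h, qnew_d, sizeof(float)*NTOT, cudaMemcpyDeviceToHost);
            q_h[0]      = q_h[NTOT-2];
            q_h[NTOT-1] = q_h[1];
            cudaMemcpy(qold_d, q_h, sizeof(float)*NTOT, cudaMemcpyHostToDevice);
        }

        for (int i = NGHOST; i < N + NGHOST; i++) printf("%2i %.3f\n", i, q_h[i]);

        cudaFree(ui_d); cudaFree(qold_d); cudaFree(qnew_d);
        return 0;
    }

Here each thread computes both of its own interface fluxes, which sidesteps the shared flux_s array and the flux_s[itx-1] read at itx = 0 that the in-progress advectStep still has to handle.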
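Reference sketch 2 (also not part of hydro1d.cu): the block count in main() is the usual ceiling division, and the time loop described in the hydroLoop comments can live on the host, because a __syncthreads() inside one long-running kernel only synchronizes the threads of a single block. The sketch below assumes a hypothetical stand-in kernel called step; only the #define values and the ceiling-division formula come from the patch.

    #include <stdio.h>
    #include <cuda_runtime.h>

    #define NSTEPS 128
    #define NX 12
    #define NTHREADS_PER_BLOCK 4
    #define NBOUNDARY 1

    // hypothetical stand-in for hydroStep: one thread per interior cell
    __global__ void step(const float *q_old, float *q_new, int ntotal)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x + NBOUNDARY;
        if (idx < ntotal - NBOUNDARY) q_new[idx] = q_old[idx];   // placeholder update
    }

    int main()
    {
        const int NTOTAL  = NX + 2 * NBOUNDARY;
        // ceiling division: enough blocks to cover NX cells, the last one possibly partial
        const int nblocks = (NX + NTHREADS_PER_BLOCK - 1) / NTHREADS_PER_BLOCK;

        float *q_old_d, *q_new_d;
        cudaMalloc((void**) &q_old_d, sizeof(float)*NTOTAL);
        cudaMalloc((void**) &q_new_d, sizeof(float)*NTOTAL);
        cudaMemset(q_old_d, 0, sizeof(float)*NTOTAL);
        cudaMemset(q_new_d, 0, sizeof(float)*NTOTAL);

        // host-driven time loop: each launch is one step and acts as a grid-wide barrier
        for (int istep = 0; istep < NSTEPS; istep++) {
            step <<< nblocks, NTHREADS_PER_BLOCK >>> (q_old_d, q_new_d, NTOTAL);
            float *tmp = q_old_d; q_old_d = q_new_d; q_new_d = tmp;   // swap old/new
        }
        cudaDeviceSynchronize();
        printf("done: nblocks = %i\n", nblocks);

        cudaFree(q_old_d); cudaFree(q_new_d);
        return 0;
    }

Swapping the old/new pointers between launches does the job of the copyNewToOld step, and because every launch synchronizes the whole grid, no in-kernel while loop over time is needed.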