| author    | Volker Hoffmann <volker@cheleb.net>              | 2014-07-04 17:18:51 +0200 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Volker Hoffmann <volker@cheleb.net>              | 2014-07-04 17:18:51 +0200 |
| commit    | cb4dabf1eb1c1ba77af4a98e2cab2149b7728a93 (patch) |                           |
| tree      | f5262dfef427329d47dee9e84052ba4610b3020e         |                           |
| parent    | a575f6fa32703a7a56fff5c9aa57556cad59b040 (diff)  |                           |
-rw-r--r-- | hydro1d.cu | 249 |
1 file changed, 249 insertions, 0 deletions
diff --git a/hydro1d.cu b/hydro1d.cu
new file mode 100644
index 0000000..357af9e
--- /dev/null
+++ b/hydro1d.cu
@@ -0,0 +1,249 @@
+#include <cuda.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#define NSTEPS 128              // number of time steps
+#define NX 12                   // number of cells in computational domain (EXCLUDING BOUNDARY) (0..11 / 0..NX-1)
+#define NTHREADS_PER_BLOCK 4    // number of threads per block (0..3 / 0..NTHREADS_PER_BLOCK-1)
+#define NBOUNDARY 1             // boundary cells added on each side of the domain
+
+// placeholder; boundary synchronisation is not implemented yet
+__device__ void syncBoundaries() {}
+
+// all arrays/pointers passed are nthreads floats wide
+template <int nthreads>
+__device__ void advectStep(float *ui, float *qold, float *qnew,
+                           int itx, float dx, float dt)
+{
+    __shared__ float flux[nthreads];
+
+    // compute donor-cell (upwind) flux at the interface owned by this thread
+    if (ui[itx] > 0.0f) {
+        flux[itx] = qold[itx] * ui[itx];
+    } else {
+        flux[itx] = qold[itx+1] * ui[itx];
+    }
+    __syncthreads();
+
+    // conservative update of this thread's cell
+    // (flux[itx-1] still needs boundary handling for itx == 0)
+    qnew[itx] = qold[itx] - dt / dx * ( flux[itx] - flux[itx-1] );
+}
+
+template <int nthreads, int ntotal>
+__global__ void hydroStep(float *rho_old_d, float *rhou_old_d,
+                          float *rho_new_d, float *rhou_new_d,
+                          float dt, float dx)
+{
+    // global and local indices
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int itx = threadIdx.x;
+
+    // interface velocity from the cell-centred momenta and densities
+    __shared__ float ui[nthreads];
+    ui[itx] = 0.5f * ( rhou_old_d[idx]/rho_old_d[idx] + rhou_old_d[idx+1]/rho_old_d[idx+1] );
+
+    // first hydro half-step: advect density and momentum
+    // (the *_s shared-memory copies are not set up in this kernel yet)
+    advectStep <nthreads> (ui, rho_old_s, rho_new_s, itx, dx, dt);
+    advectStep <nthreads> (ui, rhou_old_s, rhou_new_s, itx, dx, dt);
+
+    // update old to new before we do the next
+    // ...
+
+    // now do the pressure half-step (not implemented yet)
+
+    // update ghost cells (periodic BCs)
+    rho_new_d[0]         = rho_new_d[ntotal-2];
+    rho_new_d[ntotal-1]  = rho_new_d[1];
+    rhou_new_d[0]        = rhou_new_d[ntotal-2];
+    rhou_new_d[ntotal-1] = rhou_new_d[1];
+    __syncthreads();
+
+    // update old/new
+    rho_old_d[idx]  = rho_new_d[idx];
+    rhou_old_d[idx] = rhou_new_d[idx];
+
+    // compute pressure (pressure array, gamma, and eint are not defined yet)
+    // p[idx] = (gamma - 1.0f) * rho_new_d[idx] * eint;
+
+    // hydro, second half-step
+    if (idx > 0 && idx < ntotal-1) {
+        // source terms (not implemented yet)
+        // p[idx] = (gamma - 1.0f) * rho_new_d[idx] * eint;
+
+        //printf("%i %.4f %.4f\n", idx, umid, rho_old_d[idx]);
+    }
+}
+
+// rho_d and rhou_d are ncells wide
+template <int nthreads>
+__global__ void hydroLoop(float *rho_d, float *rhou_d,
+                          float cfl, float dx)
+{
+    // global and local indices
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int itx = threadIdx.x;
+
+    // prepare shared memory
+    __shared__ float rho_new_s[nthreads];
+    __shared__ float rho_old_s[nthreads];
+    __shared__ float rhou_new_s[nthreads];
+    __shared__ float rhou_old_s[nthreads];
+
+    rho_old_s[itx] = rho_d[idx]; rhou_old_s[itx] = rhou_d[idx];
+    rho_new_s[itx] = 0.f;        rhou_new_s[itx] = 0.f;
+
+    // other variables
+    float dt;
+    float t = 0.f;
+    float tmax = 0.f;   // final time, not yet set
+
+    // main loop, be liberal with barriers
+    while (t < tmax) {
+        dt = 1.0f;
+        t += dt;
+        hydroStep <nthreads> (rho_old_s, rhou_old_s, rho_new_s, rhou_new_s, dt, dx, itx);
+        __syncthreads();
+        updateBoundaries();   // helper not defined yet
+        __syncthreads();
+        copyNewToOld();       // helper not defined yet
+        __syncthreads();
+    }
+
+    // quit kernel; now we can write back into main memory, write outputs, etc.
+    // when that's done, we can terminate (if we have reached tend)
+    // or jump back into the hydro loop here (if we have not reached the final time)
+}
+
+__global__ void dumpMem(float *x_d, float *rho_old_d, float *rhou_old_d, float *rho_new_d, float *rhou_new_d)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    //printf("%i %.4f %.4f %.4f %.4f %.4f\n", idx, x_d[idx], rho_old_d[idx], rhou_old_d[idx], rho_new_d[idx], rhou_new_d[idx]);
+    printf("%i %.4f %.4f %.4f\n", idx, x_d[idx], rho_old_d[idx], rhou_old_d[idx]);
+}
+
+int main()
+{
+    float *rho_h, *rhou_h;           // density and momentum on the host
+    float *rho_old_d, *rhou_old_d;   // density and momentum on the device
+    float *rho_new_d, *rhou_new_d;   // density and momentum on the device
+
+    float *x_h, *x_d;                // cell center grid
+
+    float xlo = -5.5;                // lower grid limit
+    float xhi =  5.5;                // upper grid limit
+    float dx;                        // grid spacing
+
+    // index transformations
+    const int NTOTAL = NX + 2 * NBOUNDARY;
+
+    // preamble
+    cudaSetDevice(0);
+
+    // set up grid
+    dx = (xhi - xlo) / float(NX-1);
+    printf("%.4f %.4f\n\n", dx, float(NX));
+    x_h = (float *) malloc(sizeof(float)*NTOTAL);
+    cudaMalloc((void**) &x_d, sizeof(float)*NTOTAL);
+
+    // Computational Domain
+    printf("X-Grid:\n");
+    for(int ii=NBOUNDARY; ii<NTOTAL; ii++) {
+        x_h[ii] = xlo + (ii-NBOUNDARY) * dx;
+    }
+    // Boundary Cells (Only Works For One Boundary Cell!)
+    x_h[0] = xlo - dx;
+    x_h[NTOTAL-1] = xhi + dx;
+    // Debug Print
+    for(int ii=0; ii<NTOTAL; ii++) {
+        printf("%.4f ", x_h[ii]);
+    }
+    printf("\n\n");
+    cudaMemcpy(x_d, x_h, sizeof(float)*NTOTAL, cudaMemcpyHostToDevice);
+
+    // initial conditions: fluid at rest, Gaussian density bump
+    rho_h = (float*) malloc(sizeof(float)*NTOTAL);
+    rhou_h = (float*) malloc(sizeof(float)*NTOTAL);
+    printf("ICs\n");
+    for(int ii=0; ii<NTOTAL; ii++) {
+        rhou_h[ii] = 0.f;
+        rho_h[ii] = 1.f + exp(-(x_h[ii]*x_h[ii]) / (1.0*1.0));
+        printf("%.4f ", rho_h[ii]);
+    }
+    printf("\n\n");
+    cudaMalloc((void**) &rho_old_d, sizeof(float)*NTOTAL);
+    cudaMalloc((void**) &rhou_old_d, sizeof(float)*NTOTAL);
+    cudaMemcpy(rho_old_d, rho_h, sizeof(float)*NTOTAL, cudaMemcpyHostToDevice);
+    cudaMemcpy(rhou_old_d, rhou_h, sizeof(float)*NTOTAL, cudaMemcpyHostToDevice);
+
+    // prepare new step (allocate before zeroing)
+    cudaMalloc((void**) &rho_new_d, sizeof(float)*NTOTAL);
+    cudaMalloc((void**) &rhou_new_d, sizeof(float)*NTOTAL);
+    cudaMemset(rho_new_d, 0, sizeof(float)*NTOTAL);
+    cudaMemset(rhou_new_d, 0, sizeof(float)*NTOTAL);
+
+    // call cuda kernel to check on memory contents here!!!!
+    dumpMem <<< 1, NTOTAL >>> (x_d, rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+    cudaDeviceSynchronize();
+    printf("\n");
+
+    // compute gpu parameters (ceiling division)
+    int nblocks = (NX + NTHREADS_PER_BLOCK - 1) / NTHREADS_PER_BLOCK;
+
+    // hydro step (dt and dx arguments are still missing from this call)
+    hydroStep < 32, NTOTAL > <<< 1, NTOTAL >>> (rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+    cudaDeviceSynchronize();
+    printf("\n");
+
+    // dump device memory
+    // dumpMem <<< 1, NTOTAL >>> (x_d, rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+
+    // for(int istep=0; istep<NSTEPS; istep++) {
+    //     run our physics
+    //     <32> [32x32] is the biggest block size we can have!
+    //     alt: diffuse <32> <<< (1,1,1), (32,32,1) >>> (...)
+    //     diffuse <32,1> <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+    //     set uold to unew before the next step
+    //     updateMem <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+    // }
+
+    // copy back into host memory
+    // cudaMemcpy(u_h, unew_d, sizeof(float)*Ncells*Ncells, cudaMemcpyDeviceToHost);
+
+    // output array
+    // for(int i=0; i<Ncells; i++) {
+    //     for(int j=0; j<Ncells; j++) {
+    //         printf("%i %i %.16f \n", i, j, u_h[j*Ncells+i]);
+    //     }
+    // }
+
+    // free memory
+    free(x_h); free(rho_h); free(rhou_h);
+    cudaFree(x_d);
+    cudaFree(rho_old_d); cudaFree(rhou_old_d);
+    cudaFree(rho_new_d); cudaFree(rhou_new_d);
+
+    return 0;
+}
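For reference, the update this commit is building toward in advectStep is the donor-cell (upwind) advection scheme: flux_i = u_i * q_i if u_i > 0, otherwise u_i * q_{i+1}, followed by q_i_new = q_i_old - dt/dx * (flux_i - flux_{i-1}). Below is a minimal single-threaded CPU sketch of that same update, handy for sanity-checking the kernel output. It is only a sketch under stated assumptions: the helper name advect_reference, the uniform test velocity, the periodic ghost-cell fill, and the chosen dt are illustrative and are not part of hydro1d.cu.

// Minimal CPU reference of the donor-cell (upwind) advection update.
// Assumptions (not from the commit): advect_reference name, periodic ghost
// cells, uniform velocity u = 1, dt = 0.1.
#include <math.h>
#include <stdio.h>

#define NX_REF     12                       // mirrors NX in hydro1d.cu
#define NB_REF     1                        // mirrors NBOUNDARY
#define NTOTAL_REF (NX_REF + 2 * NB_REF)

// One donor-cell step for a conserved quantity q with interface velocities ui:
//   flux[i] = (ui[i] > 0) ? qold[i] * ui[i] : qold[i+1] * ui[i]
//   qnew[i] = qold[i] - dt/dx * (flux[i] - flux[i-1])
static void advect_reference(const float *ui, const float *qold, float *qnew,
                             float dx, float dt)
{
    float flux[NTOTAL_REF];
    for (int i = 0; i < NTOTAL_REF - 1; i++) {
        flux[i] = (ui[i] > 0.0f) ? qold[i] * ui[i] : qold[i + 1] * ui[i];
    }
    for (int i = 1; i < NTOTAL_REF - 1; i++) {   // interior cells only
        qnew[i] = qold[i] - dt / dx * (flux[i] - flux[i - 1]);
    }
    qnew[0]              = qnew[NTOTAL_REF - 2]; // periodic ghost cells (assumption)
    qnew[NTOTAL_REF - 1] = qnew[1];
}

int main(void)
{
    float rho_old[NTOTAL_REF], rho_new[NTOTAL_REF], ui[NTOTAL_REF];
    float dx = 11.0f / (NX_REF - 1);             // same grid extent as hydro1d.cu
    float dt = 0.1f;                             // illustrative, well below the CFL limit

    // Gaussian density bump on the same grid as the commit, advected by u = 1
    for (int i = 0; i < NTOTAL_REF; i++) {
        float x    = -5.5f - dx + i * dx;
        rho_old[i] = 1.0f + expf(-x * x);
        ui[i]      = 1.0f;
    }
    advect_reference(ui, rho_old, rho_new, dx, dt);
    for (int i = 0; i < NTOTAL_REF; i++) printf("%.4f ", rho_new[i]);
    printf("\n");
    return 0;
}

Because the sketch reuses the same grid extent (xlo = -5.5, xhi = 5.5, NX = 12, one ghost cell per side) and the same Gaussian initial density as hydro1d.cu, its output can be compared line by line against what dumpMem prints once the kernel path is finished.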