summaryrefslogtreecommitdiffstats
path: root/hydro1d.cu
diff options
context:
space:
mode:
Diffstat (limited to 'hydro1d.cu')
-rw-r--r--hydro1d.cu249
1 files changed, 249 insertions, 0 deletions
diff --git a/hydro1d.cu b/hydro1d.cu
new file mode 100644
index 0000000..357af9e
--- /dev/null
+++ b/hydro1d.cu
@@ -0,0 +1,249 @@
+#include <cuda.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#define NSTEPS 128 // number of time steps
+#define NX 12 // number of cells in computational domain (EXCLUDING BOUNDARY) (0..11 / 0..NX-1)
+#define NTHREADS_PER_BLOCK 4 // number of threads per block (0..3 / 0..NTHREADS_PER_BLOCK-1)
+#define NBOUNDARY 1 // boundary cells added on each side of the domain
+
+
+
+
+
+__device__ void syncBoundaries() {}
+
+
+
+
+
+// all arrays/pointers passed are nthreads (*float) wide
+template <int nthreads>
+__device__ void advectStep(float *ui,float *qold, float *qnew,
+ int itx, int dx, int dt)
+{
+ __shared__ flux_s[nthreads];
+
+ // Compute Flux
+ if(ui[itx]>0.0) {
+ flux[itx] = q[itx] * ui[itx];
+ } else {
+ flux[itx] = q[itx+1] * ui[itx];
+ }
+ __syncthreads();
+
+ // Update Me
+ qnew[itx] = qold[itx] - dt / dx * ( flux[itx] - flux[itx-1] );
+}
+
+template <int nthreads, int ntotal>
+__global__ void hydroStep(float *rho_old_d, float *rhou_old_d,
+ float *rho_new_d, float *rhou_new_d,
+ int dt, int dx, int itx)
+{
+ // global and local indices
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ int itx = threadIdx.x;
+
+ // interface velocity
+ __shared__ float ui[nthreads]
+ ui[itx] = 0.5 * ( rhou[itx]/rho[itx] - rhou[itx+1]/rho[itx+1] );
+
+ // first hydro half-steps
+ advectStep <nthreads> (ui, rho_old_s, rhou_new_s, itx, dx, dt);
+ advectStep <nthreads> (ui, rhou_old_s, rhou_new_s, itx, dx, dt);
+
+ // update old to new before we do the next
+ ...
+
+ // new do the pressure half step
+
+
+
+
+
+
+
+
+
+
+
+
+ // update ghost cells (periodic BCs)
+ rho_new_d[0] = rho_new[ntotal-1]
+ rho_new_d[ntotal] = rho_new[1]
+ rhou_new_d[0] = rhoi_new[ntotal-1]
+ rhou_new_d[ntotal] = rhoi_new[0]
+ __syncthreads();
+ }
+
+ // update old/new
+ rho_old[idx] = rho_new[idx]
+ rhou_old[idx] = rhou_new[idx]
+
+ // compute pressure
+ p[idx] = (gamma - 1.0) * rho_new_d * eint
+
+ // hydro, second half-step
+ if(idx>0 && idx<ntotal=1) {
+ // source terms
+
+ p[idx] = gamma - 1.0 ) * rho_new
+
+
+
+
+ //printf("%i %.4f %.4f\n", idx, umid, rho_old_d[idx]);
+ }
+}
+
+// rho_d and rhou_d are ncells wide
+template <nthreads>
+__global__ void hydroLoop(float *rho_d, float *rhou_d,
+ float cfl, int dx)
+{
+ // global and local indices
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ int itx = threadIdx.x;
+
+ // prepare shared memory
+ __shared__ float rho_new_s[nthreads];
+ __shared__ float rho_old_s[nthreads];
+ __shared__ float rhou_new_s[nthreads];
+ __shared__ float rhou_old_s[nthreads];
+
+ rho_old_s[itx] = rho_d[idx]; rhou_old_s[itx] = rho_d[idx];
+ rho_new_s[itx] = 0.f; rhou_new_s[itx] = 0.f;
+
+ // other variables
+ int dt;
+ int dx;
+
+ // main loop, be liberal with barriers
+ while(t < tmax) {
+ dt = 1.0;
+ t += dt;
+ hydroStep <nthreads> (rho_old_s, rhou_old_s, rho_new_s, rhou_new_s, dt, dx, itx);
+ __syncthreads();
+ updateBoundaries();
+ __syncthreads();
+ copyNewToold();
+ __syncthreads();
+ }
+
+ // quite kernel, now we can write back into main memory, write outputs, etc
+ // when that's done, we can terminate (if we have reached tend)
+ // or jump back on the hydro loop here (if we have not reached the final time)
+
+}
+
+__global__ void dumpMem(float *x_d, float *rho_old_d, float *rhou_old_d, float *rho_new_d, float *rhou_new_d)
+{
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ //printf("%i %.4f %.4f %.4f %.4f %.4f\n", idx, x_d[idx], rho_old_d[idx], rhou_old_d[idx], rho_new_d[idx], rhou_new_d[idx]);
+ printf("%i %.4f %.4f %.4f\n", idx, x_d[idx], rho_old_d[idx], rhou_old_d[idx]);
+}
+
+int main()
+{
+ float *rho_h, *rhou_h; // density and momentum on the host
+ float *rho_old_d, *rhou_old_d; // density and momentum on the device
+ float *rho_new_d, *rhou_new_d; // density and momentum on the device
+
+ float *x_h, *x_d; // cell center grid
+
+ float xlo = -5.5; // lower grid limit
+ float xhi = 5.5; // upper grid limit
+ float dx; // grid spacing
+
+ // index transformations
+ const int NTOTAL = NX + 2 * NBOUNDARY;
+
+ // preamble
+ cudaSetDevice(0);
+
+ // set up grid
+ dx = (xhi - xlo) / float(NX-1);
+ printf("%.4f %.4f\n\n", dx, float(NX));
+ x_h = (float *) malloc(sizeof(float)*NTOTAL);
+ cudaMalloc((void**) &x_d, sizeof(float)*NTOTAL);
+
+ // Computational Domain
+ printf("X-Grid:\n");
+ for(int ii=NBOUNDARY; ii<NTOTAL; ii++) {
+ x_h[ii] = xlo + (ii-NBOUNDARY) * dx;
+ }
+ // Boundary Cells (Only Works For One Boundary Cell!)
+ x_h[0] = xlo - dx;
+ x_h[NTOTAL-1] = xhi + dx;
+ // Debug Print
+ for(int ii=0; ii<NTOTAL; ii++) {
+ printf("%.4f ", x_h[ii]);
+ }
+ printf("\n\n");
+ cudaMemcpy(x_d, x_h, sizeof(float)*NTOTAL, cudaMemcpyHostToDevice);
+
+ // initial conditions
+ rho_h = (float*) malloc(sizeof(float)*NTOTAL);
+ rhou_h = (float*) malloc(sizeof(float)*NTOTAL);
+ printf("ICs\n");
+ for(int ii=0; ii<NTOTAL; ii++) {
+ rhou_h[ii] = 0.f;
+ rho_h[ii] = 1.f + exp(-(x_h[ii]*x_h[ii]) / (1.0*1.0));
+ printf("%.4f ", rho_h[ii]);
+ }
+ printf("\n\n");
+ cudaMalloc((void**) &rho_old_d, sizeof(float)*NTOTAL);
+ cudaMalloc((void**) &rhou_old_d, sizeof(float)*NTOTAL);
+ cudaMemcpy(rho_old_d, rho_h, sizeof(float)*NTOTAL, cudaMemcpyHostToDevice);
+ cudaMemcpy(rhou_old_d, rhou_h, sizeof(float)*NTOTAL, cudaMemcpyHostToDevice);
+
+ // prepare new step
+ cudaMemset(rho_new_d, 0, sizeof(float)*NTOTAL);
+ cudaMemset(rhou_new_d, 0, sizeof(float)*NTOTAL);
+
+ // call cuda kernel to check on memory contents here!!!!
+ dumpMem <<< 1, 14 >>> (x_d, rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+ cudaDeviceSynchronize();
+ printf("\n");
+
+ // compute gpu parameters
+ int nblocks = NX / NTHREADS_PER_BLOCK + NX % NTHREADS_PER_BLOCK;
+
+ // hydro step
+ hydroStep < 32, NTOTAL > <<< 1, 14 >>> (rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+ cudaDeviceSynchronize();
+ printf("\n");
+
+ // dump cpu memory
+ // dumpMem <<< 1, 14 >>> (x_d, rho_old_d, rhou_old_d, rho_new_d, rhou_new_d);
+
+ // for(int istep=0; istep<NSTEPS; istep++) {
+ // run our physics
+ // <32> [32x32] is the biggest block size we can have!
+ // alt: diffuse <32> <<< (1,1,1), (32,32,1) >>> (...)
+ // diffuse <32,1> <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+ // set uold to unew before the next step
+ // updateMem <<< nBlocks, nThreads >>> (uold_d, unew_d, Ncells);
+ // }
+
+ // copy back into host memory
+ // cudaMemcpy(u_h, unew_d, sizeof(float)*Ncells*Ncells, cudaMemcpyDeviceToHost);
+
+ // output array
+ // for(int i=0; i<Ncells; i++) {
+ // for(int j=0; j<Ncells; j++) {
+ // printf("%i %i %.16f \n", i, j, u_h[j*Ncells+i]);
+ // }
+ // }
+
+ // free memory
+ free(x_h); free(rho_h); free(rhou_h);
+ cudaFree(x_d);
+ cudaFree(rho_old_d); cudaFree(rhou_old_d);
+ cudaFree(rho_new_d); cudaFree(rhou_new_d);
+
+ return 0;
+}
+