// advection.cu
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// One explicit upwind-advection step on a periodic 1D grid:
//   unew[i] = uold[i] - dt*cs*(uold[i] - uold[i-1])/dx
// with the left neighbor of cell 0 wrapping to cell N-1.
//
// Launch: 1D grid/block covering at least N threads; one thread per cell.
// Threads with idx >= N exit early, so the grid may over-cover N.
// uold/unew must be distinct device arrays of N floats.
// NOTE: the in-kernel printf is debug output only — it serializes threads.
__global__ void advect(float *uold, float *unew, float cs, float dx, float dt, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) return; // bounds guard: grid need not divide N evenly
// Periodic boundary: cell 0 takes its upwind neighbor from the far end.
int left = (idx == 0) ? N - 1 : idx - 1;
unew[idx] = uold[idx] - dt * cs * ( uold[idx] - uold[left] ) / dx;
printf("(%i,%.3f,%.3f)\n", idx, uold[idx], unew[idx]);
}
// Abort with a message if a CUDA runtime call failed. Used for every API
// call and (via cudaGetLastError/cudaDeviceSynchronize) for kernel launches.
static void checkCuda(cudaError_t err, const char *what)
{
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

// Driver: advect a 5-cell square pulse around a periodic 32-cell grid for
// 10 explicit upwind steps, then dump the final field to stdout.
// Returns 0 on success; aborts on any CUDA or allocation failure.
int main(void)
{
float *u_h; // pointer to host memory
float *uold_d, *unew_d; // pointers to device memory
float cs = 1.0f; // bulk velocity
float dt = 0.1f; // timestep
float dx = 0.2f; // space step
int N = 32; // cells
int i; // counters
// CFL stability condition for the explicit upwind scheme
assert(cs*dt < dx);
// allocate host memory
u_h = (float *)malloc(sizeof(float)*N);
if (u_h == NULL) {
fprintf(stderr, "host malloc failed\n");
return EXIT_FAILURE;
}
// allocate device memory
checkCuda(cudaMalloc((void **) &uold_d, sizeof(float)*N), "cudaMalloc uold_d");
checkCuda(cudaMalloc((void **) &unew_d, sizeof(float)*N), "cudaMalloc unew_d");
// initial condition: zero field with a 5-cell square pulse in the middle
for (i=0; i<N; i++) {
u_h[i] = 0.f;
}
for (i = N/2 - 2; i <= N/2 + 2; i++) {
u_h[i] = 1.f;
}
// copy to device
checkCuda(cudaMemcpy(uold_d, u_h, sizeof(float)*N, cudaMemcpyHostToDevice),
"cudaMemcpy H2D");
// launch configuration: one thread per cell; N=32 fits in a single block
int blockSize = N;
int nBlocks = 1;
// run simulation
for(i=0; i<10; i++) {
printf("*** %i\n", i);
advect <<< nBlocks, blockSize >>> (uold_d, unew_d, cs, dx, dt, N);
checkCuda(cudaGetLastError(), "advect launch");
checkCuda(cudaDeviceSynchronize(), "advect execution");
printf("\n");
// swap buffers instead of a device-to-device copy each step
float *tmp = uold_d;
uold_d = unew_d;
unew_d = tmp;
}
printf("\n");
// copy back: after the final swap the newest field lives in uold_d
checkCuda(cudaMemcpy(u_h, uold_d, sizeof(float)*N, cudaMemcpyDeviceToHost),
"cudaMemcpy D2H");
// dump output
for (i=0; i<N; i++) {
printf("%.3f\n", u_h[i]);
}
// cleanup
free(u_h);
checkCuda(cudaFree(unew_d), "cudaFree unew_d");
checkCuda(cudaFree(uold_d), "cudaFree uold_d");
return 0;
}