summaryrefslogtreecommitdiffstats
path: root/incrementArrays.cu
blob: 3f727b63dd0a4f2b259eb3491c62b181b60463a1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// incrementArrays.cu
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>

/*
 * Host (CPU) version of the increment: adds 1.0f, in place, to each of the
 * first N elements of the array `a`.
 *
 *   a - array of at least N floats, modified in place
 *   N - number of elements to increment; nothing is touched when N <= 0
 */
void incrementArrayOnHost(float *a, int N)
{
    float *cursor = a;
    int remaining = N;

    /* Walk the array front to back, counting down the elements left. */
    while (remaining > 0) {
        *cursor += 1.f;
        ++cursor;
        --remaining;
    }
}

/*
 * Device kernel: adds 1.0f, in place, to each of the first N elements of `a`.
 *
 *   a - array of at least N floats, dereferenced from device code
 *   N - number of elements to increment; nothing is touched when N <= 0
 *
 * Launch layout: one-dimensional grid of one-dimensional blocks (only the .x
 * components are used). Each thread starts at its global index and advances
 * by the total number of launched threads, so every element is processed
 * exactly once for ANY grid/block size -- including grids smaller than N.
 * Uses no shared memory.
 */
__global__ void incrementArrayOnDevice(float *a, int N)
{
    /* Clamp a negative N to zero so the unsigned comparison below is safe. */
    const size_t count = (N > 0) ? (size_t)N : 0;

    /* Total threads in the launch; widened before multiplying so the
       product cannot wrap in 32-bit arithmetic. */
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count;
         idx += stride) {
        a[idx] = a[idx] + 1.f;
    }
}

/*
 * Terminates the program with a diagnostic on stderr when a CUDA runtime
 * call did not return cudaSuccess.
 *
 *   status - return value of the CUDA runtime call being checked
 *   what   - short label for the operation, used in the error message
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills a 10-element array with 0..9, increments it once on the host and once
 * on the device, prints the device result and asserts that both results agree.
 *
 * Returns 0 on success and EXIT_FAILURE if a host allocation fails; any CUDA
 * runtime failure terminates the program through checkCuda().
 */
int main(void)
{
    float *a_h, *b_h;       // pointers to host memory
    float *a_d = NULL;      // pointer to device memory
    int i, N = 10;
    size_t size = N*sizeof(float);

    // allocate arrays on host
    a_h = (float *)malloc(size);
    b_h = (float *)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        free(a_h); free(b_h);   // free(NULL) is a no-op
        return EXIT_FAILURE;
    }

    // allocate array on device
    checkCuda(cudaMalloc((void **) &a_d, size), "cudaMalloc");

    // initialization of host data
    for (i=0; i<N; i++) {
        a_h[i] = (float) i;
        // print the stored float: passing the int `i` to %f is undefined
        printf("%.2f\n", a_h[i]);
    }
    printf("\n");

    // copy data from host to device
    checkCuda(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    // do calculation on host
    incrementArrayOnHost(a_h, N);

    // do calculation on device
    // 01 - compute execution configuration (ceiling division so that a
    //      partial last block still covers the tail of the array)
    int blockSize = 4;
    int nBlocks = (N + blockSize - 1) / blockSize;
    // 02 - call incrementArrayOnDevice kernel
    incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
    // a kernel launch returns nothing; launch errors are reported here
    checkCuda(cudaGetLastError(), "incrementArrayOnDevice launch");

    // Retrieve result from device and store in b_h
    // (dst, src, count, kind)
    // An error raised while the kernel was executing is also reported by
    // this call's return value.
    checkCuda(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    // check result
    for (i=0; i<N; i++) {
        printf("%.2f\n", b_h[i]);
        assert(a_h[i] == b_h[i]);
    }

    // cleanup
    free(a_h); free(b_h);
    checkCuda(cudaFree(a_d), "cudaFree");

    return 0;
}