Device pointers point to GPU memory:

  • May be passed to/from host code
  • May not be dereferenced in host code

Host pointers point to CPU memory:

  • May be passed to/from device code
  • May not be dereferenced in device code

This means we can only pass pointers between device and host and dereference each kind on its own side. However, the memory that device pointers point to is allocated from host code (with cudaMalloc).

Note that device code is compiled by the NVIDIA compiler, while host code is compiled by gcc, both driven by nvcc. No need to worry: nvcc automatically splits a *.cu file into device and host code and compiles each part accordingly.

You only need to run nvcc test.cu.

/* test.cu */
// Device code: runs on the device (GPU),
// called from host code
__global__ void add(int *a, int *b, int *c){
    *c = *a + *b;
}

// Host code: runs on the CPU
int main(){
    int a, b, c;
    int *d_a, *d_b, *d_c;
    int size = sizeof(int);

    // Allocate space for device copies of a, b, c on the GPU.
    // Why (void **)&d_a rather than (void *)d_a?
    // cudaMalloc writes the allocated device address back into d_a itself,
    // so it needs the address of the pointer, cast to void **.
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    a = 2;
    b = 7;

    // Copy inputs to the device:
    // copy the values at &a and &b into the memory d_a and d_b point to
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    add<<<1,1>>>(d_a, d_b, d_c);

    // Copy the output back from the device:
    // copy the value the kernel wrote at d_c into c on the host
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

Blocks and Threads

// In this program we add two vectors with CUDA
__global__ void add(int *a, int *b, int *c){
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
    /* or index with threadIdx.x if using threads */
    /* or index = threadIdx.x + blockIdx.x * blockDim.x if using blocks & threads */
}

#define N 512
int main(){
    int *a, *b, *c; 
    int *d_a, *d_b, *d_c;
    int size = N*sizeof(int);

    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Allocate host arrays; random_ints is a helper (defined elsewhere)
    // assumed to fill an array with random values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);

    // Copy the input arrays from the host (a, b) to the device (d_a, d_b)
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU 
    // with N blocks, each with 1 thread;
    add<<<N,1>>>(d_a, d_b, d_c);
    /* or <<<1,N>>> with N threads in 1 block */
    /* or <<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>> */

    // Copy the result from the device (d_c) back to the host (c)
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    //cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
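
When blocks and threads are combined, the kernel computes a global index from both and guards against running past the end of the arrays. Here is a minimal sketch of that variant of the add kernel, assuming an illustrative THREADS_PER_BLOCK constant and an extra parameter n so that N need not be a multiple of the block size:

#define THREADS_PER_BLOCK 512

__global__ void add(int *a, int *b, int *c, int n){
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)   // avoid reading/writing past the end of the arrays
        c[index] = a[index] + b[index];
}

// Launch with enough blocks to cover all N elements:
// add<<<(N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);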

Why use threads?

Within a block, threads share data via shared memory, which is extremely fast on-chip memory:

  • In contrast to device memory, which is referred to as global memory
  • Acts like a user-managed cache
  • Declared with the __shared__ qualifier in device code
  • Example: 1D stencil

Each output element is the sum of 7 input elements (a radius-3 stencil), and each thread produces one output element. The procedure is:

  • Read BLOCK_SIZE + 2*RADIUS input elements from global memory into shared memory
  • Compute BLOCK_SIZE output elements
  • Write BLOCK_SIZE output elements back to global memory

Remember to call __syncthreads() after loading shared memory and before using data loaded by other threads, so that all of it is actually available.
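
Here is a minimal sketch of such a stencil kernel, assuming RADIUS is 3, the launch uses BLOCK_SIZE threads per block, and the input array is padded with RADIUS halo elements on each end so the boundary reads stay in bounds (stencil_1d, RADIUS and BLOCK_SIZE are illustrative names):

#define RADIUS     3
#define BLOCK_SIZE 512

__global__ void stencil_1d(int *in, int *out){
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read BLOCK_SIZE + 2*RADIUS input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS){
        temp[lindex - RADIUS]     = in[gindex - RADIUS];      // left halo
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];  // right halo
    }

    // Make sure every thread's loads are visible before anyone reads them
    __syncthreads();

    // Compute one output element: the sum of 2*RADIUS + 1 = 7 inputs
    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
        result += temp[lindex + offset];

    // Write the output element back to global memory
    out[gindex] = result;
}

// Launch example (assumes N is a multiple of BLOCK_SIZE):
// stencil_1d<<<N / BLOCK_SIZE, BLOCK_SIZE>>>(d_in, d_out);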

We can also select which device to run on:

  • cudaGetDeviceCount(int *count)
  • cudaSetDevice(int device)
  • cudaGetDevice(int *device)
  • cudaGetDeviceProperties(cudaDeviceProp *prop, int device)
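
For example, a small host-side sketch that counts the available GPUs, prints a few fields of cudaDeviceProp for each, and then selects device 0:

#include <stdio.h>

int main(){
    int count = 0;
    cudaGetDeviceCount(&count);
    printf("Found %d CUDA device(s)\n", count);

    for (int i = 0; i < count; i++){
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("Device %d: %s, %d multiprocessors, %zu bytes of global memory\n",
               i, prop.name, prop.multiProcessorCount, prop.totalGlobalMem);
    }

    cudaSetDevice(0);          // use device 0 for subsequent CUDA calls
    int current = -1;
    cudaGetDevice(&current);   // confirm which device is active
    printf("Using device %d\n", current);
    return 0;
}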