CUDA
Device pointers point to GPU memory:
- May be passed to/from host code
- May not be dereferenced in host code

Host pointers point to CPU memory:
- May be passed to/from device code
- May not be dereferenced in device code
In other words, pointers can be passed freely between Device and Host, but each side may only dereference its own kind. Note, however, that the memory a device pointer refers to is allocated from Host code, e.g. with cudaMalloc (a minimal sketch follows).
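For instance, a small sketch of the rule (the variable names here are only for illustration):

```
int h_x = 42;      // host variable
int *d_x;          // will hold a device address

cudaMalloc((void **)&d_x, sizeof(int));   // device memory allocated from host code
// *d_x = h_x;                            // WRONG: host code must not dereference a device pointer
cudaMemcpy(d_x, &h_x, sizeof(int), cudaMemcpyHostToDevice);  // correct: copy through the runtime

cudaFree(d_x);
```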
Notice that Device code is compiled by the NVIDIA compiler, while Host code is compiled by gcc; nvcc drives both. But there is no need to worry: nvcc automatically splits a *.cu file into Device and Host code and compiles each part accordingly. You only need to run `nvcc test.cu`.
```
/* test.cu */
// Device code: runs on the device (GPU),
// called from host code
__global__ void add(int *a, int *b, int *c){
    *c = *a + *b;
}

// Host code: runs on the CPU
int main(){
    int a, b, c;
    int *d_a, *d_b, *d_c;
    int size = sizeof(int);

    // Allocate space for the device copies on the GPU.
    /* Why (void **)&d_a instead of (void *)d_a?
       cudaMalloc must write the newly allocated device address into d_a itself,
       so it needs the address of the pointer, not its (uninitialized) value. */
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    a = 2;
    b = 7;

    // Copy inputs to the device:
    // copy what &a points to into what d_a points to
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    // Launch add() on the GPU with 1 block of 1 thread
    add<<<1,1>>>(d_a, d_b, d_c);

    // Copy the output back from the device:
    // copy what d_c points to into what &c points to
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
```
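The example above never prints c or checks return codes. A small, optional sketch of how one might verify the result (the error-checking style here is just a common convention, not part of the original example):

```
#include <stdio.h>

// ... after the cudaMemcpy back into &c ...
printf("%d + %d = %d\n", a, b, c);   // expect 2 + 7 = 9

// Most CUDA runtime calls return a cudaError_t that is worth checking:
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("CUDA error: %s\n", cudaGetErrorString(err));
```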
Blocks and Threads
```
// In this program we add two vectors with CUDA
#include <stdlib.h>

__global__ void add(int *a, int *b, int *c){
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
    /* or index with threadIdx.x if using threads */
    /* or index = threadIdx.x + blockIdx.x * blockDim.x if using blocks & threads */
}

#define N 512

// Fill an array with random integers
// (minimal definition of the helper the example assumes)
void random_ints(int *x, int n){
    for (int i = 0; i < n; i++)
        x[i] = rand();
}

int main(){
    int *a, *b, *c;          // host copies
    int *d_a, *d_b, *d_c;    // device copies
    int size = N*sizeof(int);

    // Allocate device memory
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Allocate host memory and set up input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);

    // Copy inputs to the device:
    // copy what a points to into what d_a points to
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch the add() kernel on the GPU
    // with N blocks of 1 thread each;
    add<<<N,1>>>(d_a, d_b, d_c);
    /* or <<<1,N>>> for 1 block of N threads */
    /* or <<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>> for blocks & threads */

    // Copy the result back:
    // copy what d_c points to into what c points to
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
```
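The comments above mention the blocks-and-threads variant. A sketch of that indexing, assuming a THREADS_PER_BLOCK of 128 (an arbitrary choice) and adding a bounds check for when N is not a multiple of the block size:

```
#define THREADS_PER_BLOCK 128

__global__ void add(int *a, int *b, int *c, int n){
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)                 // guard: the last block may have extra threads
        c[index] = a[index] + b[index];
}

// Launch with enough blocks to cover all N elements (grid size rounded up):
// add<<<(N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
```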
Why use threads?
Within a block, threads can share data via shared memory, which is extremely fast on-chip memory:
- In contrast to device memory, which is referred to as global memory
- Works like a user-managed cache
- Declared with the __shared__ qualifier in Device code
Example: 1D Stencil
Each output element is the sum of 7 input elements (a radius of 3 on each side), and each thread produces one output element. Procedure:
- Read BLOCK_SIZE + 2*radius input elements from global memory into shared memory
- Compute BLOCK_SIZE output elements
- Write BLOCK_SIZE output elements back to global memory
Remember to call __syncthreads() after loading shared memory and before using the data, so that every thread in the block has finished its loads.
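A minimal sketch of such a stencil kernel along the steps above (RADIUS, BLOCK_SIZE, and the boundary handling are assumptions here; the input is assumed to be padded by RADIUS elements on each side so the halo reads stay in bounds):

```
#define RADIUS 3
#define BLOCK_SIZE 128

__global__ void stencil_1d(int *in, int *out){
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read BLOCK_SIZE + 2*RADIUS input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS]     = in[gindex - RADIUS];      // left halo
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];  // right halo
    }

    // Make sure all loads have finished before any thread reads temp[]
    __syncthreads();

    // Each thread sums its 2*RADIUS + 1 = 7 neighbouring inputs
    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
        result += temp[lindex + offset];

    // Write BLOCK_SIZE output elements back to global memory
    out[gindex] = result;
}
```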
We can also query and select the device to run on:
- cudaGetDeviceCount(int *count)
- cudaSetDevice(int device)
- cudaGetDevice(int *device)
- cudaGetDeviceProperties(cudaDeviceProp *prop, int device)
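For example, a small sketch that lists the available devices and selects device 0 (the choice of device 0 is just for illustration):

```
#include <stdio.h>

int main(){
    int count = 0;
    cudaGetDeviceCount(&count);
    printf("Found %d CUDA device(s)\n", count);

    if (count > 0) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);   // query device 0
        printf("Device 0: %s\n", prop.name);
        cudaSetDevice(0);                    // run subsequent CUDA calls on device 0
    }
    return 0;
}
```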