Hello World
#include <stdio.h>
#include <cuda.h>
__global__ void helloWorldKernel()
{
    // index of this thread within its block
    int t = threadIdx.x;
    // index of the block within the grid
    int b = blockIdx.x;
    // total number of blocks in the grid
    int B = gridDim.x;
    printf("Hello world from thread %d and block %d out of %d\n", t, b, B);
}
int main()
{
    printf("Hello World\n");
    // queue the kernel on the GPU: a grid of 4 thread blocks, 64 threads each
    helloWorldKernel <<< 4, 64 >>> ();
    // the launch is asynchronous, so block until the kernel has run
    cudaDeviceSynchronize();
    return 0;
}
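Host and device code live in the same .cu file, and nvcc compiles both. Assuming the program above is saved as hello.cu:
nvcc hello.cu -o hello
./hello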
Asynchronously filling an array
- The GPU has its own memory, physically separate from the CPU's
- Thus the CPU can't directly share data with the GPU; the data has to be copied over
- This means the speedup from running on the GPU must offset the time spent copying the data over
#include "cuda.h"
#include <stdio.h>
__global__ void fillKernel(int N, float val, float *c_x)
{
    int t = threadIdx.x; // thread index within the block
    int b = blockIdx.x;  // block index within the grid
    int B = blockDim.x;  // number of threads per block
    // map t, b, and B to a unique global index
    int n = t + b*B;
    if (n < N)
        c_x[n] = val;
}
int main(int argc, char *argv[])
{
    if (argc < 2)
    {
        printf("usage: ./array N\n");
        return 1;
    }
    int N = atoi(argv[1]);
    // allocate and fill an array on the CPU (HOST)
    // h_ prefixes host pointers
    float *h_x = (float*)malloc(N * sizeof(float));
    float val = 1.234f;
    for (int i = 0; i < N; ++i)
    {
        h_x[i] = val;
    }
    // c_ prefixes device (CUDA) pointers; this memory lives on the GPU
    float *c_x;
    cudaMalloc(&c_x, N*sizeof(float));
    int B = 256; // threads per block
    // round up so that G*B >= N: integer arithmetic for G = ceil(N/B)
    // e.g. N = 1000, B = 256 gives G = 4 and G*B = 1024
    int G = (N + B - 1)/B;
    fillKernel <<< G, B >>> (N, val, c_x);
    // copy data from c_x back to h_x
    // cudaMemcpy waits for prior work on the default stream, so no explicit sync is needed
    cudaMemcpy(h_x, c_x, N*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i) {
        printf("h_x[%d] = %g\n", i, h_x[i]);
    }
    printf("N = %d, G = %d, B = %d, G*B = %d\n", N, G, B, G*B);
    cudaFree(c_x);
    free(h_x);
    return 0;
}
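Each CUDA runtime call above returns a cudaError_t that the example silently discards. A minimal sketch of a checking macro (the name CHECK is our own, not part of CUDA):
#define CHECK(call) \
do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(1); \
    } \
} while (0)
// usage:
// CHECK(cudaMalloc(&c_x, N*sizeof(float)));
// CHECK(cudaMemcpy(h_x, c_x, N*sizeof(float), cudaMemcpyDeviceToHost));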
Timing
- Queuing up a kernel launch has a significant fixed overhead, which may not be worth paying if the same work runs quickly enough on the CPU
- Amortize this cost by queuing up many kernel launches back to back instead of synchronizing after each one; see the sketch below
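A minimal sketch of measuring this with CUDA events, reusing fillKernel, N, val, c_x, G, and B from the example above; recording many launches between the two events averages out the per-launch overhead:
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    for (int i = 0; i < 100; ++i)
    {
        // launches only enqueue work; they return immediately
        fillKernel <<< G, B >>> (N, val, c_x);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop); // block until everything queued has finished
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("100 launches took %g ms (%g ms per launch)\n", ms, ms/100);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);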
Cuda Debugging
cuda-memcheck ./program args # check for out-of-bounds and misaligned memory accesses
nvcc -g -G # -g: host debugging info, -G: device debugging info
cuda-gdb ./program
set cuda memcheck on
cuda thread 2 # switch focus to thread 2
cuda block 6 # switch focus to block 6
l # list source around the current line
where # show the backtrace at the point of the error
nvprof --metrics dram_read_throughput,dram_write_throughput ./cudaAddVectors args # per-kernel DRAM throughput
nvprof --query-metrics # list all available metrics
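For plain timings rather than hardware metrics, running nvprof with no extra flags prints a summary table; for example:
nvprof ./program args # time spent in each kernel and in each memcpy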