GPU programming with HIP
2026-05
CSC Training


my_kernel<<<grid, block, 0, 0>>>(...)
hipMalloc, hipMemcpy, hipFree, ...

streamstream

hipMalloc, hipMemcpy, hipFree, ...: append Async to the name and add hipStream_t
as the last argument for the asynchronous version:
hipMalloc(...) ⟶
hipMallocAsync(..., hipStream_t stream)
my_kernel<<<grid, block, 0, stream>>>(...)
hipLaunchKernelGGL(my_kernel, grid, block, 0, stream, ...)
my_kernel<<<grid, block, 0, 0>>>(...)
hipStream_t stream[3];
for (int i = 0; i<3; ++i)
hipStreamCreate(&stream[i]);
for (int i = 0; i < 3; ++i) {
hipMemcpyAsync(d_data[i], h_data[i], bytes,
hipMemcpyHostToDevice, stream[i]);
hipkernel<<<grid, block, 0, stream[i]>>>
(d_data[i], i);
hipMemcpyAsync(h_data[i], d_data[i], bytes,
hipMemcpyDeviceToHost, stream[i]);
}
for(int i = 0; i<3; ++i) {
hipStreamSynchronize(stream[i]);
hipStreamDestroy(stream[i]); }

hipError_t hipEventQuery(hipEvent_t event):
returns hipSuccess if the event has completed, hipErrorNotReady otherwise
Measure how fast the host places tasks into the stream:
// Start timed GPU kernel
clock_t start_kernel_clock = clock();
kernel<<<gridsize, blocksize, 0, stream>>>(d_a, n_total);
// Start timed device-to-host memcopy
clock_t start_d2h_clock = clock();
hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream);
// Stop timing
clock_t stop_clock = clock();
hipStreamSynchronize(stream);
Measure duration of tasks on GPU:
// Start timed GPU kernel
hipEventRecord(start_kernel_event, stream);
kernel<<<gridsize, blocksize, 0, stream>>>(d_a, n_total);
// Start timed device-to-host memcopy
hipEventRecord(start_d2h_event, stream);
hipMemcpyAsync(a, d_a, bytes, hipMemcpyDeviceToHost, stream);
// Stop timing
hipEventRecord(stop_event, stream);
hipEventSynchronize(stop_event);

| Description | API call |
|---|---|
| Initialize event object | hipEventCreate(hipEvent_t* event) |
| Record an event in the stream | hipEventRecord(hipEvent_t event, hipStream_t stream) |
| Elapsed time (ms) between start and end | hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t end) |
| Make stream wait for event | hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags = 0) |
| Wait for event to complete | hipEventSynchronize(hipEvent_t event) |
| Destroy event object | hipEventDestroy(hipEvent_t event) |
hipError_t
__syncthreads() is only for in-kernel synchronization between threads in the
same block (it does not synchronize threads across blocks)