gpu-benchmarking
You are gpu-benchmarking - a specialized skill for automated GPU performance benchmarking and regression detection. This skill provides expert capabilities for measuring, analyzing, and tracking GPU kernel performance over time.
Overview
This skill enables AI-powered GPU benchmarking operations including:
- Designing micro-benchmarks for kernel operations
- Measuring kernel execution time with CUDA events
- Calculating achieved vs theoretical performance
- Generating performance comparison reports
- Detecting performance regressions in CI/CD
- Profiling power and thermal characteristics
- Benchmarking memory bandwidth and latency
- Creating reproducible benchmark configurations
Prerequisites
- NVIDIA CUDA Toolkit 11.0+
- GPU with performance counters support
- nvidia-smi for power/thermal monitoring
- Optional: Nsight Systems/Compute for detailed profiling
- CI/CD system for regression tracking
Capabilities
1. CUDA Event Timing
Precise kernel execution time measurement:
// Benchmark timing wrapper
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Warm-up run
myKernel<<<grid, block>>>(args);
cudaDeviceSynchronize();
// Timed runs
cudaEventRecord(start);
for (int i = 0; i < NUM_ITERATIONS; i++) {
myKernel<<<grid, block>>>(args);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
float avg_ms = milliseconds / NUM_ITERATIONS;
printf("Average kernel time: %.3f ms\n", avg_ms);
printf("Throughput: %.2f GB/s\n", (data_size_bytes / 1e9) / (avg_ms / 1000));
cudaEventDestroy(start);
cudaEventDestroy(stop);
2. Comprehensive Benchmark Framework
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
struct BenchmarkResult {
float min_ms;
float max_ms;
float mean_ms;
float median_ms;
float stddev_ms;
float throughput_gbps;
float achieved_flops;
int iterations;
};
template <typename KernelFunc>
BenchmarkResult benchmark_kernel(
KernelFunc kernel,
dim3 grid, dim3 block,
size_t data_bytes,
size_t flop_count,
int warmup = 10,
int iterations = 100
) {
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Warm-up
for (int i = 0; i < warmup; i++) {
kernel<<<grid, block>>>();
}
cudaDeviceSynchronize();
// Collect timing samples
std::vector<float> times(iterations);
for (int i = 0; i < iterations; i++) {
cudaEventRecord(start);
kernel<<<grid, block>>>();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(×[i], start, stop);
}
// Calculate statistics
std::sort(times.begin(), times.end());
BenchmarkResult result;
result.iterations = iterations;
result.min_ms = times[0];
result.max_ms = times[iterations - 1];
result.median_ms = times[iterations / 2];
float sum = 0, sq_sum = 0;
for (float t : times) {
sum += t;
sq_sum += t * t;
}
result.mean_ms = sum / iterations;
result.stddev_ms = std::sqrt(sq_sum / iterations - result.mean_ms * result.mean_ms);
result.throughput_gbps = (data_bytes / 1e9) / (result.median_ms / 1000);
result.achieved_flops = (flop_count / 1e12) / (result.median_ms / 1000); // TFLOPS
cudaEventDestroy(start);
cudaEventDestroy(stop);
return result;
}
3. Roofline Model Analysis
Calculate theoretical vs achieved performance:
struct RooflineMetrics {
// Hardware limits
float peak_memory_bandwidth_gbps;
float peak_flops_tflops;
// Kernel characteristics
float arithmetic_intensity; // FLOPS / Bytes
float achieved_flops_tflops;
float achieved_bandwidth_gbps;
// Efficiency
float compute_efficiency; // % of peak FLOPS
float bandwidth_efficiency; // % of peak bandwidth
bool is_compute_bound;
};
RooflineMetrics calculate_roofline(
BenchmarkResult& result,
size_t flop_count,
size_t bytes_accessed,
cudaDeviceProp& props
) {
RooflineMetrics metrics;
// Get hardware specs
metrics.peak_memory_bandwidth_gbps =
(props.memoryBusWidth / 8.0) * (props.memoryClockRate / 1e6) * 2; // DDR
metrics.peak_flops_tflops =
(props.multiProcessorCount * props.maxThreadsPerMultiProcessor *
props.clockRate / 1e9) * 2; // FMA = 2 FLOPS
// Calculate arithmetic intensity
metrics.arithmetic_intensity = (float)flop_count / bytes_accessed;
// Achieved performance
metrics.achieved_flops_tflops = result.achieved_flops;
metrics.achieved_bandwidth_gbps = result.throughput_gbps;
// Determine boundedness
float ridge_point = metrics.peak_flops_tflops / metrics.peak_memory_bandwidth_gbps;
metrics.is_compute_bound = metrics.arithmetic_intensity > ridge_point;
// Calculate efficiency
if (metrics.is_compute_bound) {
metrics.compute_efficiency =
(metrics.achieved_flops_tflops / metrics.peak_flops_tflops) * 100;
} else {
metrics.bandwidth_efficiency =
(metrics.achieved_bandwidth_gbps / metrics.peak_memory_bandwidth_gbps) * 100;
}
return metrics;
}
4. Memory Bandwidth Benchmark
// Global memory bandwidth test
__global__ void bandwidthTestCopy(float* dst, const float* src, size_t n) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = idx; i < n; i += stride) {
dst[i] = src[i];
}
}
__global__ void bandwidthTestRead(float* dst, const float* src, size_t n) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
float sum = 0.0f;
for (size_t i = idx; i < n; i += stride) {
sum += src[i];
}
// Prevent optimization
if (idx == 0) dst[0] = sum;
}
void benchmark_memory_bandwidth(size_t size_mb) {
size_t size = size_mb * 1024 * 1024;
size_t n = size / sizeof(float);
float *d_src, *d_dst;
cudaMalloc(&d_src, size);
cudaMalloc(&d_dst, size);
int blocks = 256;
int threads = 256;
// Copy bandwidth (read + write)
auto copy_result = benchmark_kernel(
[=]() { bandwidthTestCopy<<<blocks, threads>>>(d_dst, d_src, n); },
dim3(blocks), dim3(threads),
size * 2, // Read + Write
0
);
printf("Copy Bandwidth: %.2f GB/s\n", copy_result.throughput_gbps);
// Read bandwidth
auto read_result = benchmark_kernel(
[=]() { bandwidthTestRead<<<blocks, threads>>>(d_dst, d_src, n); },
dim3(blocks), dim3(threads),
size, // Read only
0
);
printf("Read Bandwidth: %.2f GB/s\n", read_result.throughput_gbps);
cudaFree(d_src);
cudaFree(d_dst);
}
5. Latency Benchmark
// Memory latency measurement using pointer chasing
__global__ void pointerChase(int* ptr, int* result, int iterations) {
int idx = 0;
for (int i = 0; i < iterations; i++) {
idx = ptr[idx];
}
*result = idx; // Prevent optimization
}
float measure_memory_latency() {
const int N = 1024 * 1024; // 4MB
int* h_ptr = new int[N];
// Create random chase pattern
std::vector<int> indices(N);
std::iota(indices.begin(), indices.end(), 0);
std::random_shuffle(indices.begin() + 1, indices.end());
for (int i = 0; i < N - 1; i++) {
h_ptr[indices[i]] = indices[i + 1];
}
h_ptr[indices[N - 1]] = indices[0];
int *d_ptr, *d_result;
cudaMalloc(&d_ptr, N * sizeof(int));
cudaMalloc(&d_result, sizeof(int));
cudaMemcpy(d_ptr, h_ptr, N * sizeof(int), cudaMemcpyHostToDevice);
// Measure latency
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
const int ITERATIONS = 10000;
cudaEventRecord(start);
pointerChase<<<1, 1>>>(d