gpu-benchmarking

You are gpu-benchmarking - a specialized skill for automated GPU performance benchmarking and regression detection. This skill provides expert capabilities for measuring, analyzing, and tracking GPU kernel performance over time.

Overview

This skill enables AI-powered GPU benchmarking operations including:

Designing micro-benchmarks for kernel operations
Measuring kernel execution time with CUDA events
Calculating achieved vs theoretical performance
Generating performance comparison reports
Detecting performance regressions in CI/CD
Profiling power and thermal characteristics
Benchmarking memory bandwidth and latency
Creating reproducible benchmark configurations

Prerequisites

NVIDIA CUDA Toolkit 11.0+
GPU with performance counters support
nvidia-smi for power/thermal monitoring
Optional: Nsight Systems/Compute for detailed profiling
CI/CD system for regression tracking

Capabilities

1. CUDA Event Timing

Precise kernel execution time measurement:

// Benchmark timing wrapper
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Warm-up run
myKernel<<<grid, block>>>(args);
cudaDeviceSynchronize();

// Timed runs
cudaEventRecord(start);
for (int i = 0; i < NUM_ITERATIONS; i++) {
    myKernel<<<grid, block>>>(args);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
float avg_ms = milliseconds / NUM_ITERATIONS;

printf("Average kernel time: %.3f ms\n", avg_ms);
printf("Throughput: %.2f GB/s\n", (data_size_bytes / 1e9) / (avg_ms / 1000));

cudaEventDestroy(start);
cudaEventDestroy(stop);

2. Comprehensive Benchmark Framework

#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>

struct BenchmarkResult {
    float min_ms;
    float max_ms;
    float mean_ms;
    float median_ms;
    float stddev_ms;
    float throughput_gbps;
    float achieved_flops;
    int iterations;
};

template <typename KernelFunc>
BenchmarkResult benchmark_kernel(
    KernelFunc kernel,
    dim3 grid, dim3 block,
    size_t data_bytes,
    size_t flop_count,
    int warmup = 10,
    int iterations = 100
) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Warm-up
    for (int i = 0; i < warmup; i++) {
        kernel<<<grid, block>>>();
    }
    cudaDeviceSynchronize();

    // Collect timing samples
    std::vector<float> times(iterations);
    for (int i = 0; i < iterations; i++) {
        cudaEventRecord(start);
        kernel<<<grid, block>>>();
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&times[i], start, stop);
    }

    // Calculate statistics
    std::sort(times.begin(), times.end());

    BenchmarkResult result;
    result.iterations = iterations;
    result.min_ms = times[0];
    result.max_ms = times[iterations - 1];
    result.median_ms = times[iterations / 2];

    float sum = 0, sq_sum = 0;
    for (float t : times) {
        sum += t;
        sq_sum += t * t;
    }
    result.mean_ms = sum / iterations;
    result.stddev_ms = std::sqrt(sq_sum / iterations - result.mean_ms * result.mean_ms);

    result.throughput_gbps = (data_bytes / 1e9) / (result.median_ms / 1000);
    result.achieved_flops = (flop_count / 1e12) / (result.median_ms / 1000);  // TFLOPS

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return result;
}

3. Roofline Model Analysis

Calculate theoretical vs achieved performance:

struct RooflineMetrics {
    // Hardware limits
    float peak_memory_bandwidth_gbps;
    float peak_flops_tflops;

    // Kernel characteristics
    float arithmetic_intensity;  // FLOPS / Bytes
    float achieved_flops_tflops;
    float achieved_bandwidth_gbps;

    // Efficiency
    float compute_efficiency;    // % of peak FLOPS
    float bandwidth_efficiency;  // % of peak bandwidth
    bool is_compute_bound;
};

RooflineMetrics calculate_roofline(
    BenchmarkResult& result,
    size_t flop_count,
    size_t bytes_accessed,
    cudaDeviceProp& props
) {
    RooflineMetrics metrics;

    // Get hardware specs
    metrics.peak_memory_bandwidth_gbps =
        (props.memoryBusWidth / 8.0) * (props.memoryClockRate / 1e6) * 2;  // DDR
    metrics.peak_flops_tflops =
        (props.multiProcessorCount * props.maxThreadsPerMultiProcessor *
         props.clockRate / 1e9) * 2;  // FMA = 2 FLOPS

    // Calculate arithmetic intensity
    metrics.arithmetic_intensity = (float)flop_count / bytes_accessed;

    // Achieved performance
    metrics.achieved_flops_tflops = result.achieved_flops;
    metrics.achieved_bandwidth_gbps = result.throughput_gbps;

    // Determine boundedness
    float ridge_point = metrics.peak_flops_tflops / metrics.peak_memory_bandwidth_gbps;
    metrics.is_compute_bound = metrics.arithmetic_intensity > ridge_point;

    // Calculate efficiency
    if (metrics.is_compute_bound) {
        metrics.compute_efficiency =
            (metrics.achieved_flops_tflops / metrics.peak_flops_tflops) * 100;
    } else {
        metrics.bandwidth_efficiency =
            (metrics.achieved_bandwidth_gbps / metrics.peak_memory_bandwidth_gbps) * 100;
    }

    return metrics;
}

4. Memory Bandwidth Benchmark

// Global memory bandwidth test
__global__ void bandwidthTestCopy(float* dst, const float* src, size_t n) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;

    for (size_t i = idx; i < n; i += stride) {
        dst[i] = src[i];
    }
}

__global__ void bandwidthTestRead(float* dst, const float* src, size_t n) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;

    float sum = 0.0f;
    for (size_t i = idx; i < n; i += stride) {
        sum += src[i];
    }
    // Prevent optimization
    if (idx == 0) dst[0] = sum;
}

void benchmark_memory_bandwidth(size_t size_mb) {
    size_t size = size_mb * 1024 * 1024;
    size_t n = size / sizeof(float);

    float *d_src, *d_dst;
    cudaMalloc(&d_src, size);
    cudaMalloc(&d_dst, size);

    int blocks = 256;
    int threads = 256;

    // Copy bandwidth (read + write)
    auto copy_result = benchmark_kernel(
        [=]() { bandwidthTestCopy<<<blocks, threads>>>(d_dst, d_src, n); },
        dim3(blocks), dim3(threads),
        size * 2,  // Read + Write
        0
    );

    printf("Copy Bandwidth: %.2f GB/s\n", copy_result.throughput_gbps);

    // Read bandwidth
    auto read_result = benchmark_kernel(
        [=]() { bandwidthTestRead<<<blocks, threads>>>(d_dst, d_src, n); },
        dim3(blocks), dim3(threads),
        size,  // Read only
        0
    );

    printf("Read Bandwidth: %.2f GB/s\n", read_result.throughput_gbps);

    cudaFree(d_src);
    cudaFree(d_dst);
}

5. Latency Benchmark

// Memory latency measurement using pointer chasing
__global__ void pointerChase(int* ptr, int* result, int iterations) {
    int idx = 0;
    for (int i = 0; i < iterations; i++) {
        idx = ptr[idx];
    }
    *result = idx;  // Prevent optimization
}

float measure_memory_latency() {
    const int N = 1024 * 1024;  // 4MB
    int* h_ptr = new int[N];

    // Create random chase pattern
    std::vector<int> indices(N);
    std::iota(indices.begin(), indices.end(), 0);
    std::random_shuffle(indices.begin() + 1, indices.end());

    for (int i = 0; i < N - 1; i++) {
        h_ptr[indices[i]] = indices[i + 1];
    }
    h_ptr[indices[N - 1]] = indices[0];

    int *d_ptr, *d_result;
    cudaMalloc(&d_ptr, N * sizeof(int));
    cudaMalloc(&d_result, sizeof(int));
    cudaMemcpy(d_ptr, h_ptr, N * sizeof(int), cudaMemcpyHostToDevice);

    // Measure latency
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int ITERATIONS = 10000;

    cudaEventRecord(start);
    pointerChase<<<1, 1>>>(d

gpu-benchmarking

How to add

Drop this on your repo README

Related skills

internal-comms

babysit

do

smart-explore

Get new DevOps e Infra skills every Monday

gpu-benchmarking

Overview

Prerequisites

Capabilities

1. CUDA Event Timing

2. Comprehensive Benchmark Framework

3. Roofline Model Analysis

4. Memory Bandwidth Benchmark

5. Latency Benchmark

Comments · No comments