[CK_BUILDER] Integrate CKB validation with CK verification (#3649)

* ck-builder: tensor copy function This function copies one tensor to another, so that the memory layout can be changed between them. * ck-builder: fix ck::bhalf literals These types don't work properly. * ck-builder: abstract compare_elements in gpu_verification.hpp and make builder use it This reduces the amount of duplicated code a bit. * ck-builder: add flat tensor iterator This "iterator" type pretends to be a pointer, useful for passing tensors to functions expecting pointer-like types. * ck-builder: integrate validation with ck gpu verification By templating the gpu_verify function over iterators, we can use the new FlatTensorIterator to adapt the function to multi- dimensional tensors without changing either implementation too much. * ck-builder: add check_by_accumulations This changes the gpu_verification.hpp code to also accept "iterator" types for the relevant gpu_verify and gpu_reduce_max functions. * ck: fix test_gpu_verification GenerateRandomData for bhalf is_integer_it<bhalf_t> yields true, but it is not actually an integer. * ck: make gpu_verification kernels be proper persistent kernels Previously these were using a hardcoded value for the grid size. This commit changes that so that the grid size is automatically derived from the kernel's occupancy and the number of multiprocessors on the GPU. * ck: clean up gpu_verification.hpp using block_reduce This implements a small generic block reduce function, and rewrites the rest of gpu_verification.hpp using that function to clean it up a bit. * ck-builder: doc typos * ck-builder: update testing readme with validation interface. * ck-builder: rebase fixes + review comments * ck-builder: fix device integer generation with float types Passing bfloat here causes a nans due to type_convert performing a bitcast. * ck: another bhalf_t bug CK expects that int-generation with ck::bhalf_t yields bhalf integers, not unsigned integers. This makes the logic of FillUniformRandInteger compatible with GeneratorTensor_2<InDataType>, however idiotic that may be.
2026-04-20 06:49:15 +00:00 · 2026-01-28 17:41:02 +01:00
parent d6cccf6093
commit 42048bdb7d
11 changed files with 636 additions and 291 deletions
--- a/include/ck/library/utility/device_tensor_generator.hpp
+++ b/include/ck/library/utility/device_tensor_generator.hpp
@@ -67,8 +67,12 @@ __global__ void fill_tensor_uniform_rand_int_values(T* p,
        }
        else
        {
-            p[i] = ck::type_convert<T, int>(
-                static_cast<int>((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value);
+            const auto value =
+                static_cast<int>((ran_gen_round_u32(s)) % (max_value - min_value)) + min_value;
+            if constexpr(std::is_integral_v<T> && !std::is_same_v<T, ck::bhalf_t>)
+                p[i] = ck::type_convert<T, int>(value);
+            else
+                p[i] = ck::type_convert<T, float>(value);
        }
    }
 }
--- a/include/ck/library/utility/gpu_verification.hpp
+++ b/include/ck/library/utility/gpu_verification.hpp
@@ -5,10 +5,15 @@

 #include <iomanip>
 #include <iostream>
+#include <tuple>
+#include <type_traits>
+#include <cmath>
+#include <array>

 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type_convert.hpp"
 #include "ck/utility/type.hpp"
+#include "ck/utility/env.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -106,6 +111,102 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1)
    }
 }

+/// @brief Turn an iterator type into an iterator that can be dereferenced.
+///
+/// In gpu_verify and gpu_reduce_max, it is valid to pass a void pointer and
+/// have the function automatically derive the "concrete" pointer type to
+/// be used in the kernel. This function does that: depending on whether
+/// the `Iterator` is a void pointer or not, it returns either the iterator
+/// (assuming that it is already concrete), or returns the pointer casted
+/// to the concrete type.
+///
+/// @tparam T The value type of the pointer, when dereferenced.
+/// @tparam Iterator The abstract iterator, can be void* or an actual pointer.
+///
+/// @param it The iterator to make concrete.
+template <typename T, typename Iterator>
+__device__ Iterator make_concrete_iterator(Iterator it)
+{
+    return it;
+}
+
+template <typename T>
+__device__ const T* make_concrete_iterator(const void* it)
+{
+    return reinterpret_cast<const T*>(it);
+}
+
+template <typename T>
+__device__ const T* make_concrete_iterator(void* it)
+{
+    return reinterpret_cast<const T*>(it);
+}
+
+/// @brief Utility to launch persistent kernels.
+///
+/// This function launches a GPU kernel with a grid size derived from the kernel's
+/// occupancy and the total number of multiprocessors on the GPU.
+///
+/// @tparam Kernel The type of the kernel function.
+/// @tparam Args The types of the kernel arguments.
+///
+/// @param kernel An instance of the kernel function. This should be a __global__ function.
+/// @param block_size The kernel's (1D) block size.
+/// @param stream The stream to launch the kernel on.
+/// @param args The kernel launch arguments.
+template <typename Kernel, typename... Args>
+void launch_persistent_kernel(Kernel kernel,
+                              int block_size,
+                              hipStream_t stream,
+                              const Args&... args)
+{
+    int occupancy;
+    hip_check_error(
+        hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, block_size, 0));
+
+    int device;
+    hip_check_error(hipGetDevice(&device));
+
+    int multiprocessors;
+    hip_check_error(
+        hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
+
+    kernel<<<occupancy * multiprocessors, block_size, 0, stream>>>(args...);
+
+    hip_check_error(hipGetLastError());
+}
+
+/// @brief Simple block reduce kernel.
+///
+/// This function reduces all `value`s across a block according to `reduce`. This function
+/// is a relatively simple implementation as its primary purpose is to be correct and
+/// readable: No special cases are done for warp reductions, and the function allocates
+/// its own shared memory. The result is broadcasted to all threads.
+///
+/// @tparam BlockSize The number of threads in a block.
+/// @tparam T The value type to reduce over.
+/// @tparam F The reduction functor type.
+///
+/// @param value This thread's value to reduce over.
+/// @param reduce The reduction functor, used to combine two values. Should be associative.
+template <int BlockSize, typename T, typename F>
+__device__ T block_reduce(const T& value, F reduce)
+{
+    __shared__ T workspace[BlockSize];
+
+    workspace[threadIdx.x] = value;
+    __syncthreads();
+
+    for(unsigned int s = BlockSize / 2; s >= 1; s >>= 1)
+    {
+        if(threadIdx.x < s)
+            workspace[threadIdx.x] = reduce(workspace[threadIdx.x], workspace[threadIdx.x + s]);
+        __syncthreads();
+    }
+
+    return workspace[0];
+}
+
 // Device-side result structure for kernel output
 // Packed into a single struct to minimize device memory allocations
 struct GpuVerifyDeviceResult
@@ -113,121 +214,142 @@ struct GpuVerifyDeviceResult
    unsigned long long error_count; // Number of errors found
    float max_error;                // Maximum error value
    int all_zero;                   // 1 = device result is all zeros, 0 = has non-zero values
+
+    /// @brief Return the neutral element of a GpuVerifyDeviceResult
+    ///
+    /// This function returns the "neutral element", the element which does nothing
+    /// when reduced with another with `reduce_results`. Good to be used as an
+    /// initial value.
+    __host__ __device__ static GpuVerifyDeviceResult identity()
+    {
+        GpuVerifyDeviceResult result;
+        result.error_count = 0;    // No errors yet
+        result.max_error   = 0.0f; // No error observed
+        result.all_zero    = 1;    // Start assuming all zeros (will be cleared if nonzero found)
+        return result;
+    }
 };

+/// @brief Combine two device verify results.
+///
+/// This function returns the "combined" version of two GpuVerifyDeviceResult values, which
+/// adds the total amount of errors, sets the correct max error, and records whether
+/// any of the values had any zeros.
+__device__ GpuVerifyDeviceResult reduce_results(const GpuVerifyDeviceResult& a,
+                                                const GpuVerifyDeviceResult& b)
+{
+    GpuVerifyDeviceResult result;
+    result.error_count = a.error_count + b.error_count;
+    result.max_error   = std::max(a.max_error, b.max_error);
+    result.all_zero    = a.all_zero & b.all_zero;
+    return result;
+}
+
+/// @brief Compare individual tensor elements.
+///
+/// This function is what actually does the comparison between two tensor
+/// elements. The function returns a tuple of three elements.
+/// - The absolute maximum difference.
+/// - If the second value is set to false, it indicates either that the elements are not
+///   equal according to the thresholds `rtol` and `atol`, or that either value is not
+///   finite (NaN/Infinity). If set to true, the values are considered equal.
+/// - If the third value is set to true, it indicates that both elements are bitwise
+///   equal to zero.
+template <typename T>
+__device__ std::tuple<float, bool, bool>
+compare_elements(const T& actual, const T& expected, const float rtol, const float atol)
+{
+    static_assert(!std::is_same_v<T, double>, "TODO: implement compare_elements() for double");
+
+    const auto o = type_convert<float>(actual);
+    const auto r = type_convert<float>(expected);
+    const auto e = std::abs(o - r);
+
+    const auto inequal = e > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r);
+
+    using Bytes        = std::array<std::byte, sizeof(T)>;
+    const auto o_bytes = *reinterpret_cast<const Bytes*>(&actual);
+    const auto r_bytes = *reinterpret_cast<const Bytes*>(&expected);
+    bool all_zero      = true;
+    for(const auto x : o_bytes)
+    {
+        if(x != std::byte{0})
+            all_zero = false;
+    }
+
+    for(const auto x : r_bytes)
+    {
+        if(x != std::byte{0})
+            all_zero = false;
+    }
+
+    return std::make_tuple(e, inequal, all_zero);
+}
+
 // GPU verification kernel - compares device result against reference using relative and absolute
 // tolerance. Tracks all errors (no early exit) to provide detailed error reporting.
 //
 // Uses LDS (shared memory) for block-level reduction to minimize atomic contention.
 // This reduces atomic operations from O(errors) to O(blocks), providing massive speedup
 // when there are many errors.
-//
-// Assumption: Block size is 256
-template <typename T>
-__global__ void gpu_verify_kernel(const T* __restrict__ device_result,
-                                  const T* __restrict__ reference_result,
-                                  float rtol,
-                                  float atol,
-                                  long long size,
-                                  GpuVerifyDeviceResult* result)
+template <int BlockSize, typename T, typename IteratorA, typename IteratorB>
+__global__ __launch_bounds__(BlockSize) //
+    void gpu_verify_kernel(IteratorA device_result_it,
+                           IteratorB reference_result_it,
+                           float rtol,
+                           float atol,
+                           long long size,
+                           GpuVerifyDeviceResult* result)
 {
-    constexpr int block_size = 256;
+    auto device_result    = make_concrete_iterator<T>(device_result_it);
+    auto reference_result = make_concrete_iterator<T>(reference_result_it);

-    // Shared memory for block-level reduction
-    __shared__ unsigned long long shared_error_count[block_size];
-    __shared__ float shared_max_error[block_size];
-    __shared__ int shared_has_error[block_size];
-    __shared__ int shared_has_nonzero[block_size];
-
-    // Thread-local accumulators (in registers)
-    unsigned long long local_error_count = 0;
-    float local_max_error                = 0.0f;
-    int local_has_error                  = 0;
-    int local_has_nonzero                = 0;
+    auto local_result = GpuVerifyDeviceResult::identity();

    // Grid-stride loop to handle any tensor size
-    long long idx    = blockIdx.x * blockDim.x + threadIdx.x;
-    long long stride = blockDim.x * gridDim.x;
+    long long idx    = blockIdx.x * BlockSize + threadIdx.x;
+    long long stride = BlockSize * gridDim.x;

    for(long long i = idx; i < size; i += stride)
    {
-        // Convert to float for comparison
-        float dev_val = type_convert<float>(device_result[i]);
-        float ref_val = type_convert<float>(reference_result[i]);
+        const auto [abs_diff, inequal, bitwise_zero] =
+            compare_elements(device_result[i], reference_result[i], rtol, atol);

-        // Check if device value is non-zero
-        if(dev_val != 0.0f)
-        {
-            local_has_nonzero = 1;
-        }
-
-        // Compute absolute difference
-        float abs_diff = fabsf(dev_val - ref_val);
-
-        // Check tolerance (matches CPU check_err logic: err > atol + rtol * abs(ref))
-        if(abs_diff > atol + rtol * fabsf(ref_val))
-        {
-            local_has_error = 1;
-            local_error_count++;
-            local_max_error = fmaxf(local_max_error, abs_diff);
-        }
+        local_result = reduce_results(local_result,
+                                      GpuVerifyDeviceResult{
+                                          static_cast<uint64_t>(inequal), // error_count
+                                          abs_diff,                       // max_error
+                                          bitwise_zero                    // all_zero
+                                      });
    }

-    // Store thread-local results to shared memory
-    shared_error_count[threadIdx.x] = local_error_count;
-    shared_max_error[threadIdx.x]   = local_max_error;
-    shared_has_error[threadIdx.x]   = local_has_error;
-    shared_has_nonzero[threadIdx.x] = local_has_nonzero;
-    __syncthreads();
-
-    // Block-level reduction: 256 -> 128 -> 64 -> 32
-    for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
-    {
-        if(threadIdx.x < s)
-        {
-            shared_error_count[threadIdx.x] += shared_error_count[threadIdx.x + s];
-            shared_max_error[threadIdx.x] =
-                fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]);
-            shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s];
-            shared_has_nonzero[threadIdx.x] |= shared_has_nonzero[threadIdx.x + s];
-        }
-        __syncthreads();
-    }
+    const auto block_result = block_reduce<BlockSize>(local_result, reduce_results);

    // Final reduction of remaining 32 elements in thread 0
    if(threadIdx.x == 0)
    {
-        for(int i = 1; i < 32; ++i)
+        // Single atomic update per block (reduces contention from O(errors) to O(blocks))
+        if(block_result.error_count > 0)
        {
-            shared_error_count[0] += shared_error_count[i];
-            shared_max_error[0] = fmaxf(shared_max_error[0], shared_max_error[i]);
-            shared_has_error[0] |= shared_has_error[i];
-            shared_has_nonzero[0] |= shared_has_nonzero[i];
+            atomicAdd(&result->error_count, block_result.error_count);
+            atomicMax(&result->max_error, block_result.max_error);
        }

-        // Single atomic update per block (reduces contention from O(errors) to O(blocks))
-        if(shared_has_error[0])
+        if(!block_result.all_zero)
        {
-            atomicAdd(&result->error_count, shared_error_count[0]);
-            atomicMax(&result->max_error, shared_max_error[0]);
-        }
-        // Update all_zero flag: if no nonzero values found, mark as all zero
-        if(!shared_has_nonzero[0])
-        {
-            atomicMin(&result->all_zero, 1);
-        }
-        else
-        {
-            atomicMin(&result->all_zero, 0);
+            // A nonzero was found, so set the global value to false.
+            // Note: this is a benign race condition; technically a race condition but
+            // all blocks write the same value, so its fine.
+            result->all_zero = 0;
        }
    }
 }

 // Host-side wrapper for GPU verification with explicit tolerances
 // Returns GpuVerifyResult with detailed error information
-template <typename T>
-GpuVerifyResult gpu_verify(const void* device_result,
-                           const void* reference_result,
+template <typename T, typename IteratorA, typename IteratorB>
+GpuVerifyResult gpu_verify(IteratorA device_result,
+                           IteratorB reference_result,
                           float rtol,
                           float atol,
                           std::size_t size,
@@ -238,31 +360,25 @@ GpuVerifyResult gpu_verify(const void* device_result,
    hip_check_error(hipMalloc(&result_dev, sizeof(GpuVerifyDeviceResult)));

    // Initialize result struct
-    GpuVerifyDeviceResult result_host;
-    result_host.error_count = 0;    // No errors yet
-    result_host.max_error   = 0.0f; // No error observed
-    result_host.all_zero    = 1;    // Start assuming all zeros (will be cleared if nonzero found)
+    auto result_host = GpuVerifyDeviceResult::identity();
    hip_check_error(
        hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice));

-    // Launch kernel with grid-stride loop
-    // Use 65535 as max grid size (hardware limit for grid dimension in x)
-    // Grid-stride loop handles any tensor size regardless of grid dimensions
+    // Launch persistent kernel.
+    // automatically derive the optimal grid size from the kernel's occupancy and the
+    // number of multiprocessors.
    constexpr int block_size = 256;
-    int grid_size            = std::min<int>(65535, (size + block_size - 1) / block_size);
+    const auto kernel        = gpu_verify_kernel<block_size, T, IteratorA, IteratorB>;

-    gpu_verify_kernel<T>
-        <<<grid_size, block_size, 0, stream>>>(static_cast<const T*>(device_result),
-                                               static_cast<const T*>(reference_result),
-                                               rtol,
-                                               atol,
-                                               static_cast<long long>(size),
-                                               result_dev);
-
-    hip_check_error(hipGetLastError());
-
-    // Synchronize the stream to ensure kernel completion before reading results
-    hip_check_error(hipStreamSynchronize(stream));
+    launch_persistent_kernel(kernel,
+                             block_size,
+                             stream,
+                             device_result,
+                             reference_result,
+                             rtol,
+                             atol,
+                             static_cast<long long>(size),
+                             result_dev);

    // Get result
    hip_check_error(
@@ -276,23 +392,25 @@ GpuVerifyResult gpu_verify(const void* device_result,
    result.error_count = result_host.error_count;
    result.max_error   = result_host.max_error;
    result.total       = size;
-    result.all_zero    = (result_host.all_zero == 1);
+    result.all_zero    = result_host.all_zero == 1;

    return result;
 }

 // Forward declaration of gpu_reduce_max
-template <typename T>
-float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t stream = nullptr);
+template <typename T, typename Iterator>
+float gpu_reduce_max(Iterator device_buffer, std::size_t size, hipStream_t stream = nullptr);

 // Host-side wrapper for GPU verification with automatic tolerance computation
 // Computes max value on GPU, then computes tolerances and verifies
 // Returns GpuVerifyResult with detailed error information
 template <typename OutDataType,
          typename ComputeDataType = OutDataType,
-          typename AccDataType     = ComputeDataType>
-GpuVerifyResult gpu_verify(const void* device_result,
-                           const void* reference_result,
+          typename AccDataType     = ComputeDataType,
+          typename IteratorA,
+          typename IteratorB>
+GpuVerifyResult gpu_verify(IteratorA device_result,
+                           IteratorB reference_result,
                           int number_of_accumulations,
                           std::size_t size,
                           hipStream_t stream = nullptr)
@@ -323,23 +441,26 @@ GpuVerifyResult gpu_verify(const void* device_result,
                max_abs_value, number_of_accumulations));
    }

+    if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+    {
+        std::cout << "verify: accumulations=" << number_of_accumulations << " rtol = " << rtol
+                  << " atol=" << atol << std::endl;
+    }
+
    // Call the explicit tolerance version
    return gpu_verify<OutDataType>(device_result, reference_result, rtol, atol, size, stream);
 }

 // GPU reduction kernel for computing max(abs(data))
 // This is an internal kernel called only by gpu_reduce_max() wrapper.
-//
-// Assumption: Block size is 256
-template <typename T>
-__global__ void
-gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restrict__ max_val)
+template <int BlockSize, typename T, typename Iterator>
+__global__ __launch_bounds__((BlockSize)) //
+    void gpu_reduce_max_kernel(Iterator it, long long size, float* __restrict__ max_val)
 {
-    constexpr int block_size = 256;
-    __shared__ float shared_max[block_size];
+    auto data = make_concrete_iterator<T>(it);

-    long long idx    = blockIdx.x * blockDim.x + threadIdx.x;
-    long long stride = blockDim.x * gridDim.x;
+    long long idx    = blockIdx.x * BlockSize + threadIdx.x;
+    long long stride = BlockSize * gridDim.x;

    float local_max = 0.0f;

@@ -349,37 +470,18 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr
        local_max = fmaxf(local_max, val);
    }

-    shared_max[threadIdx.x] = local_max;
-    __syncthreads();
+    const auto block_max = block_reduce<BlockSize>(
+        local_max, [](const auto& a, const auto& b) { return std::max(a, b); });

-    // Block-level reduction: 256 -> 128 -> 64 -> 32
-    for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
-    {
-        if(threadIdx.x < s)
-        {
-            shared_max[threadIdx.x] = fmaxf(shared_max[threadIdx.x], shared_max[threadIdx.x + s]);
-        }
-        __syncthreads();
-    }
-
-    // Final reduction of remaining 32 elements in thread 0
    if(threadIdx.x == 0)
-    {
-        for(int i = 1; i < 32; ++i)
-        {
-            shared_max[0] = fmaxf(shared_max[0], shared_max[i]);
-        }
-
-        // Single atomic update per block
-        atomicMax(max_val, shared_max[0]);
-    }
+        atomicMax(max_val, block_max);
 }

 // Host-side wrapper for GPU max reduction
 // Computes max(abs(data)) and returns as float
 // Only transfers 4 bytes (the final max value) instead of entire tensor
-template <typename T>
-float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t stream)
+template <typename T, typename Iterator>
+float gpu_reduce_max(Iterator device_buffer, std::size_t size, hipStream_t stream)
 {
    if(size == 0)
    {
@@ -394,22 +496,14 @@ float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t st
    float init_val = 0.0f;
    hip_check_error(hipMemcpy(max_dev, &init_val, sizeof(float), hipMemcpyHostToDevice));

-    // Launch reduction kernel
-    // Use 1024 blocks max for reduction to balance occupancy vs. grid-stride iterations
-    // For very large tensors (>256M elements), grid-stride loop handles the remainder
+    // Launch persistent kernel.
+    // automatically derive the optimal grid size from the kernel's occupancy and the
+    // number of multiprocessors.
    constexpr int block_size = 256;
-    int grid_size            = std::min<int>(1024, (size + block_size - 1) / block_size);
+    const auto kernel        = gpu_reduce_max_kernel<block_size, T, Iterator>;

-    gpu_reduce_max_kernel<T><<<grid_size, block_size, 0, stream>>>(
-        static_cast<const T*>(device_buffer), static_cast<long long>(size), max_dev);
-
-    hip_check_error(hipGetLastError());
-
-    // Synchronize if using default stream
-    if(stream == nullptr)
-    {
-        hip_check_error(hipDeviceSynchronize());
-    }
+    launch_persistent_kernel(
+        kernel, block_size, stream, device_buffer, static_cast<long long>(size), max_dev);

    // Copy result to host (only 4 bytes!)
    float max_host;