composable_kernel/experimental/builder/include/ck_tile/builder/testing/validation.hpp

// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once

#include "ck_tile/builder/testing/error.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/library/utility/gpu_verification.hpp"
#include <string_view>
#include <vector>
#include <algorithm>
#include <functional>
#include <bit>
#include <limits>

/// This file implements functionality related to "validation", i.e., functionality
/// to compare tensors. The functionality in this file should be testing-framework
/// agnostic, and it should NOT generate any error messages by itself. Instead,
/// all relevant information should be stored in the `ValidationReport` structure.
/// This structure should then be used to generate error messages, explanations,
/// etc., by the actual testing framework that the user has chosen.

namespace ck_tile::builder::test {

/// @brief Information about how a set of comparisons failed or succeeded.
///
/// This structure represents a "report" generated by comparing sets of tensors.
/// It is intended to be used as the result of `ckt::validate()`, where `check()`
/// is invoked for each of the output tensors of a particular device operation.
/// The test should be considered successful if _all_ of those checks pass,
/// which can be inspected by asserting that `get_errors().size()` is 0.
struct ValidationReport
{
    /// @brief Information related to a single tensor comparison.
    ///
    /// This structure holds the information about the result of comparing
    /// two particular tensors. It is an aggregate: the scalar members carry
    /// default initializers so that a default-initialized `Case` never holds
    /// indeterminate values, while aggregate / designated initialization keeps
    /// working exactly as before.
    struct Case
    {
        /// The name of the tensor that was compared here, stored here for convenience
        /// so that reporting any errors is easier.
        std::string tensor_name;

        /// The number of elements which were different between the two compared tensors.
        uint64_t wrong_elements = 0;

        /// The total number of elements in each tensor.
        uint64_t total_elements = 0;

        /// Set to true if both tensors have all their elements be 0.
        bool both_all_zero = false;

        /// The maximum error found by the comparison. The exact definition of "error"
        /// is that of the verification routine which produced this value.
        double max_error = 0.0;

        /// @brief Check whether both the output and reference tensor were both all zeros.
        ///
        /// If both tensors are all zero, it indicates either an incorrect testing setup
        /// or an issue with the testing framework. For that reason we also consider that
        /// a failure.
        ///
        /// @returns The value of `both_all_zero`.
        bool is_all_zero() const { return both_all_zero; }

        /// @brief Return whether the check associated to this case was successful.
        ///
        /// This function returns whether the check associated to this case was successful,
        /// which is directly derived from checking whether the number of incorrect elements
        /// was 0 AND whether the tensor was not all zero.
        ///
        /// @returns `true` only if `wrong_elements` is 0 and `is_all_zero()` is false.
        bool is_ok() const { return wrong_elements == 0 && !is_all_zero(); }
    };

    /// @brief Get comparison cases which were incorrect.
    ///
    /// This function returns a vector of comparison cases that did not succeed, i.e., for
    /// which `Case::is_ok` returns false. In order to check whether validation passed, it
    /// is sufficient to assert that this function returns no cases.
    ///
    /// @returns Copies of all failed cases, in the order in which they were recorded.
    std::vector<Case> get_errors() const
    {
        std::vector<Case> errors;
        // A plain loop keeps this function free of any <iterator> dependency.
        for(const auto& report : reports_)
        {
            if(!report.is_ok())
            {
                errors.push_back(report);
            }
        }
        return errors;
    }

    /// @brief Compare two tensors and record the results in the report.
    ///
    /// This is one of the main functions used to compare two tensors. The results of this
    /// comparison, including any supplemental information, are recorded into the report.
    ///
    /// @returns `false` if the comparison failed. If so, the details can be found via
    /// `get_errors()`.
    ///
    /// @tparam DT The data type of the tensors to check.
    /// @tparam RANK The rank (number of spatial dimensions) of the tensor to check.
    ///
    /// @param tensor_name The name of the tensors to check. This should be a value by which
    /// whoever is debugging the associated test later can easily find out which of the
    /// outputs of a device operation was incorrect.
    /// @param descriptor The descriptor (memory layout) of the tensor.
    /// @param actual The device buffer with the values of the tensor to-be-tested, i.e., the
    /// results of the device operation.
    /// @param expected The device buffer with the values of the reference tensor. These are
    /// treated as a "golden standard", and should usually be generated by a reference
    /// implementation.
    /// @param rtol The relative acceptable tolerance between two values.
    /// @param atol The absolute acceptable tolerance between two values.
    template <DataType DT, size_t RANK>
    bool check(std::string_view tensor_name,
               const TensorDescriptor<DT, RANK>& descriptor,
               const void* actual,
               const void* expected,
               float rtol = 1e-3f,
               float atol = 1e-3f);

    /// @brief Compare two tensors and record the results in the report, with automatic
    /// computation of tolerances.
    ///
    /// This variant computes the tolerances automatically based on the compute
    /// (accumulation) type, and the number of accumulations required per result value.
    /// This is one of the main functions used to compare two tensors. The results of this
    /// comparison, including any supplemental information, are recorded into the report.
    ///
    /// @returns `false` if the comparison failed. If so, the details can be found via
    /// `get_errors()`.
    ///
    /// @tparam OutDataType The data type of the tensors to check. This is the type of the
    /// values in tensor memory.
    /// @tparam ComputeType The data type that tensor operations are computed with internally.
    /// @tparam AccType The data type that tensor values are accumulated with internally.
    /// @tparam RANK The rank (number of spatial dimensions) of the tensor to check.
    ///
    /// @param tensor_name The name of the tensors to check. This should be a value by which
    /// whoever is debugging the associated test later can easily find out which of the
    /// outputs of a device operation was incorrect.
    /// @param descriptor The descriptor (memory layout) of the tensor.
    /// @param actual The device buffer with the values of the tensor to-be-tested, i.e., the
    /// results of the device operation.
    /// @param expected The device buffer with the values of the reference tensor. These are
    /// treated as a "golden standard", and should usually be generated by a reference
    /// implementation.
    /// @param number_of_accumulations The maximum number of accumulations required to compute
    /// a value of the result tensor.
    template <DataType OutDataType,
              DataType ComputeType = OutDataType,
              DataType AccType     = ComputeType,
              size_t RANK>
    bool check_by_accumulations(std::string_view tensor_name,
                                const TensorDescriptor<OutDataType, RANK>& descriptor,
                                const void* actual,
                                const void* expected,
                                const size_t number_of_accumulations);

    private:
    /// Every comparison recorded so far (successful or not), in insertion order.
    std::vector<Case> reports_;
};

template <DataType DT, size_t RANK>
bool ValidationReport::check(std::string_view tensor_name,
                             const TensorDescriptor<DT, RANK>& descriptor,
                             const void* actual,
                             const void* expected,
                             float rtol,
                             float atol)
{
    using CKType = detail::cpp_type_t<DT>;

    const auto a_it  = FlatTensorIterator(descriptor, static_cast<const CKType*>(actual));
    const auto e_it  = FlatTensorIterator(descriptor, static_cast<const CKType*>(expected));
    const auto numel = a_it.numel();

    const auto result = ck::profiler::gpu_verify<CKType>(a_it, e_it, rtol, atol, numel);

    // TODO: Gather detailed coordinates.

    reports_.push_back(Case{
        .tensor_name    = std::string(tensor_name),
        .wrong_elements = result.error_count,
        .total_elements = descriptor.get_element_size(),
        .both_all_zero  = result.all_zero,
        .max_error      = result.max_error,
    });

    return reports_.back().is_ok();
}

template <DataType OutDataType, DataType ComputeType, DataType AccType, size_t RANK>
bool ValidationReport::check_by_accumulations(std::string_view tensor_name,
                                              const TensorDescriptor<OutDataType, RANK>& descriptor,
                                              const void* actual,
                                              const void* expected,
                                              const size_t number_of_accumulations)
{
    using CKComputeType = detail::cpp_type_t<ComputeType>;
    using CKAccType     = detail::cpp_type_t<AccType>;
    using CKOutDataType = detail::cpp_type_t<OutDataType>;

    const auto a_it  = FlatTensorIterator(descriptor, static_cast<const CKOutDataType*>(actual));
    const auto e_it  = FlatTensorIterator(descriptor, static_cast<const CKOutDataType*>(expected));
    const auto numel = a_it.numel();

    const auto result = ck::profiler::gpu_verify<CKOutDataType, CKComputeType, CKAccType>(
        a_it, e_it, static_cast<int>(number_of_accumulations), numel);

    // TODO: Gather detailed coordinates.

    reports_.push_back(Case{
        .tensor_name    = std::string(tensor_name),
        .wrong_elements = result.error_count,
        .total_elements = descriptor.get_element_size(),
        .both_all_zero  = result.all_zero,
        .max_error      = result.max_error,
    });

    return reports_.back().is_ok();
}

} // namespace ck_tile::builder::test