mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-01 20:21:23 +00:00
* ck-builder: tensor copy function This function copies one tensor to another, so that the memory layout can be changed between them. * ck-builder: fix ck::bhalf literals These types don't work properly. * ck-builder: abstract compare_elements in gpu_verification.hpp and make builder use it This reduces the amount of duplicated code a bit. * ck-builder: add flat tensor iterator This "iterator" type pretends to be a pointer, useful for passing tensors to functions expecting pointer-like types. * ck-builder: integrate validation with ck gpu verification By templating the gpu_verify function over iterators, we can use the new FlatTensorIterator to adapt the function to multi- dimensional tensors without changing either implementation too much. * ck-builder: add check_by_accumulations This changes the gpu_verification.hpp code to also accept "iterator" types for the relevant gpu_verify and gpu_reduce_max functions. * ck: fix test_gpu_verification GenerateRandomData for bhalf is_integer_it<bhalf_t> yields true, but it is not actually an integer. * ck: make gpu_verification kernels be proper persistent kernels Previously these were using a hardcoded value for the grid size. This commit changes that so that the grid size is automatically derived from the kernel's occupancy and the number of multiprocessors on the GPU. * ck: clean up gpu_verification.hpp using block_reduce This implements a small generic block reduce function, and rewrites the rest of gpu_verification.hpp using that function to clean it up a bit. * ck-builder: doc typos * ck-builder: update testing readme with validation interface. * ck-builder: rebase fixes + review comments * ck-builder: fix device integer generation with float types Passing bfloat here causes a nans due to type_convert performing a bitcast. * ck: another bhalf_t bug CK expects that int-generation with ck::bhalf_t yields bhalf integers, not unsigned integers. 
This makes the logic of FillUniformRandInteger compatible with GeneratorTensor_2<InDataType>, however idiotic that may be.
220 lines
9.9 KiB
C++
220 lines
9.9 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
|
|
#include "ck_tile/builder/testing/error.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/library/utility/gpu_verification.hpp"

#include <algorithm>
#include <bit>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <string>
#include <string_view>
#include <vector>
|
|
|
|
/// This file implements functionality related to "validation", i.e., functionality
/// to compare tensors. The functionality in this file should be testing-framework
/// agnostic, and it should NOT generate any error messages by itself. Instead,
/// all relevant information should be stored in the `ValidationReport` structure.
/// This structure should then be used to generate error messages, explanations,
/// etc., by the actual testing framework that the user has chosen.
|
|
|
|
namespace ck_tile::builder::test {
|
|
|
|
/// @brief Information about how a set of comparisons failed or succeeded.
|
|
///
|
|
/// This structure represents a "report" generated by comparing sets of tensors.
|
|
/// Its intended to be used as the result of `ckt::validate()`, where `check()`
|
|
/// is invoked for each of the output tensors of a particular device operation.
|
|
/// The test should be considered successful if _all_ of those checks passes,
|
|
/// which can inspected by asserting that `get_errors().size()` is 0.
|
|
struct ValidationReport
|
|
{
|
|
/// @brief Information related to a single tensor comparison.
|
|
///
|
|
/// This structure holds the information about the result of comparing
|
|
/// two particular tensors.
|
|
struct Case
|
|
{
|
|
/// The name of the tensor that was compared here, stored here for convenience
|
|
/// so that reporting any errors is easier.
|
|
std::string tensor_name;
|
|
|
|
/// The number of elements which were different between the two compared tensors.
|
|
uint64_t wrong_elements;
|
|
|
|
/// The total number of elements in each tensor.
|
|
uint64_t total_elements;
|
|
|
|
/// Set to true if both tensors have all their elements be 0.
|
|
bool both_all_zero;
|
|
|
|
// Max error.
|
|
double max_error;
|
|
|
|
/// @brief Check whether both the output and reference tensor were both all zeros.
|
|
///
|
|
/// If both tensors are all zero, it indicates either an incorrect testing setup
|
|
/// or an issue with the testing framework. For that reason we also consider that
|
|
/// a failure.
|
|
bool is_all_zero() const { return both_all_zero; }
|
|
|
|
/// @brief Return whether the check associated to this case was successful.
|
|
///
|
|
/// This function returns whether the check associated to this case was successful,
|
|
/// which is directly derived from checking whether the number of incorrect elements
|
|
/// was 0 AND whether the tensor was not all zero.
|
|
bool is_ok() const { return wrong_elements == 0 && !is_all_zero(); }
|
|
};
|
|
|
|
/// @brief Get comparison cases which were incorrect.
|
|
///
|
|
/// This function returns a vector of comparison cases that did not succeed, ie, for
|
|
/// which `Case::is_ok` return false. In order to check whether validation passed, it
|
|
/// is sufficient to assert that this function returns no cases.
|
|
std::vector<Case> get_errors() const
|
|
{
|
|
std::vector<Case> errors;
|
|
std::copy_if(reports_.begin(),
|
|
reports_.end(),
|
|
std::back_inserter(errors),
|
|
[](const auto& report) { return !report.is_ok(); });
|
|
return errors;
|
|
}
|
|
|
|
/// @brief Compare two tensors and record the results in the report.
|
|
///
|
|
/// This is one of the main function used to compare two tensors. The results of this
|
|
/// comparison, including any supplemental information, is recorded into the report.
|
|
///
|
|
/// @returns `false` if the comparison failed. If so, the details can be found via
|
|
/// `get_errors()`.
|
|
///
|
|
/// @tparam DT The data type of the tensors to check.
|
|
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to check.
|
|
///
|
|
/// @param tensor_name The name of the tensors to check. This should be a value by which
|
|
/// whoever is debugging the associated test later can easily find out which of the
|
|
/// outputs of a device operation was incorrect.
|
|
/// @param descriptor The descriptor (memory layout) of the tensor.
|
|
/// @param actual The device buffer with the values of the tensor to-be-tested, ie, the
|
|
/// results of the device operation.
|
|
/// @param expected The device buffer with the values of the reference tensor. These are
|
|
/// treated as a "golden standard", and should usually be generated by a reference
|
|
/// implementation.
|
|
/// @param rtol The relative acceptable tolerance between two values.
|
|
/// @param atol The absolute acceptable tolerance between two values.
|
|
template <DataType DT, size_t RANK>
|
|
bool check(std::string_view tensor_name,
|
|
const TensorDescriptor<DT, RANK>& descriptor,
|
|
const void* actual,
|
|
const void* expected,
|
|
float rtol = 1e-3f,
|
|
float atol = 1e-3f);
|
|
|
|
/// @brief Compare two tensors and record the results in the report, with automatic
|
|
/// computation of tolerances.
|
|
///
|
|
/// This variant computes the tolerances automatically based on the compute
|
|
/// (accumulation) type, and the number of accumulations required per result value.
|
|
/// This is one of the main function used to compare two tensors. The results of this
|
|
/// comparison, including any supplemental information, is recorded into the report.
|
|
/// @returns `false` if the comparison failed. If so, the details can be found via
|
|
/// `get_errors()`.
|
|
///
|
|
/// @tparam OutDataType The data type of the tensors to check. This is the type of the
|
|
/// values in tensor memory.
|
|
/// @tparam ComputeType The data type that tensor operations are computed with internally.
|
|
/// @tparam AccType The data type that tensor values are accumulated with internally.
|
|
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to check.
|
|
///
|
|
/// @param tensor_name The name of the tensors to check. This should be a value by which
|
|
/// whoever is debugging the associated test later can easily find out which of the
|
|
/// outputs of a device operation was incorrect.
|
|
/// @param descriptor The descriptor (memory layout) of the tensor.
|
|
/// @param actual The device buffer with the values of the tensor to-be-tested, ie, the
|
|
/// results of the device operation.
|
|
/// @param expected The device buffer with the values of the reference tensor. These are
|
|
/// treated as a "golden standard", and should usually be generated by a reference
|
|
/// implementation.
|
|
/// @param number_of_accumulations The maximum number of accumulations required to compute
|
|
/// a value of the result tensor.
|
|
template <DataType OutDataType,
|
|
DataType ComputeType = OutDataType,
|
|
DataType AccType = ComputeType,
|
|
size_t RANK>
|
|
bool check_by_accumulations(std::string_view tensor_name,
|
|
const TensorDescriptor<OutDataType, RANK>& descriptor,
|
|
const void* actual,
|
|
const void* expected,
|
|
const size_t number_of_accumulations);
|
|
|
|
private:
|
|
std::vector<Case> reports_;
|
|
};
|
|
|
|
template <DataType DT, size_t RANK>
|
|
bool ValidationReport::check(std::string_view tensor_name,
|
|
const TensorDescriptor<DT, RANK>& descriptor,
|
|
const void* actual,
|
|
const void* expected,
|
|
float rtol,
|
|
float atol)
|
|
{
|
|
using CKType = detail::cpp_type_t<DT>;
|
|
|
|
const auto a_it = FlatTensorIterator(descriptor, static_cast<const CKType*>(actual));
|
|
const auto e_it = FlatTensorIterator(descriptor, static_cast<const CKType*>(expected));
|
|
const auto numel = a_it.numel();
|
|
|
|
const auto result = ck::profiler::gpu_verify<CKType>(a_it, e_it, rtol, atol, numel);
|
|
|
|
// TODO: Gather detailed coordinates.
|
|
|
|
reports_.push_back(Case{
|
|
.tensor_name = std::string(tensor_name),
|
|
.wrong_elements = result.error_count,
|
|
.total_elements = descriptor.get_element_size(),
|
|
.both_all_zero = result.all_zero,
|
|
.max_error = result.max_error,
|
|
});
|
|
|
|
return reports_.back().is_ok();
|
|
}
|
|
|
|
template <DataType OutDataType, DataType ComputeType, DataType AccType, size_t RANK>
|
|
bool ValidationReport::check_by_accumulations(std::string_view tensor_name,
|
|
const TensorDescriptor<OutDataType, RANK>& descriptor,
|
|
const void* actual,
|
|
const void* expected,
|
|
const size_t number_of_accumulations)
|
|
{
|
|
using CKComputeType = detail::cpp_type_t<ComputeType>;
|
|
using CKAccType = detail::cpp_type_t<AccType>;
|
|
using CKOutDataType = detail::cpp_type_t<OutDataType>;
|
|
|
|
const auto a_it = FlatTensorIterator(descriptor, static_cast<const CKOutDataType*>(actual));
|
|
const auto e_it = FlatTensorIterator(descriptor, static_cast<const CKOutDataType*>(expected));
|
|
const auto numel = a_it.numel();
|
|
|
|
const auto result = ck::profiler::gpu_verify<CKOutDataType, CKComputeType, CKAccType>(
|
|
a_it, e_it, static_cast<int>(number_of_accumulations), numel);
|
|
|
|
// TODO: Gather detailed coordinates.
|
|
|
|
reports_.push_back(Case{
|
|
.tensor_name = std::string(tensor_name),
|
|
.wrong_elements = result.error_count,
|
|
.total_elements = descriptor.get_element_size(),
|
|
.both_all_zero = result.all_zero,
|
|
.max_error = result.max_error,
|
|
});
|
|
|
|
return reports_.back().is_ok();
|
|
}
|
|
|
|
} // namespace ck_tile::builder::test
|