mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-15 18:42:06 +00:00
Merge commit 'e6e7dc29101bcd8a5d30ae99adf71a09fa544b09' into develop
This commit is contained in:
@@ -47,6 +47,11 @@ struct DataTypeToCK<DataType::FP8>
|
||||
{
|
||||
using type = ck::f8_t;
|
||||
};
|
||||
template <>
|
||||
struct DataTypeToCK<DataType::U8>
|
||||
{
|
||||
using type = uint8_t;
|
||||
};
|
||||
|
||||
struct CK_empty_tuple
|
||||
{
|
||||
|
||||
@@ -7,11 +7,14 @@
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
|
||||
#include "ck_tile/builder/testing/testing.hpp"
|
||||
#include "ck_tile/builder/testing/extent.hpp"
|
||||
#include "ck_tile/builder/testing/filter_extent.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_initialization.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/validation.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
/// This file implements common functionality for invoking/testing grouped
|
||||
/// forward convolutions created through the CK Builder API. The main item
|
||||
/// of it is the ConvArgs structure - which contains a complete description
|
||||
@@ -37,12 +40,12 @@ namespace ck_tile::builder::test {
|
||||
template <int SPATIAL_DIM>
|
||||
struct ConvTensorLengths
|
||||
{
|
||||
size_t batch_size = 1; // N
|
||||
size_t groups = 1; // G
|
||||
size_t input_channels = 1; // C
|
||||
size_t output_channels = 1; // K
|
||||
Extent<SPATIAL_DIM> image = {}; // W, H, D
|
||||
Extent<SPATIAL_DIM> filter = {}; // X, Y, Z
|
||||
size_t batch_size = 1; // N
|
||||
size_t groups = 1; // G
|
||||
size_t input_channels = 1; // C
|
||||
size_t output_channels = 1; // K
|
||||
FilterExtent<SPATIAL_DIM> image = {}; // W, H, D
|
||||
FilterExtent<SPATIAL_DIM> filter = {}; // X, Y, Z
|
||||
};
|
||||
|
||||
/// @brief `Args` specialization for forward convolution.
|
||||
@@ -59,6 +62,14 @@ struct Args<SIGNATURE>
|
||||
constexpr static auto WEIGHT_TYPE = SIGNATURE.data_type;
|
||||
constexpr static auto OUTPUT_TYPE = SIGNATURE.data_type;
|
||||
|
||||
constexpr static int INPUT_RANK = 3 + SPATIAL_DIM;
|
||||
constexpr static int WEIGHT_RANK = 3 + SPATIAL_DIM;
|
||||
constexpr static int OUTPUT_RANK = 3 + SPATIAL_DIM;
|
||||
|
||||
using InputDescriptor = TensorDescriptor<INPUT_TYPE, INPUT_RANK>;
|
||||
using WeightDescriptor = TensorDescriptor<WEIGHT_TYPE, WEIGHT_RANK>;
|
||||
using OutputDescriptor = TensorDescriptor<OUTPUT_TYPE, OUTPUT_RANK>;
|
||||
|
||||
// TODO: We shouldn't need to call into an internal namespace here.
|
||||
using Ops = factory::internal::ElementwiseOps<SIGNATURE>;
|
||||
|
||||
@@ -73,10 +84,10 @@ struct Args<SIGNATURE>
|
||||
// implementation (based on ConvParam in old CK / CK Tile) does not
|
||||
// support strides at all.
|
||||
|
||||
Extent<SPATIAL_DIM> filter_strides;
|
||||
Extent<SPATIAL_DIM> filter_dilation;
|
||||
Extent<SPATIAL_DIM> input_left_pad;
|
||||
Extent<SPATIAL_DIM> input_right_pad;
|
||||
FilterExtent<SPATIAL_DIM> filter_strides;
|
||||
FilterExtent<SPATIAL_DIM> filter_dilation;
|
||||
FilterExtent<SPATIAL_DIM> input_left_pad;
|
||||
FilterExtent<SPATIAL_DIM> input_right_pad;
|
||||
|
||||
Ops::AElementwiseOp a_elementwise_op;
|
||||
Ops::BElementwiseOp b_elementwise_op;
|
||||
@@ -85,7 +96,7 @@ struct Args<SIGNATURE>
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
/// the input-tensor of the convolution problem. This can then
|
||||
/// be used to, for example, allocate memory.
|
||||
TensorDescriptor<INPUT_TYPE> make_input_descriptor() const
|
||||
InputDescriptor make_input_descriptor() const
|
||||
{
|
||||
// TODO: We're using old CK functionality to compute the right
|
||||
// values here, mainly because CK tile does not support the
|
||||
@@ -96,31 +107,37 @@ struct Args<SIGNATURE>
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
|
||||
typename Layouts::ALayout>(param);
|
||||
return TensorDescriptor<INPUT_TYPE>(desc.GetLengths(), desc.GetStrides());
|
||||
using Extent = typename InputDescriptor::Extent;
|
||||
return InputDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
}
|
||||
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
/// the weight-tensor of the convolution problem. This can then
|
||||
/// be used to, for example, allocate memory.
|
||||
TensorDescriptor<WEIGHT_TYPE> make_weight_descriptor() const
|
||||
WeightDescriptor make_weight_descriptor() const
|
||||
{
|
||||
// See note in implementation of `make_input_descriptor`.
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
|
||||
typename Layouts::BLayout>(param);
|
||||
return TensorDescriptor<WEIGHT_TYPE>(desc.GetLengths(), desc.GetStrides());
|
||||
using Extent = typename WeightDescriptor::Extent;
|
||||
return WeightDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
}
|
||||
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
/// the output-tensor of the convolution problem. This can then
|
||||
/// be used to, for example, allocate memory.
|
||||
TensorDescriptor<OUTPUT_TYPE> make_output_descriptor() const
|
||||
OutputDescriptor make_output_descriptor() const
|
||||
{
|
||||
// See note in implementation of `make_input_descriptor`.
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
|
||||
typename Layouts::ELayout>(param);
|
||||
return TensorDescriptor<OUTPUT_TYPE>(desc.GetLengths(), desc.GetStrides());
|
||||
using Extent = typename OutputDescriptor::Extent;
|
||||
return OutputDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
}
|
||||
|
||||
/// Convert the Args structure into a CK conv_param structure. This
|
||||
@@ -245,12 +262,11 @@ UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args)
|
||||
///
|
||||
/// @see alloc_inputs()
|
||||
template <auto SIGNATURE>
|
||||
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE> &&
|
||||
ValidUniqueInputs<SIGNATURE>
|
||||
void init_inputs(const Args<SIGNATURE>& args, UniqueInputs<SIGNATURE>& inputs)
|
||||
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
|
||||
void init_inputs(const Args<SIGNATURE>& args, Inputs<SIGNATURE> inputs)
|
||||
{
|
||||
init_tensor_buffer_uniform_fp(inputs.input_buf, args.make_input_descriptor(), -2.0f, 2.0f);
|
||||
init_tensor_buffer_uniform_fp(inputs.weight_buf, args.make_weight_descriptor(), -2.0f, 2.0f);
|
||||
init_tensor_buffer_uniform_fp(inputs.input, args.make_input_descriptor(), -2.0f, 2.0f);
|
||||
init_tensor_buffer_uniform_fp(inputs.weight, args.make_weight_descriptor(), -2.0f, 2.0f);
|
||||
}
|
||||
|
||||
/// @brief `alloc_outputs()` specialization for forward convolution.
|
||||
@@ -268,4 +284,19 @@ UniqueOutputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args)
|
||||
};
|
||||
}
|
||||
|
||||
/// @brief `validate()` specialization for forward convolution.
|
||||
///
|
||||
/// @tparam SIGNATURE Forward convolution signature.
|
||||
///
|
||||
/// @see validate()
|
||||
template <auto SIGNATURE>
|
||||
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
|
||||
ValidationReport
|
||||
validate(const Args<SIGNATURE>& args, Outputs<SIGNATURE> actual, Outputs<SIGNATURE> expected)
|
||||
{
|
||||
ValidationReport report;
|
||||
report.check("output", args.make_output_descriptor(), actual.output, expected.output);
|
||||
return report;
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
150
experimental/builder/include/ck_tile/builder/testing/error.hpp
Normal file
150
experimental/builder/include/ck_tile/builder/testing/error.hpp
Normal file
@@ -0,0 +1,150 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <source_location>
|
||||
#include <stdexcept>
|
||||
#include <sstream>
|
||||
|
||||
/// This file defines some utilities for dealing with HIP errors. In the CK-Builder
|
||||
/// testing code, we'd like to just turn them into exceptions: This cleans up testing
|
||||
/// code as we don't need to think about returning error codes, but its still much
|
||||
/// cleaner than just creating a hard crash and thereby possibly interrupting other
|
||||
/// units in the same test. The testing framework can catch these exceptions where
|
||||
/// necessary.
|
||||
///
|
||||
/// While the exceptions defined in this file are in principle suitable for general
|
||||
/// usage, HIP functions which return HIP error codes (`hipError_t`) should be
|
||||
/// checked using the `check_hip` function.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Generic HIP exception.
|
||||
///
|
||||
/// This is a derivation of `std::runtime_error` which represents a HIP error code.
|
||||
///
|
||||
/// @see std::runtime_error
|
||||
/// @see hipError_t
|
||||
struct HipError : std::runtime_error
|
||||
{
|
||||
/// @brief Utility for formatting HIP error messages
|
||||
///
|
||||
/// Returns a human-readable description of a HIP error. Given a description of the
|
||||
/// activity that the user tried to perform, this function appends the HIP-specific
|
||||
/// information such as the stringified version of the error code, and the error
|
||||
/// code itself (for reference).
|
||||
///
|
||||
/// @param user_msg User-given message about the activity at time of error.
|
||||
/// @param code The status to report.
|
||||
/// @param src The location where this error was discovered.
|
||||
static std::string
|
||||
format_error(std::string_view user_msg, hipError_t code, std::source_location src)
|
||||
{
|
||||
std::stringstream msg;
|
||||
msg << user_msg << ": " << hipGetErrorString(code) << " (" << code << ")";
|
||||
if(src.function_name())
|
||||
msg << " in function '" << src.function_name();
|
||||
msg << "' at " << src.file_name() << ":" << src.line() << ":" << src.column();
|
||||
return msg.str();
|
||||
}
|
||||
|
||||
/// @brief Construct a generic HIP error.
|
||||
///
|
||||
/// @param msg User-given message about the activity at time of error.
|
||||
/// @param code The status to report.
|
||||
/// @param src The location where this error was discovered. Defaults to the caller's
|
||||
/// location.
|
||||
HipError(std::string_view msg,
|
||||
hipError_t code,
|
||||
std::source_location src = std::source_location::current())
|
||||
: std::runtime_error(format_error(msg, code, src)), code_(code)
|
||||
{
|
||||
}
|
||||
|
||||
/// @brief Retrieve the inner error code.
|
||||
///
|
||||
/// This function returns the status code that was encountered while checking an
|
||||
/// operation for errors.
|
||||
hipError_t code() const { return code_; }
|
||||
|
||||
private:
|
||||
hipError_t code_;
|
||||
};
|
||||
|
||||
/// @brief HIP out of memory error.
|
||||
///
|
||||
/// This a derivation of `HipError` which is specialized for Out-of-memory errors. This
|
||||
/// makes it easier to attach additional context, and to match on these errors while
|
||||
/// using `catch` blocks.
|
||||
///
|
||||
/// @see HipError
|
||||
struct OutOfDeviceMemoryError : HipError
|
||||
{
|
||||
/// @brief Construct an out-of-device-memory error.
|
||||
///
|
||||
/// @param msg User-given message about the activity at time of error.
|
||||
/// @param src The location where this error was discovered. Defaults to the caller's
|
||||
/// location.
|
||||
OutOfDeviceMemoryError(std::string_view msg = "failed to allocate device memory",
|
||||
std::source_location src = std::source_location::current())
|
||||
: HipError(msg, hipErrorOutOfMemory, src)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief Check HIP status for errors.
|
||||
///
|
||||
/// This function checks a HIP status code (obtained from a HIP function call) for any
|
||||
/// errors. If the status `code` is not `hipSuccess`, this function throws an instance of
|
||||
/// `HipError`. The exact type thats thrown depends on the status. If `code` represents
|
||||
/// an out-of-memory error `hipErrorOutOfMemory`, then `OutOfDeviceMemoryError` will be
|
||||
/// thrown instead.
|
||||
///
|
||||
/// @param msg User-given message about the activity at possible time of error.
|
||||
/// @param code The HIP status code to examine.
|
||||
/// @param src The location where this status was set. Defaults to the caller's location.
|
||||
///
|
||||
/// @throws HipError if `code` is not `hipSuccess`.
|
||||
///
|
||||
/// @see HipError
|
||||
/// @see OutOfDeviceMemoryError
|
||||
inline void check_hip(std::string_view msg,
|
||||
hipError_t code,
|
||||
std::source_location src = std::source_location::current())
|
||||
{
|
||||
// -Wswitch-enum throws a warning if this code is changed into a switch, even with
|
||||
// the `default` label...
|
||||
|
||||
if(code == hipSuccess)
|
||||
// When you beat the error allegations
|
||||
return;
|
||||
else if(code == hipErrorOutOfMemory)
|
||||
throw OutOfDeviceMemoryError(msg, src);
|
||||
else
|
||||
throw HipError(msg, code, src);
|
||||
}
|
||||
|
||||
/// @brief Check HIP status for errors.
|
||||
///
|
||||
/// This function is similar to `check_hip(std::string_view, hipError_t)`, except that a
|
||||
/// default message is given.
|
||||
///
|
||||
/// @param code The HIP status code to examine.
|
||||
/// @param src The location where this status was set. Defaults to the caller's location.
|
||||
///
|
||||
/// @throws HipError if `code` is not `hipSuccess`.
|
||||
///
|
||||
/// @see HipError
|
||||
/// @see OutOfDeviceMemoryError
|
||||
/// @see check_hip(std::string_view, hipError_t)
|
||||
inline void check_hip(hipError_t code, std::source_location src = std::source_location::current())
|
||||
{
|
||||
check_hip(code == hipErrorOutOfMemory ? "failed to allocate device memory"
|
||||
: "HIP runtime error",
|
||||
code,
|
||||
src);
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -5,28 +5,29 @@
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// This structure describes a 1-, 2-, or 3-D extent. Its used to
|
||||
/// communicate 1-, 2- or 3-D sizes and strides of tensors.
|
||||
/// Depending on the dimension, the structure will have the `width`,
|
||||
/// `height`, and `depth` fields available.
|
||||
/// This structure describes a 1-, 2-, or 3-D extent for convolution
|
||||
/// filters. Its used to communicate 1-, 2- or 3-D sizes and strides
|
||||
/// of tensors, specifically for convolution filters. Depending on the
|
||||
/// dimension, the structure will have the `width`, `height`, and
|
||||
/// `depth` fields available.
|
||||
template <int SPATIAL_DIM>
|
||||
struct Extent;
|
||||
struct FilterExtent;
|
||||
|
||||
template <>
|
||||
struct Extent<1>
|
||||
struct FilterExtent<1>
|
||||
{
|
||||
size_t width = 1;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Extent<2>
|
||||
struct FilterExtent<2>
|
||||
{
|
||||
size_t width = 1;
|
||||
size_t height = 1;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Extent<3>
|
||||
struct FilterExtent<3>
|
||||
{
|
||||
size_t width = 1;
|
||||
size_t height = 1;
|
||||
@@ -3,19 +3,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdexcept>
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <span>
|
||||
#include <concepts>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/testing/type_traits.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <sstream>
|
||||
|
||||
/// This file deals with tensor memory allocation: Both the act of allocating
|
||||
/// and (automatically) deallocating memory, as well as utilities for managing
|
||||
/// the layout of tensor data in memory.
|
||||
/// This file deals with tensor memory management and allocation. The main
|
||||
/// item is the `DeviceBuffer`: An owned piece of device memory, which is
|
||||
/// automatically freed when it goes out of scope.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
@@ -39,31 +35,6 @@ struct DeviceMemoryDeleter
|
||||
}
|
||||
};
|
||||
|
||||
/// @brief HIP out of memory error
|
||||
///
|
||||
/// This is a derivation of `std::runtime_error` specialized for HIP
|
||||
/// out-of-memory errors.
|
||||
///
|
||||
/// @see std::runtime_error
|
||||
struct OutOfDeviceMemoryError : std::runtime_error
|
||||
{
|
||||
/// @brief Utility for formatting out-of-memory error messages
|
||||
///
|
||||
/// Returns a human-readable description of a HIP out-of-memory error.
|
||||
///
|
||||
/// @param status The status to report
|
||||
static std::string format_error(hipError_t status)
|
||||
{
|
||||
return std::string("failed to allocate hip memory: ") + hipGetErrorString(status) + " (" +
|
||||
std::to_string(status) + ")";
|
||||
}
|
||||
|
||||
/// @brief Construct an out-of-memory error using `status` as message.
|
||||
///
|
||||
/// @param status A HIP error status that was encountered while allocating memory.
|
||||
OutOfDeviceMemoryError(hipError_t status) : std::runtime_error(format_error(status)) {}
|
||||
};
|
||||
|
||||
/// @brief Automatically managed GPU memory.
|
||||
///
|
||||
/// The `DeviceBuffer` is an automatically managed pointer for GPU memory. When
|
||||
@@ -96,117 +67,18 @@ inline DeviceBuffer alloc_buffer(size_t size)
|
||||
std::byte* d_buf = nullptr;
|
||||
if(const auto status = hipMalloc(&d_buf, size); status != hipSuccess)
|
||||
{
|
||||
throw OutOfDeviceMemoryError(status);
|
||||
// Add some additional context
|
||||
|
||||
size_t free, total;
|
||||
check_hip("failed to get HIP memory info", hipMemGetInfo(&free, &total));
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "failed to allocate device memory (tried to allocate " << size << " bytes with only "
|
||||
<< free << " available)";
|
||||
|
||||
throw OutOfDeviceMemoryError(ss.str());
|
||||
}
|
||||
return DeviceBuffer(d_buf);
|
||||
}
|
||||
|
||||
/// @brief Type managing tensor data layout in memory.
|
||||
///
|
||||
/// This structure describes a tensor in memory. It does not actually hold any
|
||||
/// reference to memory, it just describes how the memory should be laid out if it
|
||||
/// were.
|
||||
///
|
||||
/// @note This type is very much like ck_tile::HostTensorDescriptor, except that it
|
||||
/// also includes the data type of the elements of htis tensor. This is mainly to
|
||||
/// make the descriptor a _complete_ description of a tensor rather than just the
|
||||
/// dimensions in strides, which helps in reducing clutter in uses of this type.
|
||||
///
|
||||
/// @note All strides are still in _elements_.
|
||||
///
|
||||
/// @tparam DT The conceptual data type of the tensor elements. This need not be the
|
||||
/// type that the data is actually stored as in memory.
|
||||
template <DataType DT>
|
||||
struct TensorDescriptor
|
||||
{
|
||||
// For now, the implementation of this type is based on
|
||||
// `ck_tile::HostTensorDescriptor`, so that we can prototype without
|
||||
// reimplementing the `HostTensorDescriptor` for the 3rd time. You can regard
|
||||
// the use of `ck_tile::HostTensorDescriptor` here as an implementation detail.
|
||||
|
||||
/// The conceptual data type of the tensor elements. This need not be the type
|
||||
/// that the data is actually stored as in memory.
|
||||
constexpr static DataType data_type = DT;
|
||||
|
||||
/// @brief Create a tensor descriptor from lengths and strides.
|
||||
///
|
||||
/// @param lengths A sequence of tensor lengths, the conceptial dimensions of
|
||||
/// the tensor in elements.
|
||||
/// @param strides A sequence of in-memory strides of the tensor, measured in
|
||||
/// elements. Each element of `strides`` corresponds to one at the same index
|
||||
/// in `lengths`, the amount of elements to skip in memory to find the next
|
||||
/// element along that axis.
|
||||
TensorDescriptor(std::span<const size_t> lengths, std::span<const size_t> strides)
|
||||
: inner_descriptor_(lengths, strides)
|
||||
{
|
||||
// TODO: Validation of strides? For now we just delegate the details of the
|
||||
// construction to the CK Tile HostTensorDescriptor.
|
||||
}
|
||||
|
||||
/// Query the conceptual dimensions of the tensor.
|
||||
///
|
||||
/// @returns A span of tensor dimensions, one for every axis. Note that the order
|
||||
/// does *not* correspond with memory layout, query the in-memory strides for
|
||||
/// that.
|
||||
///
|
||||
/// @see get_strides()
|
||||
std::span<const size_t> get_lengths() const { return inner_descriptor_.get_lengths(); }
|
||||
|
||||
/// Query the in-memory strides of the tensor.
|
||||
///
|
||||
/// @returns A span of tensor dimensions, one for every axis. Each element
|
||||
/// corresponds directly with the stride in elements at the same index in the
|
||||
/// tensor dimensions.
|
||||
///
|
||||
/// @see get_lengths()
|
||||
std::span<const size_t> get_strides() const { return inner_descriptor_.get_strides(); }
|
||||
|
||||
/// @brief Compute total tensor size in elements.
|
||||
///
|
||||
/// This function returns the total size of the memory backing a tensor with
|
||||
/// this descriptor in *elements*, including required extra size for strides.
|
||||
///
|
||||
/// @see get_element_space_size_in_bytes()
|
||||
size_t get_element_space_size() const { return inner_descriptor_.get_element_space_size(); }
|
||||
|
||||
/// @brief Compute total tensor size in bytes.
|
||||
///
|
||||
/// This function is like `get_element_space_size()`, except that the returned
|
||||
/// value is measured in *bytes* rather than *elements*. Use this function for
|
||||
/// figuring out how much memory needs to be allocated for a particular tensor.
|
||||
///
|
||||
/// @see get_element_space_size()
|
||||
size_t get_element_space_size_in_bytes() const
|
||||
{
|
||||
// For now, the backing type is the naive C++-type that represents the data
|
||||
// type. When we are going to support packed types such as i4 and fp6, this
|
||||
// is going to become more complicated.
|
||||
return get_element_space_size() * data_type_sizeof(DT);
|
||||
}
|
||||
|
||||
private:
|
||||
ck_tile::HostTensorDescriptor inner_descriptor_;
|
||||
};
|
||||
|
||||
/// @brief Allocate automatically managed GPU memory corresponding to a tensor descriptor.
|
||||
///
|
||||
/// This function is similar to `alloc_buffer()`, except that the required size is
|
||||
/// derived automatically from a tensor descriptor. The returned buffer is valid for
|
||||
/// tensors with that layout. Strides are also taken into account when computing the
|
||||
/// required size.
|
||||
///
|
||||
/// @tparam DT The conceptual datatype of the elements of the tensor.
|
||||
/// @param descriptor A descriptor of the memory layout of the tensor to allocate.
|
||||
/// @throws OutOfDeviceMemoryError if memory allocation failed.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
/// @see DeviceBuffer
|
||||
/// @see OutOfDeviceMemoryError
|
||||
/// @see hipMalloc()
|
||||
template <DataType DT>
|
||||
DeviceBuffer alloc_tensor_buffer(const TensorDescriptor<DT>& descriptor)
|
||||
{
|
||||
return alloc_buffer(descriptor.get_element_space_size_in_bytes());
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
@@ -0,0 +1,444 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdexcept>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <concepts>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/testing/type_traits.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
|
||||
/// This file deals with tensor memory layout. The `TensorDescriptor` is the
|
||||
/// main item, which is a type that describes (but not manages!) the layout
|
||||
/// of tensor memory. There are also some related utilities.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Tensor dimensions type
|
||||
///
|
||||
/// An Extent describes size in tensor space, usually either the tensor lengths
|
||||
/// (conceptual size) or the tensor strides (memory layout). This type is mainly
|
||||
/// used by the `TensorDescriptor`. This type is based on `std::array<size_t, RANK>`
|
||||
/// and supports all relevant operations on that.
|
||||
///
|
||||
/// @note In practical terms, this type is not just an alias of `std::array` for
|
||||
/// two reasons: First, writing a separate type allows us to write a custom
|
||||
/// CTAD deduction guideline. This allows users to write `Extent{1, 2, 3}` and
|
||||
/// get an instance of the correct type, whereas `std::array{1, 2, 3}` yields an
|
||||
/// instance of `std::array<int, 3>`. This, in turn, allows inferring the rank
|
||||
/// from the instance (useful in combination with `make_descriptor`), as it alows
|
||||
/// us to write `function(Extent{1, 2, 3})`. Note that `function({1, 2, 3})` is
|
||||
/// not valid before C++26 because `{1, 2, 3}` is an initializer list (even if
|
||||
/// `function` accepts an instance of `Extent`), which does not have a known size
|
||||
/// at compile time. Second, creating a separate struct for the `Extent` allows
|
||||
/// additional (static) member functions.
|
||||
///
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor that this
|
||||
/// extent describes a size of.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
/// @see make_descriptor
|
||||
template <size_t RANK>
|
||||
struct Extent : std::array<size_t, RANK>
|
||||
{
|
||||
using Base = std::array<size_t, RANK>;
|
||||
// Note: Default constructor inherited from std::array.
|
||||
|
||||
/// @brief Construct an extent from an `std::vector`.
|
||||
///
|
||||
/// This function can be used to turn an `std::vector` into an `Extent`.
|
||||
/// Because this code is mainly intended for testing, the vector's size is
|
||||
/// checked. If its not equal to `RANK`, an exception is thrown.
|
||||
///
|
||||
/// @throws std::runtime_error if the size of `extent` is not equal to `RANK`.
|
||||
static Extent from_vector(const std::vector<size_t>& extent)
|
||||
{
|
||||
if(extent.size() != RANK)
|
||||
{
|
||||
std::stringstream msg;
|
||||
msg << "invalid rank! expected: " << RANK << ", got: " << extent.size();
|
||||
throw std::runtime_error(msg.str());
|
||||
}
|
||||
|
||||
Extent result;
|
||||
std::copy_n(extent.begin(), RANK, result.begin());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Note: std::array doesn't like generating indexing code when the RANK
|
||||
// is zero. Looks like there is a missing __device__ overload in ROCm 7.1
|
||||
// at least. Its not terribly important, but just override the default
|
||||
// operator[] to fix it.
|
||||
|
||||
/// @brief Array indexing operator
|
||||
///
|
||||
/// `std::array` has issues with this operator when RANK=0, this version
|
||||
/// fixes that.
|
||||
///
|
||||
/// @param i The index to index the array with.
|
||||
///
|
||||
/// @see std::array::operator[]
|
||||
__device__ __host__ size_t operator[](size_t i) const
|
||||
{
|
||||
if constexpr(RANK > 0)
|
||||
{
|
||||
return Base::operator[](i);
|
||||
}
|
||||
else
|
||||
{
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief Array indexing operator
|
||||
///
|
||||
/// `std::array` has issues with this operator when RANK=0, this version
|
||||
/// fixes that.
|
||||
///
|
||||
/// @param i The index to index the array with.
|
||||
///
|
||||
/// @see std::array::operator[]
|
||||
__device__ __host__ size_t& operator[](size_t i)
|
||||
{
|
||||
if constexpr(RANK > 0)
|
||||
{
|
||||
return Base::operator[](i);
|
||||
}
|
||||
else
|
||||
{
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// This is a deduction guideline necessary to resolve `Extent{1, 2, 3}` to the
|
||||
// correct type. This definition is practically the same as that of `std::array`.
|
||||
template <typename... T>
|
||||
Extent(T...) -> Extent<sizeof...(T)>;
|
||||
|
||||
/// @brief Concept for automatically deriving tensor memory layout.
|
||||
///
|
||||
/// A `TensorStridesGenerator` is a type which can be used to automatically
|
||||
/// derive the strides (memory layout) of a tensor, given the tensor lengths.
|
||||
/// This is mainly used to avoid manually computing strides.
|
||||
///
|
||||
/// Implementors of this concept are required to implement `operator()`,
|
||||
/// which accepts an instance of `Extent<RANK>` (the tensor lengths) and
|
||||
/// yields another instance of `Extent<RANK>` (the tensor strides). Note
|
||||
/// that the returned strides are expected to be "pre-scanned", meaning
|
||||
/// that the offset in memory of a tensor can be computed as
|
||||
/// `dot(index * strides)` (where `*` is element-wise multiplication).
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
/// @see PackedRightLayout
|
||||
/// @see PackedLeftLayout
|
||||
template <typename G, int RANK>
|
||||
concept TensorStridesGenerator = requires(const G& generator, const Extent<RANK>& lengths) {
|
||||
{ generator(lengths) } -> std::convertible_to<Extent<RANK>>;
|
||||
};
|
||||
|
||||
/// @brief Layout generator where right-most dimension has stride 1 and
|
||||
/// all dimensions are packed.
|
||||
///
|
||||
/// This structure implements a `TensorStridesGenerator` which generates
|
||||
/// a memory layout which has the right-most dimension equal to 1, and
|
||||
/// all other strides increase right-to-left as a products of the extent.
|
||||
/// This corresponds with a row-major layout.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
/// @see TensorDescriptor
|
||||
struct PackedRightLayout
|
||||
{
|
||||
/// @brief Stride generation implementation.
|
||||
///
|
||||
/// This is the main function which implements the stride generation
|
||||
///
|
||||
/// @tparam RANK The rank of the tensor.
|
||||
///
|
||||
/// @param lengths The lengths of the tensor.
|
||||
///
|
||||
/// @returns The tensor's memory layout according to the definition
|
||||
/// of `PackedRightLayout`.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
template <size_t RANK>
|
||||
Extent<RANK> operator()(const Extent<RANK>& lengths) const
|
||||
{
|
||||
Extent<RANK> strides = {};
|
||||
size_t numel = 1;
|
||||
|
||||
for(size_t i = RANK; i > 0; --i)
|
||||
{
|
||||
strides[i - 1] = numel;
|
||||
numel *= lengths[i - 1];
|
||||
}
|
||||
|
||||
return strides;
|
||||
}
|
||||
};
|
||||
static_assert(TensorStridesGenerator<PackedRightLayout, 3>,
|
||||
"PackedRightLayout should be a TensorStridesGenerator!");
|
||||
|
||||
/// @brief Layout generator where left-most dimension has stride 1 and
|
||||
/// all dimensions are packed.
|
||||
///
|
||||
/// This structure implements a `TensorStridesGenerator` which generates
|
||||
/// a memory layout which has the left-most dimension equal to 1, and
|
||||
/// all other strides increase left-to-right as a products of the extent.
|
||||
/// This corresponds with a column-major layout.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
/// @see TensorDescriptor
|
||||
struct PackedLeftLayout
|
||||
{
|
||||
/// @brief Stride generation implementation.
|
||||
///
|
||||
/// This is the main function which implements the stride generation
|
||||
///
|
||||
/// @tparam RANK The rank of the tensor.
|
||||
///
|
||||
/// @param lengths The lengths of the tensor.
|
||||
///
|
||||
/// @returns The tensor's memory layout according to the definition
|
||||
/// of `PackedLeftLayout`.
|
||||
///
|
||||
/// @see TensorStridesGenerator
|
||||
template <size_t RANK>
|
||||
Extent<RANK> operator()(const Extent<RANK>& lengths) const
|
||||
{
|
||||
Extent<RANK> strides = {};
|
||||
size_t numel = 1;
|
||||
|
||||
for(size_t i = 0; i < RANK; ++i)
|
||||
{
|
||||
strides[i] = numel;
|
||||
numel *= lengths[i];
|
||||
}
|
||||
|
||||
return strides;
|
||||
}
|
||||
};
|
||||
static_assert(TensorStridesGenerator<PackedLeftLayout, 3>,
|
||||
"PackedLeftLayout should be a TensorStridesGenerator!");
|
||||
|
||||
/// @brief Type managing tensor data layout in memory.
///
/// This structure describes a tensor in memory. It does not actually hold any
/// reference to memory, it just describes how the memory should be laid out if it
/// were.
///
/// @note This type is very much like ck_tile::HostTensorDescriptor, except that it
/// also includes the data type of the elements of this tensor. This is mainly to
/// make the descriptor a _complete_ description of a tensor rather than just the
/// dimensions and strides, which helps in reducing clutter in uses of this type.
///
/// @note All strides are still in _elements_.
///
/// @tparam DT The conceptual data type of the tensor elements. This need not be the
///   type that the data is actually stored as in memory.
/// @tparam RANK The tensor "rank": the number of conceptual spatial dimensions that
///   the tensor covers.
template <DataType DT, size_t RANK>
struct TensorDescriptor
{
    // For now, the implementation of this type is based on
    // `ck_tile::HostTensorDescriptor`, so that we can prototype without
    // reimplementing the `HostTensorDescriptor` for the 3rd time. You can regard
    // the use of `ck_tile::HostTensorDescriptor` here as an implementation detail.

    /// @brief Tensor extent alias
    ///
    /// This alias represents a std::array which holds tensor dimensions. There is one
    /// item for each dimension in the tensor, and each item corresponds with the
    /// value for that dimension.
    using Extent = ::ck_tile::builder::test::Extent<RANK>;

    /// The conceptual data type of the tensor elements. This need not be the type
    /// that the data is actually stored as in memory.
    constexpr static DataType data_type = DT;

    /// The tensor "rank": the number of conceptual spatial dimensions that the
    /// tensor covers.
    constexpr static size_t rank = RANK;

    /// @brief Create a tensor descriptor from lengths and strides.
    ///
    /// @param lengths A sequence of tensor lengths, the conceptual dimensions of
    ///   the tensor in elements.
    /// @param strides A sequence of in-memory strides of the tensor, measured in
    ///   elements. Each element of `strides` corresponds to one at the same index
    ///   in `lengths`: the amount of elements to skip in memory to find the next
    ///   element along that axis.
    TensorDescriptor(const Extent& lengths, const Extent& strides)
        : inner_descriptor_(lengths, strides)
    {
        // TODO: Validation of strides? For now we just delegate the details of the
        // construction to the CK Tile HostTensorDescriptor.
    }

    /// @brief Create a tensor descriptor with lengths and automatic layout.
    ///
    /// This function initializes a tensor descriptor using lengths, and by deriving
    /// the memory layout from the layout generator `Generator`. The tensor will be
    /// initialized with the strides yielded from `Generator`.
    ///
    /// @tparam Generator The generator type to generate the strides with. For example,
    ///   `PackedRightLayout` or `PackedLeftLayout`.
    ///
    /// @param lengths A sequence of tensor lengths, the conceptual dimensions of
    ///   the tensor in elements.
    /// @param gen An instance of `Generator` to generate the strides with.
    ///
    /// @see TensorStridesGenerator
    /// @see PackedLeftLayout
    /// @see PackedRightLayout
    template <typename Generator>
        requires TensorStridesGenerator<Generator, RANK>
    TensorDescriptor(const Extent& lengths, const Generator& gen)
        : TensorDescriptor(lengths, gen(lengths))
    {
    }

    /// Query the conceptual dimensions of the tensor.
    ///
    /// @returns A span of tensor dimensions, one for every axis. Note that the order
    /// does *not* correspond with memory layout, query the in-memory strides for that.
    ///
    /// @see get_strides()
    Extent get_lengths() const
    {
        // TODO: This is ugly for now. We should ditch the HostTensorDescriptor, and
        // after that this can just be `return lengths_;` (and make it const Extent&).
        Extent result;
        std::copy_n(inner_descriptor_.get_lengths().begin(), RANK, result.begin());
        return result;
    }

    /// Query the in-memory strides of the tensor.
    ///
    /// @returns A span of tensor dimensions, one for every axis. Each element
    /// corresponds directly with the stride in elements at the same index in the
    /// tensor dimensions.
    ///
    /// @see get_lengths()
    Extent get_strides() const
    {
        // TODO: This is ugly for now. We should ditch the HostTensorDescriptor, and
        // after that this can just be `return strides_;` (and make it const Extent&).
        Extent result;
        std::copy_n(inner_descriptor_.get_strides().begin(), RANK, result.begin());
        return result;
    }

    /// @brief Compute conceptual tensor size in elements.
    ///
    /// This function returns the size of the tensor in elements. This function only
    /// takes the lengths into account, not the strides. In order to allocate memory
    /// for the tensor, use `get_element_space_size()`.
    ///
    /// @see get_lengths
    /// @see get_element_space_size
    size_t get_element_size() const { return inner_descriptor_.get_element_size(); }

    /// @brief Compute total tensor space size in elements.
    ///
    /// This function returns the total size of the memory backing a tensor with
    /// this descriptor in *elements*, including required extra size for strides.
    ///
    /// @see get_element_space_size_in_bytes()
    size_t get_element_space_size() const { return inner_descriptor_.get_element_space_size(); }

    /// @brief Compute total tensor size in bytes.
    ///
    /// This function is like `get_element_space_size()`, except that the returned
    /// value is measured in *bytes* rather than *elements*. Use this function for
    /// figuring out how much memory needs to be allocated for a particular tensor.
    ///
    /// @see get_element_space_size()
    size_t get_element_space_size_in_bytes() const
    {
        // For now, the backing type is the naive C++-type that represents the data
        // type. When we are going to support packed types such as i4 and fp6, this
        // is going to become more complicated.
        return get_element_space_size() * data_type_sizeof(DT);
    }

    /// @brief Get a tensor descriptor for the space backing a tensor.
    ///
    /// This function returns a tensor descriptor which represents the buffer space
    /// required for a tensor with this descriptor. This is mainly useful to process
    /// buffers with functions which normally operate over tensor descriptors. The
    /// resulting tensor descriptor describes a 1D tensor with the same number of
    /// elements as in the space.
    ///
    /// @see get_element_space_size()
    TensorDescriptor<DT, 1> get_space_descriptor() const
    {
        ck_tile::builder::test::Extent<1> lengths = {this->get_element_space_size()};
        ck_tile::builder::test::Extent<1> strides = {1};
        return TensorDescriptor<DT, 1>(lengths, strides);
    }

    private:
    ck_tile::HostTensorDescriptor inner_descriptor_;
};
|
||||
|
||||
/// @brief Tensor descriptor construction helper.
|
||||
///
|
||||
/// This function can be used to create a tensor descriptor. It accepts the same
|
||||
/// parameters as the constructor of `TensorDescriptor`, that is, a sequence of
|
||||
/// lengths and a sequence of strides (or a generator to generate the strides).
|
||||
/// The main use of this function is that it allows automatic inference of the `RANK`
|
||||
/// parameter. C++ constructors do not allow partial specification of type parameters,
|
||||
/// and so its impossible to write `TensorDescriptor<DT> x(Extent{1, 2, 3}, ...)`
|
||||
/// and have the `RANK` be automatically inferred. Functions do allow this though,
|
||||
/// so this function can be used to write `make_descriptor(Extent{1, 2, 3}, ...)`
|
||||
///
|
||||
/// @tparam DT The conceptual data type of the tensor elements. This need not be the
|
||||
/// type that the data is actually stored as in memory.
|
||||
/// @tparam RANK The tensor "rank": the number of conceptial spatial dimensions that
|
||||
/// the tensor covers.
|
||||
///
|
||||
/// @param lengths A sequence of tensor lengths, the conceptial dimensions of
|
||||
/// the tensor in elements.
|
||||
/// @param strides A sequence of in-memory strides of the tensor, or a generator
|
||||
/// to generate those strides from the tensor lengths.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
template <DataType DT, size_t RANK>
|
||||
TensorDescriptor<DT, RANK> make_descriptor(const Extent<RANK>& lengths, const auto& strides)
|
||||
{
|
||||
return TensorDescriptor<DT, RANK>(lengths, strides);
|
||||
}
|
||||
|
||||
/// @brief Allocate automatically managed GPU memory corresponding to a tensor descriptor.
|
||||
///
|
||||
/// This function is similar to `alloc_buffer()`, except that the required size is
|
||||
/// derived automatically from a tensor descriptor. The returned buffer is valid for
|
||||
/// tensors with that layout. Strides are also taken into account when computing the
|
||||
/// required size.
|
||||
///
|
||||
/// @tparam DT The conceptual datatype of the elements of the tensor.
|
||||
/// @tparam RANK The conceptual rank (number of dimensions) of the tensor.
|
||||
///
|
||||
/// @param descriptor A descriptor of the memory layout of the tensor to allocate.
|
||||
///
|
||||
/// @throws OutOfDeviceMemoryError if memory allocation failed.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
/// @see DeviceBuffer
|
||||
/// @see OutOfDeviceMemoryError
|
||||
/// @see hipMalloc()
|
||||
template <DataType DT, size_t RANK>
|
||||
DeviceBuffer alloc_tensor_buffer(const TensorDescriptor<DT, RANK>& descriptor)
|
||||
{
|
||||
return alloc_buffer(descriptor.get_element_space_size_in_bytes());
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -0,0 +1,258 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include <cstdint>
|
||||
#include <concepts>
|
||||
#include <array>
|
||||
|
||||
/// This file implements a generic GPU tensor "foreach" function. This
|
||||
/// functionality turned out useful in separate parts of the testing
|
||||
/// system, hence its implemented in a separate file. This version is
|
||||
/// not particularly efficient (but it should at least be readable),
|
||||
/// but it should be easy to replace the implementation in the future,
|
||||
/// should that be needed.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Concept for constraining tensor iteration functors.
///
/// This concept checks that a functor has the correct signature for
/// use with the `tensor_foreach` function: it must be invocable with a
/// (const) multi-dimensional index and return exactly `void`.
///
/// @tparam F The functor type to check.
/// @tparam RANK The rank of the index the functor receives.
///
/// @see tensor_foreach
template <typename F, int RANK>
concept ForeachFunctor = requires(const F& f, const Extent<RANK>& index) {
    { f(index) } -> std::same_as<void>;
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// @brief Default foreach kernel block size
///
/// This value is the default number of threads in each block when
/// executing the foreach kernel. This value is mostly arbitrary,
/// 256 is usually a good default for AMD GPUs.
///
/// @see tensor_foreach
constexpr int DEVICE_FOREACH_BLOCK_SIZE = 256;
|
||||
|
||||
/// @brief Tensor iteration kernel
|
||||
///
|
||||
/// This kernel implements the actual iteration logic, and is intended
|
||||
/// to be used solely by `tensor_foreach` to iterate & invoke the
|
||||
/// actual callback.
|
||||
///
|
||||
/// @tparam BLOCK_SIZE The number of threads in each block on the GPU.
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to
|
||||
/// iterate.
|
||||
/// @tparam F The type of the callback to invoke. This function must be
|
||||
/// compatible with execution as a __device__ function.
|
||||
///
|
||||
/// @param numel The total number of elements in the tensor.
|
||||
/// @param shape_scan A right-exclusive scan of the shape of the tensor.
|
||||
/// @param f The callback to invoke for each index of the tensor. This
|
||||
/// functor must be eligible for running on the GPU.
|
||||
template <int BLOCK_SIZE, size_t RANK, typename F>
|
||||
requires ForeachFunctor<F, RANK>
|
||||
__global__ __launch_bounds__(BLOCK_SIZE) //
|
||||
void foreach_kernel(const size_t numel, Extent<RANK> shape_scan, F f)
|
||||
{
|
||||
const auto gid = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
for(size_t flat_idx = gid; flat_idx < numel; flat_idx += gridDim.x * BLOCK_SIZE)
|
||||
{
|
||||
// Compute the current index.
|
||||
Extent<RANK> index = {};
|
||||
|
||||
size_t idx = flat_idx;
|
||||
for(size_t i = 0; i < RANK; ++i)
|
||||
{
|
||||
const auto scanned_dim = shape_scan[i];
|
||||
index[i] = idx / scanned_dim;
|
||||
idx %= scanned_dim;
|
||||
}
|
||||
|
||||
// Then invoke the callback with the index.
|
||||
f(index);
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief A utility to get a C++ type for a CKB type
///
/// Maps a `builder::DataType` enumerator to the C++ type used to store
/// elements of that data type.
///
/// Right now this is just an alias of an internal CKB helper,
/// but this should probably be moved elsewhere.
template <builder::DataType DT>
using cpp_type_t = typename builder::factory::internal::DataTypeToCK<DT>::type;
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// @brief Calculate tensor memory offset given index and strides.
|
||||
///
|
||||
/// This function returns the offset in memory in a tensor, given a particular
|
||||
/// multi-dimensional index and a particular set of strides. Each value in the
|
||||
/// index corresponds one-to-one with a value in the strides, which are the
|
||||
/// index and stride at that dimension in the tensor. These strides must be
|
||||
/// pre-scanned, meaning that each index is the absolute stride of elements
|
||||
/// along that axis. In essence, this means that you should pass the output of
|
||||
/// `TensorDescriptor::get_strides()` into this function.
|
||||
///
|
||||
/// @pre The index must be inside the tensor space.
|
||||
///
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param index A multi-dimensional index inside the tensor space.
|
||||
/// @param strides A set of strides, one for each dimension.
|
||||
///
|
||||
/// @see TensorDescriptor
|
||||
template <size_t RANK>
|
||||
__host__ __device__ size_t calculate_offset(const Extent<RANK>& index, const Extent<RANK>& strides)
|
||||
{
|
||||
size_t offset = 0;
|
||||
#pragma unroll
|
||||
for(size_t i = 0; i < RANK; ++i)
|
||||
{
|
||||
offset += index[i] * strides[i];
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
/// @brief Invoke a callback on the GPU for every index in a tensor.
|
||||
///
|
||||
/// This function invokes a callback functor on the GPU, for each index in
|
||||
/// a tensor. This function _only_ takes care of iterating over all indices
|
||||
/// in a tensor of a particular shape; this function does not handle or know
|
||||
/// about actual tensor data.
|
||||
///
|
||||
/// @note This function is currently implemented relatively naively: The
|
||||
/// iteration order is always row-wise, implemented as a persistent kernel.
|
||||
/// The main objective of this function is to be used with the CK-Builder
|
||||
/// testing system, and so readability and correctness should be preferred
|
||||
/// over performance. If this is ever a source of performance problems,
|
||||
/// feel free to replace the implementation with something better.
|
||||
///
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param shape The shape of the tensor to iterate over.
|
||||
/// @param f The callback to invoke for each index of the tensor. This
|
||||
/// functor must be eligible for running on the GPU.
|
||||
///
|
||||
/// @see ForeachFunctor
|
||||
/// @see detail::foreach_kernel
|
||||
template <size_t RANK>
|
||||
void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
|
||||
{
|
||||
constexpr int block_size = detail::DEVICE_FOREACH_BLOCK_SIZE;
|
||||
const auto kernel = detail::foreach_kernel<block_size, RANK, decltype(f)>;
|
||||
|
||||
int occupancy;
|
||||
check_hip(hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, block_size, 0));
|
||||
|
||||
int device;
|
||||
check_hip(hipGetDevice(&device));
|
||||
|
||||
int multiprocessors;
|
||||
check_hip(
|
||||
hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
|
||||
|
||||
// Pre-scan the shape to help indexing in the kernel.
|
||||
// Note: the order is not that important, so long as the iteration
|
||||
// order in the kernel is from large-to-small. Right layout is the
|
||||
// easiest solution for that.
|
||||
|
||||
Extent<RANK> shape_scan;
|
||||
size_t numel = 1;
|
||||
for(int i = RANK; i > 0; --i)
|
||||
{
|
||||
shape_scan[i - 1] = numel;
|
||||
numel *= shape[i - 1];
|
||||
}
|
||||
|
||||
// Reset any errors from previous launches.
|
||||
(void)hipGetLastError();
|
||||
|
||||
kernel<<<occupancy * multiprocessors, block_size>>>(numel, shape_scan, f);
|
||||
check_hip(hipGetLastError());
|
||||
}
|
||||
|
||||
/// @brief Concept for tensor initializing functors.
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor` function: it must be invocable with a
/// (const) multi-dimensional index and yield a value convertible to
/// the C++ storage type of the tensor's data type.
///
/// @tparam F The functor type to check.
/// @tparam DT The data type of the tensor being filled.
/// @tparam RANK The rank of the index the functor receives.
///
/// @see fill_tensor
template <typename F, builder::DataType DT, size_t RANK>
concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
    { f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
|
||||
|
||||
/// @brief Utility for initializing tensors.
|
||||
///
|
||||
/// This function is a utility helper for initializing tensors. It accepts a
|
||||
/// tensor descriptor, buffer, and a callback. The callback is invoked for every
|
||||
/// coordinate (which is passed to the callback), and the tensor is initialized
|
||||
/// with resulting value.
|
||||
///
|
||||
/// @tparam DT The tensor element datatype
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param desc The descriptor of the tensor to initialize.
|
||||
/// @param buffer The memory of the tensor to initialize.
|
||||
/// @param f A functor used to get the value at a particular coordinate.
|
||||
///
|
||||
/// @see FillTensorFunctor
|
||||
template <builder::DataType DT, size_t RANK>
|
||||
void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
|
||||
void* buffer,
|
||||
FillTensorFunctor<DT, RANK> auto f)
|
||||
{
|
||||
const auto strides = desc.get_strides();
|
||||
tensor_foreach(desc.get_lengths(), [buffer, f, strides](const auto& index) {
|
||||
using T = detail::cpp_type_t<DT>;
|
||||
auto* ptr = static_cast<T*>(buffer);
|
||||
const auto offset = calculate_offset(index, strides);
|
||||
|
||||
ptr[offset] = f(index);
|
||||
});
|
||||
}
|
||||
|
||||
/// @brief Concept for tensor buffer initializing functors.
///
/// This concept checks that a functor has the correct signature for
/// use with the `fill_tensor_buffer` function: it must be invocable
/// with a flat (1-D) index and yield a value convertible to the C++
/// storage type of the tensor's data type.
///
/// @tparam F The functor type to check.
/// @tparam DT The data type of the tensor buffer being filled.
///
/// @see fill_tensor_buffer
template <typename F, builder::DataType DT>
concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
    { f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
};
|
||||
|
||||
/// @brief Utility for initializing tensor buffers.
|
||||
///
|
||||
/// This function is a utility for initializing memory backing a tensor buffer. In
|
||||
/// contrast to `fill_tensor`, this function first extracts the backing space of
|
||||
/// the tensor, and then invokes the callback for each (flat) index. This function
|
||||
/// is particular useful for initializing out-of-bounds indices with a known with a
|
||||
/// known value.
|
||||
///
|
||||
/// @tparam DT The tensor element datatype
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param desc The descriptor of the tensor to initialize.
|
||||
/// @param buffer The memory of the tensor to initialize.
|
||||
/// @param f A functor used to get the value at a particular index.
|
||||
///
|
||||
/// @see FillTensorBufferFunctor
|
||||
template <builder::DataType DT, size_t RANK>
|
||||
void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
|
||||
void* buffer,
|
||||
FillTensorBufferFunctor<DT> auto f)
|
||||
{
|
||||
fill_tensor(desc.get_space_descriptor(), buffer, [f](auto index) { return f(index[0]); });
|
||||
}
|
||||
|
||||
/// @brief Fill a tensor's entire backing buffer with a single value.
///
/// Convenience wrapper around `fill_tensor_buffer` which stores the same
/// value at every flat index of the tensor's backing space (including any
/// stride padding). By default the buffer is zeroed.
///
/// @tparam DT The tensor element datatype
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param desc The descriptor of the tensor buffer to clear.
/// @param buffer The memory of the tensor to clear.
/// @param value The value to store in every element (defaults to zero).
///
/// @see fill_tensor_buffer
template <builder::DataType DT, size_t RANK>
void clear_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
                         void* buffer,
                         detail::cpp_type_t<DT> value = detail::cpp_type_t<DT>{0})
{
    fill_tensor_buffer(desc, buffer, [value]([[maybe_unused]] size_t i) { return value; });
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -19,15 +19,30 @@
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
template <DataType DT>
|
||||
void init_tensor_buffer_uniform_int(const DeviceBuffer& buf,
|
||||
const TensorDescriptor<DT>& descriptor,
|
||||
int min_val,
|
||||
int max_val)
|
||||
/// @brief Initialize tensor data with a uniform int distribution
|
||||
///
|
||||
/// This function initializes a tensor's device memory with random integer data,
|
||||
/// drawn from a uniform distribution. The initialization is done directly on the
|
||||
/// GPU. Note that the entire buffer is filled with the specified distribution
|
||||
/// regardless of whether the layout is packed.
|
||||
///
|
||||
/// @tparam DT The data type of the tensor memory to initialize
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param buf The device memory to initialize
|
||||
/// @param descriptor A tensor descriptor describing the precise layout of the
|
||||
/// tensor memory.
|
||||
/// @param min_value The minimum value of the distribution (inclusive).
|
||||
/// @param max_value The maximum value of the distribution (exclusive).
|
||||
template <DataType DT, size_t RANK>
|
||||
void init_tensor_buffer_uniform_int(void* buf,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
int min_value,
|
||||
int max_value)
|
||||
{
|
||||
size_t size = descriptor.get_element_space_size_in_bytes();
|
||||
|
||||
if(max_val - min_val <= 1)
|
||||
if(max_value - min_value <= 1)
|
||||
{
|
||||
throw std::runtime_error("Error while filling device tensor with random integer data: max "
|
||||
"value must be at least 2 greater than min value, otherwise "
|
||||
@@ -38,19 +53,34 @@ void init_tensor_buffer_uniform_int(const DeviceBuffer& buf,
|
||||
|
||||
// we might be asked to generate int values on fp data types that don't have the required
|
||||
// precision
|
||||
if(static_cast<ck_type>(max_val - 1) == static_cast<ck_type>(min_val))
|
||||
if(static_cast<ck_type>(max_value - 1) == static_cast<ck_type>(min_value))
|
||||
{
|
||||
throw std::runtime_error("Error while filling device tensor with random integer data: "
|
||||
"insufficient precision in specified range");
|
||||
}
|
||||
size_t packed_size = ck::packed_size_v<ck_type>;
|
||||
fill_tensor_uniform_rand_int_values<<<256, 256>>>(
|
||||
static_cast<ck_type>(buf.get()), min_val, max_val, (size * packed_size) / sizeof(ck_type));
|
||||
static_cast<ck_type>(buf), min_value, max_value, (size * packed_size) / sizeof(ck_type));
|
||||
}
|
||||
|
||||
template <DataType DT>
|
||||
void init_tensor_buffer_uniform_fp(const DeviceBuffer& buf,
|
||||
const TensorDescriptor<DT>& descriptor,
|
||||
/// @brief Initialize tensor data with a uniform float distribution
|
||||
///
|
||||
/// This function initializes a tensor's device memory with random floating data,
|
||||
/// drawn from a uniform distribution. The initialization is done directly on the
|
||||
/// GPU. Note that the entire buffer is filled with the specified distribution
|
||||
/// regardless of whether the layout is packed.
|
||||
///
|
||||
/// @tparam DT The data type of the tensor memory to initialize
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param buf The device memory to initialize
|
||||
/// @param descriptor A tensor descriptor describing the precise layout of the
|
||||
/// tensor memory.
|
||||
/// @param min_value The minimum value of the distribution (inclusive).
|
||||
/// @param max_value The maximum value of the distribution (exclusive).
|
||||
template <DataType DT, size_t RANK>
|
||||
void init_tensor_buffer_uniform_fp(void* buf,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
float min_value,
|
||||
float max_value)
|
||||
{
|
||||
@@ -59,15 +89,30 @@ void init_tensor_buffer_uniform_fp(const DeviceBuffer& buf,
|
||||
using ck_type = factory::internal::DataTypeToCK<DT>::type;
|
||||
|
||||
size_t packed_size = ck::packed_size_v<ck_type>;
|
||||
fill_tensor_uniform_rand_fp_values<<<256, 256>>>(reinterpret_cast<ck_type*>(buf.get()),
|
||||
fill_tensor_uniform_rand_fp_values<<<256, 256>>>(reinterpret_cast<ck_type*>(buf),
|
||||
min_value,
|
||||
max_value,
|
||||
(size * packed_size) / sizeof(ck_type));
|
||||
}
|
||||
|
||||
template <DataType DT>
|
||||
void init_tensor_buffer_normal_fp(const DeviceBuffer& buf,
|
||||
const TensorDescriptor<DT>& descriptor,
|
||||
/// @brief Initialize tensor data with a normal float distribution
|
||||
///
|
||||
/// This function initializes a tensor's device memory with random floating data,
|
||||
/// drawn from a normal distribution. The initialization is done directly on the
|
||||
/// GPU. Note that the entire buffer is filled with the specified distribution
|
||||
/// regardless of whether the layout is packed.
|
||||
///
|
||||
/// @tparam DT The data type of the tensor memory to initialize
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param buf The device memory to initialize
|
||||
/// @param descriptor A tensor descriptor describing the precise layout of the
|
||||
/// tensor memory.
|
||||
/// @param sigma The standard deviation of the distribution.
|
||||
/// @param mean The mean of the distribution.
|
||||
template <DataType DT, size_t RANK>
|
||||
void init_tensor_buffer_normal_fp(void* buf,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
float sigma,
|
||||
float mean)
|
||||
{
|
||||
@@ -76,7 +121,7 @@ void init_tensor_buffer_normal_fp(const DeviceBuffer& buf,
|
||||
using ck_type = factory::internal::DataTypeToCK<DT>::type;
|
||||
size_t packed_size = ck::packed_size_v<ck_type>;
|
||||
fill_tensor_norm_rand_fp_values<<<256, 256>>>(
|
||||
static_cast<ck_type*>(buf.get()), sigma, mean, (size * packed_size) / sizeof(ck_type));
|
||||
static_cast<ck_type*>(buf), sigma, mean, (size * packed_size) / sizeof(ck_type));
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
|
||||
#include <concepts>
|
||||
|
||||
#include "ck_tile/builder/testing/validation.hpp"
|
||||
|
||||
/// This file is the main header for the CK-Builder testing system. A high-level
|
||||
/// description of this testing system is documented in
|
||||
/// `ck_tile/builder/testing/README.md`. This file deals mainly with the
|
||||
@@ -78,7 +80,7 @@ namespace ck_tile::builder::test {
|
||||
/// that this structure is an aggregate so that it can be initialized using C++20
|
||||
/// designated initializers to keep the tests readable.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
template <auto SIGNATURE>
|
||||
struct Args;
|
||||
|
||||
@@ -98,7 +100,7 @@ struct Args;
|
||||
/// structure is an aggregate so that it can be initialized using C++20
|
||||
/// designated initializers to keep the tests readable.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
template <auto SIGNATURE>
|
||||
struct Inputs;
|
||||
|
||||
@@ -118,7 +120,7 @@ struct Inputs;
|
||||
/// structure is an aggregate so that it can be initialized using C++20
|
||||
/// designated initializers to keep the tests readable.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
template <auto SIGNATURE>
|
||||
struct Outputs;
|
||||
|
||||
@@ -133,7 +135,7 @@ struct Outputs;
|
||||
/// @note The easiest way to implement this type is to use the `DeviceBuffer`
|
||||
/// type to allocate individual device buffers for each input tensor.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @see alloc_inputs()
|
||||
/// @see ValidUniqueInputs
|
||||
@@ -152,7 +154,7 @@ struct UniqueInputs;
|
||||
/// @note The easiest way to implement this type is to use the `DeviceBuffer`
|
||||
/// type to allocate individual device buffers for each output tensor.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @see alloc_outputs()
|
||||
/// @see ValidUniqueOutputs
|
||||
@@ -195,7 +197,9 @@ concept ValidUniqueOutputs = requires(UniqueOutputs<SIGNATURE>& inputs) {
|
||||
/// amount of memory required and then allocate it on the device, for example
|
||||
/// using `alloc_buffer` or `alloc_tensor_buffer`.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
///
|
||||
/// @see Inputs
|
||||
/// @see UniqueInputs
|
||||
@@ -208,16 +212,18 @@ UniqueInputs<SIGNATURE> alloc_inputs(const Args<SIGNATURE>& args);
|
||||
/// @brief Allocate inputs corresponding to a signature.
|
||||
///
|
||||
/// The `init_inputs()` function is used to initialize pseudo-random data
|
||||
/// to the tensors specified in the Inputs structure.
|
||||
/// to the tensors specified in the Inputs structure. Implementors should
|
||||
/// fill each of the tensors in `inputs` with appropriate random data.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
/// @param inputs The operation inputs to initialize with random data.
|
||||
///
|
||||
/// @see Inputs
|
||||
/// @see UniqueInputs
|
||||
/// @see tensor_initialization
|
||||
template <auto SIGNATURE>
|
||||
requires ValidUniqueInputs<SIGNATURE>
|
||||
void init_inputs(const Args<SIGNATURE>& args, UniqueInputs<SIGNATURE>& inputs);
|
||||
void init_inputs(const Args<SIGNATURE>& args, Inputs<SIGNATURE> inputs);
|
||||
|
||||
/// @brief Allocate outputs corresponding to a signature.
|
||||
///
|
||||
@@ -226,7 +232,9 @@ void init_inputs(const Args<SIGNATURE>& args, UniqueInputs<SIGNATURE>& inputs);
|
||||
/// amount of memory required and then allocate it on the device, for example
|
||||
/// using `alloc_buffer` or `alloc_tensor_buffer`.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize the structure for.
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
///
|
||||
/// @see Outputs
|
||||
/// @see UniqueOutputs
|
||||
@@ -236,6 +244,29 @@ template <auto SIGNATURE>
|
||||
requires ValidUniqueOutputs<SIGNATURE>
|
||||
UniqueInputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args);
|
||||
|
||||
/// @brief Compare device operation outputs.
|
||||
///
|
||||
/// This function implements the main comparison functionality, used to compare
|
||||
/// the output of one implementation for a particular `SIGNATURE` with that of
|
||||
/// another. Usually, the `expected` output should be computed by a reference
|
||||
/// implementation.
|
||||
///
|
||||
/// The implementation of this function generates a "report", which includes
|
||||
/// detailed information about which tensors are different, how many elements
|
||||
/// were incorrect, and where (a subset of) those elements are located within
|
||||
/// the tensor. See `ValidationReport` for more information about the report.
|
||||
///
|
||||
/// @tparam SIGNATURE The signature to specialize the structure for.
|
||||
///
|
||||
/// @param args The run-time arguments of the operation.
|
||||
/// @param actual The actual results, the results of the operation to-be-tested.
|
||||
/// @param expected The expected results, the results of the reference implementation.
|
||||
///
|
||||
/// @see ValidationReport
|
||||
template <auto SIGNATURE>
|
||||
ValidationReport
|
||||
validate(const Args<SIGNATURE>& args, Outputs<SIGNATURE> actual, Outputs<SIGNATURE> expected);
|
||||
|
||||
/// @brief Invoke a device operation created by CK Builder.
|
||||
///
|
||||
/// This is the main function used to invoke a particular device operation
|
||||
@@ -257,7 +288,7 @@ UniqueInputs<SIGNATURE> alloc_outputs(const Args<SIGNATURE>& args);
|
||||
/// @post The tensors in `outputs` are overwritten with the outputs of the device
|
||||
/// operation.
|
||||
///
|
||||
/// @tparam SIGNATURE the signature to specialize this function for
|
||||
/// @tparam SIGNATURE The signature to specialize this function for
|
||||
/// @tparam Operation the kernel of the operation to invoke. This type should be
|
||||
/// one that is created using the Builder API.
|
||||
/// @param operation An instance of the operation to invoke.
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_foreach.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
/// This file implements functionality related to "validation", ie, functionality
|
||||
/// to compare tensors. The functionality in this file should be testing-framework
|
||||
/// agnostic, and it should NOT generate any error messages by itself. Instead,
|
||||
/// all relevant information should be stored in the `ValidationReport` structure.
|
||||
/// This structure should then be used to generate error messages, explainations,
|
||||
/// etc, by the actual testing framework that the user has chosen.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Information about how a set of comparisons failed or succeeded.
|
||||
///
|
||||
/// This structure represents a "report" generated by comparing sets of tensors.
|
||||
/// Its intended to be used as the result of `ckt::validate()`, where `check()`
|
||||
/// is invoked for each of the output tensors of a particular device operation.
|
||||
/// The test should be considered successful if _all_ of those checks passes,
|
||||
/// which can inspected by asserting that `get_errors().size()` is 0.
|
||||
struct ValidationReport
|
||||
{
|
||||
/// @brief Information related to a single tensor comparison.
|
||||
///
|
||||
/// This structure holds the information about the result of comparing
|
||||
/// two particular tensors.
|
||||
struct Case
|
||||
{
|
||||
/// The name of the tensor that was compared here, stored here for convenience
|
||||
/// so that reporting any errors is easier.
|
||||
std::string tensor_name;
|
||||
|
||||
/// The number of elements which were different between the two compared tensors.
|
||||
uint64_t wrong_elements;
|
||||
|
||||
/// The total number of elements in each tensor.
|
||||
uint64_t total_elements;
|
||||
|
||||
/// @brief Return whether the check associated to this case was successful.
|
||||
///
|
||||
/// This function returns whether the check associated to this case was successful,
|
||||
/// which is directly derived from checking whether the number of incorrect elements
|
||||
/// was 0.
|
||||
bool is_ok() const { return wrong_elements == 0; }
|
||||
};
|
||||
|
||||
/// @brief Get comparison cases which were incorrect.
|
||||
///
|
||||
/// This function returns a vector of comparison cases that did not succeed, ie, for
|
||||
/// which `Case::is_ok` return false. In order to check whether validation passed, it
|
||||
/// is sufficient to assert that this function returns no cases.
|
||||
std::vector<Case> get_errors() const
|
||||
{
|
||||
std::vector<Case> errors;
|
||||
std::copy_if(reports_.begin(),
|
||||
reports_.end(),
|
||||
std::back_inserter(errors),
|
||||
[](const auto& report) { return !report.is_ok(); });
|
||||
return errors;
|
||||
}
|
||||
|
||||
/// @brief Compare two tensors and record the results in the report.
|
||||
///
|
||||
/// This is the main function used to compare two tensors. The results of this
|
||||
/// comparison, including any supplemental information, is recorded into the report.
|
||||
///
|
||||
/// @returns `false` if the comparison failed. If so, the details can be found via
|
||||
/// `get_errors()`.
|
||||
///
|
||||
/// @tparam DT The data type of the tensors to check.
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to check.
|
||||
///
|
||||
/// @param tensor_name The name of the tensors to check. This should be a value by which
|
||||
/// whoever is debugging the associated test later can easily find out which of the
|
||||
/// outputs of a device operation was incorrect.
|
||||
/// @param descriptor The descriptor (memory layout) of the tensor.
|
||||
/// @param actual The device buffer with the values of the tensor to-be-tested, ie, the
|
||||
/// results of the device operation.
|
||||
/// @param expected The device buffer with the values of the reference tensor. These are
|
||||
/// treated as a "golden standard", and should usually be generated by a reference
|
||||
/// implementation.
|
||||
/// @param rtol The relative acceptable tolerance between two values.
|
||||
/// @param atol The absolute acceptable tolerance between two values.
|
||||
template <DataType DT, size_t RANK>
|
||||
bool check(std::string_view tensor_name,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
const void* actual,
|
||||
const void* expected,
|
||||
double rtol = 1e-3,
|
||||
double atol = 1e-3);
|
||||
|
||||
private:
|
||||
std::vector<Case> reports_;
|
||||
};
|
||||
|
||||
template <DataType DT, size_t RANK>
|
||||
bool ValidationReport::check(std::string_view tensor_name,
|
||||
const TensorDescriptor<DT, RANK>& descriptor,
|
||||
const void* actual_data,
|
||||
const void* expected_data,
|
||||
double rtol,
|
||||
double atol)
|
||||
{
|
||||
const auto strides = descriptor.get_strides();
|
||||
|
||||
// During development and CI, only the kernels that were changed would fail, and so we can
|
||||
// assume that the average case does not have errors. Therefore, split out testing into a
|
||||
// quick test which just counts the incorrect elements, and a more in-depth test that also
|
||||
// returns the indices of the incorrect items.
|
||||
|
||||
// Initial pass: count errors
|
||||
|
||||
// Allocate and reset counter
|
||||
auto d_error_count = alloc_buffer(sizeof(uint64_t));
|
||||
check_hip(hipMemset(d_error_count.get(), 0, sizeof(uint64_t)));
|
||||
|
||||
tensor_foreach(descriptor.get_lengths(), [=, error_count = d_error_count.get()](auto index) {
|
||||
using CKType = typename factory::internal::DataTypeToCK<DT>::type;
|
||||
|
||||
const auto* actual = static_cast<const CKType*>(actual_data);
|
||||
const auto* expected = static_cast<const CKType*>(expected_data);
|
||||
|
||||
static_assert(!std::is_same_v<CKType, double>,
|
||||
"TODO implement compare_kernel() for double");
|
||||
|
||||
const auto offset = calculate_offset(index, strides);
|
||||
|
||||
const auto o = static_cast<double>(type_convert<float>(actual[offset]));
|
||||
const auto r = static_cast<double>(type_convert<float>(expected[offset]));
|
||||
const auto err = std::abs(o - r);
|
||||
|
||||
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
|
||||
{
|
||||
// We expect the number of errors to be very low, so just use an atomic
|
||||
// for now.
|
||||
atomicAdd(reinterpret_cast<uint64_t*>(error_count), 1);
|
||||
}
|
||||
});
|
||||
|
||||
uint64_t error_count = 0;
|
||||
check_hip(
|
||||
hipMemcpy(&error_count, d_error_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));
|
||||
|
||||
// TODO: Gather detailed coordinates.
|
||||
|
||||
reports_.push_back(Case{
|
||||
.tensor_name = std::string(tensor_name),
|
||||
.wrong_elements = error_count,
|
||||
.total_elements = descriptor.get_element_size(),
|
||||
});
|
||||
|
||||
return error_count == 0;
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -80,33 +80,36 @@ add_ck_builder_test(test_ckb_conv_builder
|
||||
test_instance_traits_util.cpp
|
||||
unit_device_buffer.cpp
|
||||
unit_tensor_descriptor.cpp
|
||||
unit_tensor_foreach.cpp
|
||||
unit_error.cpp
|
||||
unit_validation.cpp
|
||||
unit_conv_elementwise_op.cpp
|
||||
unit_conv_tensor_layout.cpp
|
||||
unit_conv_tensor_type.cpp
|
||||
unit_conv_thread_block.cpp
|
||||
unit_conv_tuning_params.cpp)
|
||||
|
||||
# Tests the inline diff utility used for comparing strings in tests assertions
|
||||
add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
|
||||
|
||||
# GPU reference validation tests (in validation/ folder)
|
||||
# 1. Reference kernel execution and InstanceTraits
|
||||
add_ck_builder_test(test_ckb_reference_execution
|
||||
validation/test_reference_execution.cpp
|
||||
validation/test_reference_instance_traits.cpp)
|
||||
target_link_libraries(test_ckb_reference_execution PRIVATE utility)
|
||||
|
||||
# Note: Optimized kernel validation tests will be added after merging dev branch
|
||||
# with kernel Run() implementation from colleague's work
|
||||
# Tests the inline diff utility used for comparing strings in tests assertions
|
||||
add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
|
||||
|
||||
# GPU reference validation tests (in validation/ folder)
|
||||
# 1. Reference kernel execution and InstanceTraits
|
||||
add_ck_builder_test(test_ckb_reference_execution
|
||||
validation/test_reference_execution.cpp
|
||||
validation/test_reference_instance_traits.cpp)
|
||||
target_link_libraries(test_ckb_reference_execution PRIVATE utility)
|
||||
|
||||
# Note: Optimized kernel validation tests will be added after merging dev branch
|
||||
# with kernel Run() implementation from colleague's work
|
||||
|
||||
# Tests convolution trait selection and configuration
|
||||
add_ck_builder_test(test_ckb_conv_traits
|
||||
conv/ck/test_conv_traits.cpp)
|
||||
|
||||
# Tests convolution problem description and parameter handling
|
||||
add_ck_builder_test(test_ckb_conv_description
|
||||
test_conv_description.cpp)
|
||||
|
||||
# Tests convolution trait selection and configuration
|
||||
add_ck_builder_test(test_ckb_conv_traits
|
||||
conv/ck/test_conv_traits.cpp)
|
||||
|
||||
# Tests convolution problem description and parameter handling
|
||||
add_ck_builder_test(test_ckb_conv_description
|
||||
test_conv_description.cpp)
|
||||
|
||||
################################################################################
|
||||
# REGRESSION TESTS - Integration Tests (With Kernel Compilation)
|
||||
################################################################################
|
||||
|
||||
@@ -6,11 +6,14 @@
|
||||
#include "utils/conv_algorithm_type_utils.hpp"
|
||||
#include "ck_tile/builder/testing/conv_fwd_ck.hpp"
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
namespace cku = ck_tile::builder::test_utils;
|
||||
|
||||
using ck_tile::test::MatchesReference;
|
||||
|
||||
constexpr auto SIGNATURE =
|
||||
ckt::ConvSignature{.spatial_dim = 2,
|
||||
.direction = ckb::ConvDirection::FORWARD,
|
||||
@@ -78,11 +81,18 @@ TEST(Fwd2DFp16_CShufV3_GNHWC, EndToEnd)
|
||||
.cde_elementwise_op = {},
|
||||
};
|
||||
|
||||
auto inputs = alloc_inputs(args);
|
||||
auto outputs = alloc_outputs(args);
|
||||
auto inputs = ckt::alloc_inputs(args);
|
||||
auto outputs = ckt::alloc_outputs(args);
|
||||
|
||||
init_inputs(args, inputs);
|
||||
ckt::init_inputs(args, inputs.get());
|
||||
|
||||
auto conv = Instance{};
|
||||
ckt::run(conv, args, inputs.get(), outputs.get());
|
||||
|
||||
// TODO: This should be allocated via ckt::alloc_outputs() and
|
||||
// initialized via ckt::run() with the reference implementation
|
||||
// instead.
|
||||
auto reference = outputs.get();
|
||||
|
||||
EXPECT_THAT(outputs.get(), MatchesReference(args, reference));
|
||||
}
|
||||
|
||||
@@ -5,8 +5,7 @@
|
||||
|
||||
#include "testing_utils.hpp"
|
||||
|
||||
namespace ck_tile::builder {
|
||||
namespace {
|
||||
using ck_tile::test::inlineDiff;
|
||||
|
||||
TEST(InlineDiff, simpleColorDiff)
|
||||
{
|
||||
@@ -16,8 +15,8 @@ TEST(InlineDiff, simpleColorDiff)
|
||||
|
||||
// some easy tests
|
||||
// you can veryfy the ungodly strings are meaningful by running echo -e "<string>"
|
||||
EXPECT_THAT(test::inlineDiff(str1, str2, true), "hello");
|
||||
EXPECT_THAT(test::inlineDiff(str1, str3, true),
|
||||
EXPECT_THAT(inlineDiff(str1, str2, true), "hello");
|
||||
EXPECT_THAT(inlineDiff(str1, str3, true),
|
||||
"[\x1B[36mwor\x1B[0m|\x1B[35mhel\x1B[0m]l[\x1B[36md\x1B[0m|\x1B[35mo\x1B[0m]");
|
||||
}
|
||||
|
||||
@@ -28,8 +27,8 @@ TEST(InlineDiff, noColorDiff)
|
||||
std::string str3{"world"};
|
||||
|
||||
// some easy tests without color
|
||||
EXPECT_THAT(test::inlineDiff(str1, str2, false), "hello");
|
||||
EXPECT_THAT(test::inlineDiff(str1, str3, false), "[wor|hel]l[d|o]");
|
||||
EXPECT_THAT(inlineDiff(str1, str2, false), "hello");
|
||||
EXPECT_THAT(inlineDiff(str1, str3, false), "[wor|hel]l[d|o]");
|
||||
}
|
||||
|
||||
TEST(InlineDiff, complexColorDiff)
|
||||
@@ -42,11 +41,8 @@ TEST(InlineDiff, complexColorDiff)
|
||||
"this part has degeahc, this part has, this part added, this part has ana extra letter"};
|
||||
|
||||
EXPECT_THAT(
|
||||
test::inlineDiff(str5, str4, true),
|
||||
inlineDiff(str5, str4, true),
|
||||
"this part has [\x1B[36mchanged\x1B[0m|\x1B[35mdegeahc\x1B[0m], this part has[\x1B[36m "
|
||||
"been left out\x1B[0m|\x1B[35m\x1B[0m], this part[\x1B[36m\x1B[0m|\x1B[35m added\x1B[0m], "
|
||||
"this part has an[\x1B[36m\x1B[0m|\x1B[35ma\x1B[0m] extra letter");
|
||||
};
|
||||
|
||||
} // namespace
|
||||
} // namespace ck_tile::builder
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <ck/library/tensor_operation_instance/device_operation_instance_factory.hpp>
|
||||
#include "ck_tile/builder/testing/testing.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <string>
|
||||
@@ -21,6 +22,16 @@
|
||||
/// dedicated function to override to provide printing support.
|
||||
std::ostream& operator<<(std::ostream& os, hipError_t status);
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
template <auto SIGNATURE>
|
||||
std::ostream& operator<<(std::ostream& os, [[maybe_unused]] Outputs<SIGNATURE> outputs)
|
||||
{
|
||||
return os << "<tensor outputs>";
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
namespace ck_tile::test {
|
||||
|
||||
static bool isTerminalOutput() { return isatty(fileno(stdout)) || isatty(fileno(stderr)); }
|
||||
@@ -150,4 +161,47 @@ struct HipStatusMatcher : public ::testing::MatcherInterface<hipError_t>
|
||||
/// @param error The error to expect.
|
||||
::testing::Matcher<hipError_t> HipError(hipError_t error);
|
||||
|
||||
template <auto SIGNATURE>
|
||||
struct ReferenceOutputMatcher
|
||||
: public ::testing::MatcherInterface<builder::test::Outputs<SIGNATURE>>
|
||||
{
|
||||
ReferenceOutputMatcher(const builder::test::Args<SIGNATURE>& args,
|
||||
builder::test::Outputs<SIGNATURE> expected)
|
||||
: args_(&args), expected_(expected)
|
||||
{
|
||||
}
|
||||
|
||||
bool MatchAndExplain(builder::test::Outputs<SIGNATURE> actual,
|
||||
[[maybe_unused]] ::testing::MatchResultListener* listener) const override
|
||||
{
|
||||
const auto report = ck_tile::builder::test::validate(*args_, actual, expected_);
|
||||
const auto errors = report.get_errors();
|
||||
|
||||
if(listener->IsInterested() && !errors.empty())
|
||||
{
|
||||
*listener << errors.size() << " tensors failed to validate";
|
||||
}
|
||||
|
||||
return errors.empty();
|
||||
}
|
||||
|
||||
void DescribeTo(std::ostream* os) const override { *os << "<tensor outputs>"; }
|
||||
|
||||
void DescribeNegationTo(std::ostream* os) const override
|
||||
{
|
||||
*os << "isn't equal to <tensor outputs>";
|
||||
}
|
||||
|
||||
const builder::test::Args<SIGNATURE>* args_;
|
||||
builder::test::Outputs<SIGNATURE> expected_;
|
||||
};
|
||||
|
||||
template <auto SIGNATURE>
|
||||
::testing::Matcher<builder::test::Outputs<SIGNATURE>>
|
||||
MatchesReference(const builder::test::Args<SIGNATURE>& args,
|
||||
builder::test::Outputs<SIGNATURE> expected)
|
||||
{
|
||||
return ::testing::MakeMatcher(new ReferenceOutputMatcher<SIGNATURE>(args, expected));
|
||||
}
|
||||
|
||||
} // namespace ck_tile::test
|
||||
|
||||
@@ -11,40 +11,27 @@ namespace {
|
||||
namespace ckb = ck_tile::builder;
|
||||
using ck_tile::builder::factory::internal::DataTypeToCK;
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForFP16)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::FP16>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, ck::half_t>));
|
||||
}
|
||||
template <ckb::DataType DT, typename T>
|
||||
constexpr auto check_same = std::is_same_v<typename DataTypeToCK<DT>::type, T>;
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForBF16)
|
||||
TEST(ConvTensorType, Exhaustive)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::BF16>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, ck::bhalf_t>));
|
||||
}
|
||||
using enum ckb::DataType;
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForFP32)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::FP32>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, float>));
|
||||
}
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForINT32)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::INT32>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, int32_t>));
|
||||
}
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForI8)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::I8>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, int8_t>));
|
||||
}
|
||||
|
||||
TEST(ConvTensorType, AssignsTypesForFP8)
|
||||
{
|
||||
using CKType = DataTypeToCK<ckb::DataType::FP8>::type;
|
||||
EXPECT_TRUE((std::is_same_v<CKType, ck::f8_t>));
|
||||
const auto type = FP32;
|
||||
// This switch ensures that we get a warning (error with -Werror) if
|
||||
// a variant is missing.
|
||||
switch(type)
|
||||
{
|
||||
case UNDEFINED_DATA_TYPE: break;
|
||||
case FP32: EXPECT_TRUE((check_same<FP32, float>)); break;
|
||||
case FP16: EXPECT_TRUE((check_same<FP16, ck::half_t>)); break;
|
||||
case BF16: EXPECT_TRUE((check_same<BF16, ck::bhalf_t>)); break;
|
||||
case INT32: EXPECT_TRUE((check_same<INT32, uint32_t>)); break;
|
||||
case FP8: EXPECT_TRUE((check_same<FP8, ck::f8_t>)); break;
|
||||
case I8: EXPECT_TRUE((check_same<I8, int8_t>)); break;
|
||||
case U8: EXPECT_TRUE((check_same<U8, uint8_t>)); break;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -2,10 +2,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
@@ -54,6 +55,11 @@ TEST(DeviceBuffer, AutoFree)
|
||||
|
||||
// Trying to use a pointer after freeing should return en error in HIP.
|
||||
EXPECT_THAT(hipMemset(ptr, 0xFF, size), HipError(hipErrorInvalidValue));
|
||||
|
||||
// Reset internal HIP error state.
|
||||
// Otherwise, the error may leak into other tests, triggering anything that
|
||||
// checks the output of hipGetLastError();
|
||||
(void)hipGetLastError();
|
||||
}
|
||||
|
||||
TEST(DeviceBuffer, ThrowsOnOom)
|
||||
@@ -62,13 +68,16 @@ TEST(DeviceBuffer, ThrowsOnOom)
|
||||
|
||||
auto check = [] { auto buffer = ckt::alloc_buffer(size); };
|
||||
EXPECT_THAT(check, Throws<ckt::OutOfDeviceMemoryError>());
|
||||
|
||||
// Reset internal HIP error state.
|
||||
// Otherwise, the error may leak into other tests, triggering anything that
|
||||
// checks the output of hipGetLastError();
|
||||
(void)hipGetLastError();
|
||||
}
|
||||
|
||||
TEST(DeviceBuffer, AllocTensorBuffer)
|
||||
{
|
||||
std::vector<size_t> lengths = {128, 128, 128};
|
||||
std::vector<size_t> strides = {128 * 128, 128, 1};
|
||||
ckt::TensorDescriptor<ckb::DataType::FP32> descriptor(lengths, strides);
|
||||
ckt::TensorDescriptor<ckb::DataType::FP32, 3> descriptor({128, 128, 128}, {128 * 128, 128, 1});
|
||||
|
||||
auto buffer = ckt::alloc_tensor_buffer(descriptor);
|
||||
|
||||
|
||||
46
experimental/builder/test/unit_error.cpp
Normal file
46
experimental/builder/test/unit_error.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ::testing::AllOf;
|
||||
using ::testing::HasSubstr;
|
||||
using ::testing::Throws;
|
||||
using ::testing::ThrowsMessage;
|
||||
|
||||
[[noreturn]] void throw_error() { throw ckt::HipError("test error", hipErrorInvalidValue); }
|
||||
|
||||
TEST(HipError, SourceInfo)
|
||||
{
|
||||
EXPECT_THAT(throw_error,
|
||||
ThrowsMessage<ckt::HipError>(AllOf(
|
||||
// The error message should include...
|
||||
// ...the user message
|
||||
HasSubstr("test error"),
|
||||
// ...the HIP message
|
||||
HasSubstr("invalid argument"),
|
||||
// ...the HIP status code,
|
||||
HasSubstr("(1)"),
|
||||
// ...the filename
|
||||
HasSubstr("experimental/builder/test/unit_error.cpp"),
|
||||
// ...the function name
|
||||
HasSubstr("throw_error"),
|
||||
// Note: Don't include the row/column so that we can move
|
||||
// stuff around in this file.
|
||||
)));
|
||||
}
|
||||
|
||||
TEST(CheckHip, BasicUsage)
|
||||
{
|
||||
EXPECT_THAT([] { ckt::check_hip(hipSuccess); }, Not(Throws<ckt::HipError>()));
|
||||
EXPECT_THAT([] { ckt::check_hip(hipErrorNotMapped); }, Throws<ckt::HipError>());
|
||||
EXPECT_THAT([] { ckt::check_hip(hipErrorOutOfMemory); }, Throws<ckt::OutOfDeviceMemoryError>());
|
||||
EXPECT_THAT([] { ckt::check_hip("test message", hipErrorAlreadyMapped); },
|
||||
ThrowsMessage<ckt::HipError>(HasSubstr("test message")));
|
||||
}
|
||||
@@ -1,25 +1,28 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ::testing::ElementsAreArray;
|
||||
using ::testing::Ge;
|
||||
using ::testing::Eq;
|
||||
using ::testing::Throws;
|
||||
|
||||
TEST(TensorDescriptor, Basic)
|
||||
{
|
||||
constexpr auto dt = ckb::DataType::FP16;
|
||||
std::vector<size_t> lengths = {123, 456, 789};
|
||||
std::vector<size_t> strides = {456 * 789, 789, 1};
|
||||
constexpr auto dt = ckb::DataType::FP16;
|
||||
constexpr size_t rank = 3;
|
||||
ckt::Extent lengths = {123, 456, 789};
|
||||
ckt::Extent strides = {456 * 789, 789, 1};
|
||||
|
||||
ckt::TensorDescriptor<dt> descriptor(lengths, strides);
|
||||
ckt::TensorDescriptor<dt, rank> descriptor(lengths, strides);
|
||||
|
||||
EXPECT_THAT(descriptor.get_lengths(), ElementsAreArray(lengths));
|
||||
EXPECT_THAT(descriptor.get_strides(), ElementsAreArray(strides));
|
||||
@@ -27,21 +30,143 @@ TEST(TensorDescriptor, Basic)
|
||||
|
||||
TEST(TensorDescriptor, ComputeSize)
|
||||
{
|
||||
constexpr auto dt = ckb::DataType::FP32;
|
||||
std::vector<size_t> lengths = {305, 130, 924};
|
||||
std::vector<size_t> strides = {1000 * 1000, 1, 1000};
|
||||
constexpr auto dt = ckb::DataType::FP32;
|
||||
constexpr size_t rank = 3;
|
||||
ckt::Extent lengths = {305, 130, 924};
|
||||
ckt::Extent strides = {1001 * 1000, 1, 1000};
|
||||
|
||||
ckt::TensorDescriptor<dt> descriptor(lengths, strides);
|
||||
ckt::TensorDescriptor<dt, rank> descriptor(lengths, strides);
|
||||
|
||||
// Compute the location of the last item in memory, then add one
|
||||
// to get the minimum size.
|
||||
size_t expected_size = 1;
|
||||
// Compute the location of the last item in memory,
|
||||
// then add one to get the minimum size.
|
||||
size_t expected_size = 1;
|
||||
size_t expected_numel = 1;
|
||||
for(size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
expected_size += (lengths[i] - 1) * strides[i];
|
||||
expected_numel *= lengths[i];
|
||||
}
|
||||
|
||||
EXPECT_THAT(descriptor.get_element_space_size(), Ge(expected_size));
|
||||
EXPECT_THAT(descriptor.get_element_size(), Eq(expected_numel));
|
||||
EXPECT_THAT(descriptor.get_element_space_size(), Eq(expected_size));
|
||||
EXPECT_THAT(descriptor.get_element_space_size_in_bytes(),
|
||||
Ge(expected_size * ckt::data_type_sizeof(dt)));
|
||||
Eq(expected_size * ckt::data_type_sizeof(dt)));
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, PackedRightLayout)
|
||||
{
|
||||
const ckt::Extent lengths = {5125, 623, 1177, 1534};
|
||||
const auto strides = ckt::PackedRightLayout{}(lengths);
|
||||
|
||||
EXPECT_THAT(strides, ElementsAreArray({623 * 1177 * 1534, 1177 * 1534, 1534, 1}));
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, PackedLeftLayout)
|
||||
{
|
||||
const ckt::Extent lengths = {4, 15, 925, 662, 1462};
|
||||
const auto strides = ckt::PackedLeftLayout{}(lengths);
|
||||
|
||||
EXPECT_THAT(strides, ElementsAreArray({1, 4, 4 * 15, 4 * 15 * 925, 4 * 15 * 925 * 662}));
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, MakeDescriptor)
|
||||
{
|
||||
{
|
||||
const ckt::Extent lengths = {10, 11, 12, 13, 14};
|
||||
|
||||
// Note: automatic inference of RANK.
|
||||
const auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::INT32>(lengths, ckt::PackedRightLayout{});
|
||||
|
||||
EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
|
||||
EXPECT_THAT(desc.get_strides(),
|
||||
ElementsAreArray({11 * 12 * 13 * 14, 12 * 13 * 14, 13 * 14, 14, 1}));
|
||||
}
|
||||
|
||||
{
|
||||
const ckt::Extent lengths = {4, 3, 2};
|
||||
const ckt::Extent strides = {60, 1, 7};
|
||||
|
||||
// Note: automatic inference of RANK.
|
||||
const auto desc = ckt::make_descriptor<ckb::DataType::FP8>(lengths, strides);
|
||||
|
||||
EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
|
||||
EXPECT_THAT(desc.get_strides(), ElementsAreArray(strides));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, GetSpaceDescriptor)
|
||||
{
|
||||
{
|
||||
const auto desc = ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{4, 4, 4},
|
||||
ckt::PackedLeftLayout{});
|
||||
const auto space = desc.get_space_descriptor();
|
||||
|
||||
const auto expected = 4 * 4 * 4;
|
||||
|
||||
EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
|
||||
EXPECT_THAT(decltype(space)::rank, Eq(1));
|
||||
|
||||
EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
|
||||
EXPECT_THAT(decltype(space)::rank, Eq(1));
|
||||
EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected}));
|
||||
EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
|
||||
EXPECT_THAT(space.get_element_size(), Eq(expected));
|
||||
EXPECT_THAT(space.get_element_space_size(), Eq(expected));
|
||||
}
|
||||
|
||||
{
|
||||
const ckt::Extent lengths = {6, 3, 4};
|
||||
const ckt::Extent strides = {102, 1, 2002};
|
||||
const auto desc = ckt::make_descriptor<ckb::DataType::FP32>(lengths, strides);
|
||||
const auto space = desc.get_space_descriptor();
|
||||
|
||||
// Compute the location of the last item in memory,
|
||||
// then add one to get the minimum size.
|
||||
size_t expected_size = 1;
|
||||
for(size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
expected_size += (lengths[i] - 1) * strides[i];
|
||||
}
|
||||
|
||||
EXPECT_THAT(decltype(space)::data_type, Eq(ckb::DataType::FP32));
|
||||
EXPECT_THAT(decltype(space)::rank, Eq(1));
|
||||
EXPECT_THAT(space.get_lengths(), ElementsAreArray({expected_size}));
|
||||
EXPECT_THAT(space.get_strides(), ElementsAreArray({1}));
|
||||
EXPECT_THAT(space.get_element_size(), Eq(expected_size));
|
||||
EXPECT_THAT(space.get_element_space_size(), Eq(expected_size));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TensorDescriptor, EmptyExtent)
{
    // A rank-0 descriptor addresses a single scalar element.
    const auto scalar_desc =
        ckt::make_descriptor<ckb::DataType::FP16>(ckt::Extent{}, ckt::Extent{});

    EXPECT_THAT(decltype(scalar_desc)::rank, Eq(0));
    EXPECT_THAT(scalar_desc.get_lengths().size(), Eq(0));
    EXPECT_THAT(scalar_desc.get_strides().size(), Eq(0));
    EXPECT_THAT(scalar_desc.get_element_size(), Eq(1));
    EXPECT_THAT(scalar_desc.get_element_space_size(), Eq(1));
    EXPECT_THAT(scalar_desc.get_element_space_size_in_bytes(), Eq(2));

    // Its space descriptor is expected to be rank-1 with one unit-length
    // dimension.
    const auto space_desc = scalar_desc.get_space_descriptor();
    const auto single_element = 1;

    EXPECT_THAT(decltype(space_desc)::rank, Eq(1));
    EXPECT_THAT(space_desc.get_lengths(), ElementsAreArray({single_element}));
    EXPECT_THAT(space_desc.get_strides(), ElementsAreArray({1}));
    EXPECT_THAT(space_desc.get_element_size(), Eq(single_element));
    EXPECT_THAT(space_desc.get_element_space_size(), Eq(single_element));
    EXPECT_THAT(space_desc.get_element_space_size_in_bytes(), Eq(2));
}
|
||||
|
||||
TEST(TensorDescriptor, ExtentFromVector)
{
    // A vector with the matching number of entries converts element-for-element.
    const std::vector<size_t> values = {1, 2, 3, 4};
    EXPECT_THAT(ckt::Extent<4>::from_vector(values), ElementsAreArray({1, 2, 3, 4}));

    // A size mismatch must be rejected with a runtime error.
    const auto mismatched = [] { return ckt::Extent<5>::from_vector(std::vector<size_t>{1, 2}); };
    EXPECT_THAT(mismatched, Throws<std::runtime_error>());
}
|
||||
|
||||
205
experimental/builder/test/unit_tensor_foreach.cpp
Normal file
205
experimental/builder/test/unit_tensor_foreach.cpp
Normal file
@@ -0,0 +1,205 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/builder/testing/tensor_foreach.hpp"
#include "testing_utils.hpp"
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <algorithm>
#include <functional>
#include <numeric>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ::testing::Each;
|
||||
using ::testing::Eq;
|
||||
|
||||
// calculate_offset is the dot product of an index tuple with a stride tuple.
TEST(TensorForeach, CalculateOffset)
{
    // 1*100 + 2*10 + 3*1 = 123.
    EXPECT_THAT(ckt::calculate_offset(ckt::Extent{1, 2, 3}, ckt::Extent{100, 10, 1}), Eq(123));
    // Arbitrary, non-monotonic strides.
    EXPECT_THAT(ckt::calculate_offset(ckt::Extent{523, 266, 263}, ckt::Extent{1, 545, 10532}),
                Eq(2915409));
    // Rank-0: the empty dot product is zero.
    EXPECT_THAT(ckt::calculate_offset(ckt::Extent{}, ckt::Extent{}), Eq(0));
    // Note: >4 GB overflow test
    EXPECT_THAT(ckt::calculate_offset(ckt::Extent{8, 2, 5, 7, 0, 4, 1, 3, 6, 9},
                                      ckt::Extent{1'000,
                                                  1'000'000,
                                                  10'000'000,
                                                  1'000'000'000,
                                                  1,
                                                  10'000,
                                                  100,
                                                  10,
                                                  100'000'000,
                                                  100'000}),
                Eq(size_t{7'652'948'130}));
}
|
||||
|
||||
TEST(TensorForeach, VisitsCorrectCount)
{
    // tensor_foreach should visit every index exactly once.
    // This test checks that the count is at least correct.

    const ckt::Extent shape = {10, 20, 30};

    // Device-side counter, bumped once per visited index.
    auto d_count = ckt::alloc_buffer(sizeof(uint64_t));
    ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t)));

    ckt::tensor_foreach(shape, [count = d_count.get()]([[maybe_unused]] const auto& index) {
        atomicAdd(reinterpret_cast<uint64_t*>(count), 1);
    });

    uint64_t actual;
    ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));

    // Seed the accumulation with size_t{1} (not int 1) so the running product
    // is computed in size_t and cannot overflow for large shapes.
    const auto expected =
        std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());

    EXPECT_THAT(actual, Eq(expected));
}
|
||||
|
||||
TEST(TensorForeach, VisitsEveryIndex)
{
    const ckt::Extent shape = {5, 6, 7, 8, 9, 10, 11};
    // Seed with size_t{1} (not int 1) so the element-count product is computed
    // in size_t and cannot overflow for large shapes.
    const auto total =
        std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());

    // We know this is correct due to testing in unit_tensor_descriptor.cpp
    const auto stride = ckt::PackedRightLayout{}(shape);

    // One visit counter per linear element, zero-initialized.
    auto d_output = ckt::alloc_buffer(sizeof(uint32_t) * total);
    ckt::check_hip(hipMemset(d_output.get(), 0, sizeof(uint32_t) * total));

    ckt::tensor_foreach(shape, [output = d_output.get(), stride](const auto& index) {
        // We know this is correct due to the CalculateOffset test.
        auto offset = ckt::calculate_offset(index, stride);

        // Use atomic add so that we can check that every index is visited exactly once.
        atomicAdd(&reinterpret_cast<uint32_t*>(output)[offset], 1);
    });

    std::vector<uint32_t> actual(total);
    ckt::check_hip(
        hipMemcpy(actual.data(), d_output.get(), sizeof(uint32_t) * total, hipMemcpyDeviceToHost));

    // Exactly-once visiting means every counter is exactly 1.
    EXPECT_THAT(actual, Each(Eq(1)));
}
|
||||
|
||||
TEST(TensorForeach, FillTensorBuffer)
{
    // fill_tensor_buffer writes generator(i) at every linear offset i of the
    // element space.
    auto desc = ckt::make_descriptor<ckb::DataType::INT32>(ckt::Extent{31, 54, 13},
                                                           ckt::PackedRightLayout{});

    auto d_tensor = ckt::alloc_tensor_buffer(desc);

    ckt::fill_tensor_buffer(
        desc, d_tensor.get(), [](size_t i) { return static_cast<uint32_t>(i); });

    // Copy back to the host and verify each element holds its own linear index.
    std::vector<uint32_t> host(desc.get_element_space_size());
    ckt::check_hip(hipMemcpy(
        host.data(), d_tensor.get(), host.size() * sizeof(uint32_t), hipMemcpyDeviceToHost));

    for(size_t idx = 0; idx < host.size(); ++idx)
    {
        EXPECT_THAT(host[idx], Eq(static_cast<uint32_t>(idx)));
    }
}
|
||||
|
||||
TEST(TensorForeach, FillTensor)
{
    // FillTensor with non-packed indices should not write out-of-bounds.
    const ckt::Extent shape = {4, 23, 35};
    const ckt::Extent pad = {12, 53, 100};
    auto desc = ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
    const auto strides = desc.get_strides();

    auto size = desc.get_element_space_size();
    auto buffer = ckt::alloc_tensor_buffer(desc);

    // Sentinel-fill the entire element space, then let fill_tensor overwrite
    // only the in-shape elements.
    ckt::fill_tensor_buffer(desc, buffer.get(), []([[maybe_unused]] size_t i) { return 123; });

    ckt::fill_tensor(desc, buffer.get(), []([[maybe_unused]] const auto& index) { return 1; });

    // A single device-side error counter suffices (it was previously
    // over-allocated to one uint32_t per element, with only the first used).
    auto d_error = ckt::alloc_buffer(sizeof(uint32_t));
    ckt::check_hip(hipMemset(d_error.get(), 0, sizeof(uint32_t)));

    ckt::tensor_foreach(
        // Iterate over the entire padding so that we can check out-of-bounds elements
        pad,
        [shape, pad, strides, size, error = d_error.get(), tensor = buffer.get()](
            const auto& index) {
            const auto offset = ckt::calculate_offset(index, strides);

            // Note: The space of the descriptor will not actually be (12, 53, 100) but
            // more like (4, 53, 100), as the outer stride is irrelevant. So we have to
            // perform an extra bounds check here. The element is read only after the
            // offset is known to be inside the allocation; previously it was read
            // before this check, which is an out-of-bounds access for offsets in the
            // outer padding.
            if(offset < size)
            {
                const auto value = reinterpret_cast<const uint32_t*>(tensor)[offset];

                // Check if the coordinate is within the shape bounds.
                bool in_bounds = true;
                for(size_t i = 0; i < shape.size(); ++i)
                {
                    if(index[i] >= shape[i])
                    {
                        in_bounds = false;
                    }
                }

                // In-bounds elements are 1, out-of-bounds is 123.
                if(in_bounds && value != 1)
                {
                    atomicAdd(reinterpret_cast<uint32_t*>(error), 1);
                }
                else if(!in_bounds && value != 123)
                {
                    atomicAdd(reinterpret_cast<uint32_t*>(error), 1);
                }
            }
        });

    uint32_t error_count = 0;
    ckt::check_hip(hipMemcpy(&error_count, d_error.get(), sizeof(uint32_t), hipMemcpyDeviceToHost));

    EXPECT_THAT(error_count, Eq(0));
}
|
||||
|
||||
// clear_tensor_buffer must zero the entire element space, including the
// out-of-shape padding introduced by the larger strides.
TEST(TensorForeach, ClearTensorZeros)
{
    const ckt::Extent shape = {5, 4, 5, 4, 5, 4, 5, 6};
    const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6};

    const auto desc =
        ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));

    auto buffer = ckt::alloc_tensor_buffer(desc);
    ckt::clear_tensor_buffer(desc, buffer.get());

    // Check that all values are zeroed.
    // Device-side counter of non-zero elements found.
    auto d_count = ckt::alloc_buffer(sizeof(uint64_t));
    ckt::check_hip(hipMemset(d_count.get(), 0, sizeof(uint64_t)));

    {
        const auto size = desc.get_element_space_size();
        const auto strides = desc.get_strides();
        auto* count = d_count.get();
        const auto* tensor = reinterpret_cast<const uint32_t*>(buffer.get());
        // Note: iterate over the entire pad, so that we can check out-of-bounds elements.
        ckt::tensor_foreach(pad,
                            [count, tensor, strides, size]([[maybe_unused]] const auto& index) {
                                const auto offset = ckt::calculate_offset(index, strides);

                                // Note: The space of the descriptor will not actually be (6, 6,
                                // ...) but more like (5, 6, ...), as the outer stride is
                                // irrelevant. So we have to perform an extra bounds check here.
                                // (The short-circuit keeps the read inside the allocation.)
                                if(offset < size && tensor[offset] != 0)
                                {
                                    atomicAdd(reinterpret_cast<uint64_t*>(count), 1);
                                }
                            });
    }

    uint64_t actual;
    ckt::check_hip(hipMemcpy(&actual, d_count.get(), sizeof(uint64_t), hipMemcpyDeviceToHost));

    // Zero non-zero elements found anywhere in the space.
    EXPECT_THAT(actual, Eq(0));
}
|
||||
277
experimental/builder/test/unit_validation.cpp
Normal file
277
experimental/builder/test/unit_validation.cpp
Normal file
@@ -0,0 +1,277 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/validation.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_foreach.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include "ck_tile/builder/testing/testing.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <span>
|
||||
#include <array>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using testing::ElementsAreArray;
|
||||
using testing::Eq;
|
||||
using testing::StrEq;
|
||||
|
||||
using ck_tile::test::MatchesReference;
|
||||
using ck_tile::test::StringEqWithDiff;
|
||||
|
||||
// Googletest cannot have both type AND value parameterized tests.
|
||||
// For now just act lazy and use value template parameters.
|
||||
template <ckb::DataType DT, ckt::Extent SHAPE, auto STRIDES>
struct Param
{
    // Element data type of the tensor under test.
    constexpr static auto data_type = DT;
    // Tensor lengths.
    constexpr static auto shape = SHAPE;
    // Either an explicit Extent of strides or a layout tag
    // (PackedLeftLayout / PackedRightLayout) — see the Types list below.
    constexpr static auto strides = STRIDES;

    // Rank is implied by the number of shape entries.
    constexpr static auto rank = shape.size();

    // Builds the tensor descriptor described by the template parameters.
    static ckt::TensorDescriptor<data_type, rank> get_descriptor()
    {
        return ckt::make_descriptor<data_type, rank>(shape, strides);
    }
};
|
||||
|
||||
// Empty typed-test fixture; all per-type configuration lives in Param.
template <typename Param>
struct ValidationReportTests : public ::testing::Test
{
};

// Covers packed-right, packed-left, rank-0, and arbitrarily-strided layouts.
using Types = ::testing::Types<
    Param<ckb::DataType::FP32, ckt::Extent{52, 152, 224}, ckt::PackedRightLayout{}>,
    Param<ckb::DataType::FP32, ckt::Extent{72, 1, 49, 2, 4, 5}, ckt::PackedLeftLayout{}>,
    Param<ckb::DataType::FP32, ckt::Extent{}, ckt::Extent{}>,
    Param<ckb::DataType::FP32, ckt::Extent{12, 34, 43, 21}, ckt::Extent{41, 1, 43210, 1831}>>;

TYPED_TEST_SUITE(ValidationReportTests, Types);
|
||||
|
||||
TYPED_TEST(ValidationReportTests, SingleCorrect)
{
    // Two buffers filled by the same deterministic generator must validate
    // with zero errors.
    const auto desc = TypeParam::get_descriptor();

    // Generate a sort-of-random looking sequence per coordinate.
    const auto pattern = [strides = desc.get_strides()](const auto& index) {
        const auto linear = ckt::calculate_offset(index, strides);
        return static_cast<float>(linear * 10'000'019 % 768'351);
    };

    auto reference = ckt::alloc_tensor_buffer(desc);
    ckt::clear_tensor_buffer(desc, reference.get());
    ckt::fill_tensor(desc, reference.get(), pattern);

    auto candidate = ckt::alloc_tensor_buffer(desc);
    ckt::clear_tensor_buffer(desc, candidate.get());
    ckt::fill_tensor(desc, candidate.get(), pattern);

    ckt::ValidationReport report;
    report.check("correct", desc, candidate.get(), reference.get());

    EXPECT_THAT(report.get_errors().size(), Eq(0));
}
|
||||
|
||||
TYPED_TEST(ValidationReportTests, SingleIncorrect)
{
    // Inject mismatches at flat indices 0, 12345 and 999999, then verify the
    // report counts exactly the mismatches that fall inside the tensor.
    const auto desc = TypeParam::get_descriptor();
    const auto packed_strides = ckt::PackedRightLayout{}(desc.get_lengths());

    auto a = ckt::alloc_tensor_buffer(desc);
    auto b = ckt::alloc_tensor_buffer(desc);

    ckt::clear_tensor_buffer(desc, a.get());
    ckt::clear_tensor_buffer(desc, b.get());

    ckt::fill_tensor(desc, a.get(), []([[maybe_unused]] const auto& i) { return 123; });
    ckt::fill_tensor(desc, b.get(), [packed_strides](const auto& index) {
        const auto flat_index = ckt::calculate_offset(index, packed_strides);
        return flat_index == 0 ? 0 : flat_index == 12345 ? 456 : flat_index == 999999 ? 1 : 123;
    });

    ckt::ValidationReport report;
    report.check("incorrect", desc, b.get(), a.get());

    const auto errors = report.get_errors();

    // Flat indices run 0..size-1, so index K is only present when size > K
    // (strictly greater — `>=` here would be off by one at the boundary).
    const auto flat_size = desc.get_element_size();
    const auto expected_errors = flat_size > 999999 ? 3 : flat_size > 12345 ? 2 : 1;

    ASSERT_THAT(errors.size(), Eq(1));
    EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect"));
    EXPECT_THAT(errors[0].wrong_elements, Eq(expected_errors));
    EXPECT_THAT(errors[0].total_elements, Eq(desc.get_element_size()));
}
|
||||
|
||||
// Several tensors checked against one report accumulate one error entry per
// mismatching tensor, in check order. The extents are spelled with character
// literals, e.g. {'R', 'O', 'C', 'm'} is {82, 79, 67, 109}.
TEST(ValidationReportTests, MultipleSomeIncorrect)
{
    ckt::ValidationReport report;

    // BF16 tensor whose fill patterns (i % 100 vs i % 101) disagree on most
    // elements -> expected to fail.
    {
        auto desc = ckt::make_descriptor<ckb::DataType::BF16, 4>({'R', 'O', 'C', 'm'},
                                                                 ckt::PackedLeftLayout{});

        auto a = ckt::alloc_tensor_buffer(desc);
        auto b = ckt::alloc_tensor_buffer(desc);

        ckt::fill_tensor_buffer(
            desc, a.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 100); });
        ckt::fill_tensor_buffer(
            desc, b.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 101); });

        report.check("incorrect 1", desc, b.get(), a.get());
    }

    // U8 tensor where both fills produce the identical repeating "ROCm" byte
    // pattern -> expected to pass, contributing no error entry.
    {
        auto desc =
            ckt::make_descriptor<ckb::DataType::U8, 3>({'H', 'I', 'P'}, ckt::PackedRightLayout{});

        auto a = ckt::alloc_tensor_buffer(desc);
        auto b = ckt::alloc_tensor_buffer(desc);

        ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return "ROCm"[i % 4]; });
        ckt::fill_tensor_buffer(desc, b.get(), [](size_t i) {
            switch(i % 4)
            {
            case 0: return 'R';
            case 1: return 'O';
            case 2: return 'C';
            case 3: return 'm';
            default: return 'x';
            }
        });

        report.check("correct", desc, b.get(), a.get());
    }

    // INT32 tensor where every element differs (1 vs 555) -> expected to fail.
    {
        auto desc = ckt::make_descriptor<ckb::DataType::INT32, 3>({'G', 'P', 'U'},
                                                                  ckt::PackedRightLayout{});

        auto a = ckt::alloc_tensor_buffer(desc);
        auto b = ckt::alloc_tensor_buffer(desc);

        ckt::fill_tensor_buffer(desc, a.get(), []([[maybe_unused]] size_t i) { return 1; });
        ckt::fill_tensor_buffer(desc, b.get(), []([[maybe_unused]] size_t i) { return 555; });

        report.check("incorrect 2", desc, b.get(), a.get());
    }

    const auto errors = report.get_errors();

    // Only the two mismatching tensors appear. 46840334 is the count of i in
    // [0, 82*79*67*109) with (i % 100) != (i % 101); 482800 = 71*80*85 (every
    // element of the third tensor is wrong).
    ASSERT_THAT(errors.size(), Eq(2));
    EXPECT_THAT(errors[0].tensor_name, StrEq("incorrect 1"));
    EXPECT_THAT(errors[0].wrong_elements, Eq(46840334));
    EXPECT_THAT(errors[1].tensor_name, StrEq("incorrect 2"));
    EXPECT_THAT(errors[1].wrong_elements, Eq(482800));
}
|
||||
|
||||
// MatchesReference operates on the types defined in testing.hpp, so just
// quickly define a bunch of dummy values for that.

// Empty tag type used as the non-type template argument selecting the dummy
// Args/Outputs/validate specializations below.
struct DummySignature
{
};

constexpr DummySignature DUMMY_SIGNATURE = {};
|
||||
|
||||
namespace ck_tile::builder::test {
// Dummy argument set: provides two descriptors of different data types,
// ranks and layouts for the validate() specialization below.
template <>
struct Args<DUMMY_SIGNATURE>
{
    auto make_a_descriptor() const
    {
        return make_descriptor<builder::DataType::FP32>(Extent{5, 5, 5, 5}, PackedRightLayout{});
    }

    auto make_b_descriptor() const
    {
        return make_descriptor<builder::DataType::FP16>(Extent{100000}, PackedLeftLayout{});
    }
};

// Dummy outputs: non-owning pointers to the buffers of the two tensors.
template <>
struct Outputs<DUMMY_SIGNATURE>
{
    void* a;
    void* b;
};

// Validates both tensors of `actual` against `expected`, collecting any
// per-tensor errors in a single report.
template <>
ValidationReport validate<DUMMY_SIGNATURE>(const Args<DUMMY_SIGNATURE>& args,
                                           Outputs<DUMMY_SIGNATURE> actual,
                                           Outputs<DUMMY_SIGNATURE> expected)
{
    ValidationReport report;
    report.check("a", args.make_a_descriptor(), actual.a, expected.a);
    report.check("b", args.make_b_descriptor(), actual.b, expected.b);
    return report;
}

} // namespace ck_tile::builder::test
|
||||
|
||||
TEST(MatchesReference, Correct)
{
    // Identical actual/expected buffer contents must satisfy the matcher.
    const ckt::Args<DUMMY_SIGNATURE> args;

    const auto a_desc = args.make_a_descriptor();
    const auto b_desc = args.make_b_descriptor();

    // Fill tensor "a" with 1s and tensor "b" with 2s on both sides.
    auto a_lhs = ckt::alloc_tensor_buffer(a_desc);
    auto b_lhs = ckt::alloc_tensor_buffer(b_desc);
    auto a_rhs = ckt::alloc_tensor_buffer(a_desc);
    auto b_rhs = ckt::alloc_tensor_buffer(b_desc);

    ckt::clear_tensor_buffer(a_desc, a_lhs.get(), 1);
    ckt::clear_tensor_buffer(b_desc, b_lhs.get(), 2);
    ckt::clear_tensor_buffer(a_desc, a_rhs.get(), 1);
    ckt::clear_tensor_buffer(b_desc, b_rhs.get(), 2);

    const auto actual = ckt::Outputs<DUMMY_SIGNATURE>{
        .a = a_lhs.get(),
        .b = b_lhs.get(),
    };
    const auto expected = ckt::Outputs<DUMMY_SIGNATURE>{
        .a = a_rhs.get(),
        .b = b_rhs.get(),
    };

    EXPECT_THAT(actual, MatchesReference(args, expected));
}
|
||||
|
||||
TEST(MatchesReference, Incorrect)
{
    // Tensor "a" differs between actual (filled with 1s) and expected (2s);
    // the matcher must fail and report exactly one failing tensor.
    const ckt::Args<DUMMY_SIGNATURE> args;

    const auto a_desc = args.make_a_descriptor();
    const auto b_desc = args.make_b_descriptor();

    auto a_actual = ckt::alloc_tensor_buffer(a_desc);
    auto b_actual = ckt::alloc_tensor_buffer(b_desc);
    ckt::clear_tensor_buffer(a_desc, a_actual.get(), 1);
    ckt::clear_tensor_buffer(b_desc, b_actual.get(), 2);
    const auto actual = ckt::Outputs<DUMMY_SIGNATURE>{
        .a = a_actual.get(),
        .b = b_actual.get(),
    };

    auto a_expected = ckt::alloc_tensor_buffer(a_desc);
    auto b_expected = ckt::alloc_tensor_buffer(b_desc);
    ckt::clear_tensor_buffer(a_desc, a_expected.get(), 2);
    ckt::clear_tensor_buffer(b_desc, b_expected.get(), 2);
    const auto expected = ckt::Outputs<DUMMY_SIGNATURE>{
        .a = a_expected.get(),
        .b = b_expected.get(),
    };

    // EXPECT_FALSE reads better than EXPECT_TRUE(!...), and the explicit
    // testing:: qualification avoids relying on ADL to find ExplainMatchResult.
    testing::StringMatchResultListener listener;
    EXPECT_FALSE(testing::ExplainMatchResult(MatchesReference(args, expected), actual, &listener));

    EXPECT_THAT(listener.str(), StringEqWithDiff("1 tensors failed to validate"));
}
|
||||
Reference in New Issue
Block a user