mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
[CK_BUILDER] Debug utilities (#3528)
* ck-builder: make toString to_string We are using snake case for CK-Builder * ck-builder: add debug.hpp with tensor descriptor printing function This adds some initial functionality to debug.hpp, a header which will be used to house some debug utilities. * ck-builder: abstract nd-iteration Abstracting this makes it easier to test, clearer, and allows us to use it elsewhere (such as in debug.hpp soon) * ck-builder: tensor printing * ck-builder: rename INT32 to I32 This makes it more in line with the other data type definitions.
This commit is contained in:
@@ -33,7 +33,7 @@ struct DataTypeToCK<DataType::FP32>
|
||||
using type = float;
|
||||
};
|
||||
template <>
|
||||
struct DataTypeToCK<DataType::INT32>
|
||||
struct DataTypeToCK<DataType::I32>
|
||||
{
|
||||
using type = int32_t;
|
||||
};
|
||||
|
||||
634
experimental/builder/include/ck_tile/builder/testing/debug.hpp
Normal file
634
experimental/builder/include/ck_tile/builder/testing/debug.hpp
Normal file
@@ -0,0 +1,634 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/error.hpp"
|
||||
#include "ck_tile/builder/testing/type_traits.hpp"
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <syncstream>
|
||||
#include <concepts>
|
||||
#include <limits>
|
||||
|
||||
/// This file contains a few debugging utilities, mainly focused around
|
||||
/// tensor data. The idea is that the functionality in this file is not
|
||||
/// necessarily used in any testing directly, but is available for the
|
||||
/// programmer to help with debugging problems. These utilities themselves
|
||||
/// should be tested just the same, though, so that they don't undergo
|
||||
/// bitrot while they are not actively being used.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// @brief Thousands-separating number punctuation for CK-Builder debugging.
///
/// Debugging output is usually produced under the default C locale, which
/// has no thousands separator, making large numbers hard to read at a
/// glance. This `std::numpunct` facet groups digits in threes and separates
/// the groups with `'` — the same character C++14 allows as a digit
/// separator in numeric literals — so the magnitude of a number is
/// immediately apparent.
///
/// @note After imbuing a stream with this facet, be sure to restore the
/// previous locale in case the user actually relies on a non-standard one.
///
/// @see std::numpunct
struct numpunct : std::numpunct<char>
{
    /// Group digits in threes (see std::numpunct::do_grouping).
    std::string do_grouping() const override
    {
        return "\3";
    }

    /// Separate groups using the C++14 digit-separator quote.
    char do_thousands_sep() const override { return '\''; }
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// @brief Print information about a tensor descriptor.
///
/// This function dumps useful information from a tensor descriptor to a
/// stream, `std::cout` by default. This includes the number of elements
/// in the tensor, the size of the backing space, lengths, strides, and
/// whether the layout is packed.
///
/// @note All information is printed using a lightly modified locale to
/// get a unified printing experience. The original locale in `stream` is
/// temporarily replaced, but restored before the function returns.
///
/// @tparam DT The tensor element datatype.
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param name A name for the tensor descriptor.
/// @param desc The tensor descriptor to print.
/// @param out The stream to print to, `std::cout` by default.
template <DataType DT, size_t RANK>
void print_descriptor(std::string_view name,
                      const TensorDescriptor<DT, RANK>& desc,
                      std::ostream& out = std::cout)
{
    // Create a custom stream with a completely fresh configuration (locale,
    // precision, fill, etc.) so the caller's stream state is untouched. An
    // osyncstream buffers the output while we're at it (it's not likely to
    // help a lot, but why not).
    std::osyncstream stream(out.rdbuf());
    // Install the thousands-separating facet; the osyncstream is temporary,
    // so the caller's locale on `out` is never modified.
    stream.imbue(std::locale(std::locale(), new detail::numpunct{}));

    // Gather the generic descriptor information up front.
    const auto size = desc.get_element_size();
    const auto space = desc.get_element_space_size();
    const auto bytes = desc.get_element_space_size_in_bytes();
    const auto packed = desc.is_packed();

    // std::endl on the last line also flushes the osyncstream buffer.
    stream << "Descriptor \"" << name << "\":\n"
           << " data type: " << DT << '\n'
           << " size: " << size << " elements\n"
           << " space: " << space << " elements (" << bytes << " bytes)\n"
           << " lengths: " << desc.get_lengths() << '\n'
           << " strides: " << desc.get_strides() << '\n'
           << " packed: " << (packed ? "yes" : "no") << std::endl;
}
|
||||
|
||||
/// @brief User configuration for printing tensors.
///
/// Groups the knobs that control how tensors are rendered. The defaults are
/// sensible for a quick look; use `TensorPrintConfig::unlimited()` when the
/// whole tensor should be dumped regardless of its size.
struct TensorPrintConfig
{
    /// @brief Maximum number of columns of a tensor row to print.
    ///
    /// Each tensor row is printed as a sequence of values. At most this many
    /// values appear; when a row is longer, `row_skip_val` is printed in
    /// place of the omitted middle section.
    size_t col_limit = 10;

    /// @brief Maximum number of rows of a 2D matrix to print.
    ///
    /// Tensors of rank > 1 are printed as one matrix or a series of matrix
    /// slices. At most this many rows appear; when there are more, a row of
    /// `matrix_row_skip_val` (and possibly `row_skip_val`) is printed in
    /// place of the omitted rows.
    size_t row_limit = 10;

    /// @brief Maximum number of 2D tensor slices to print.
    ///
    /// Tensors of rank > 2 are flattened into a sequence of 2D slices; at
    /// most this many slices are printed.
    size_t slice_limit = 8;

    /// @brief Text emitted at the start of each row of values.
    ///
    /// Used by `TensorPrinter` as a row prefix.
    std::string_view row_prefix = " ";

    /// @brief Text emitted between adjacent values of a row.
    ///
    /// Used by `TensorPrinter` as the field separator.
    std::string_view row_field_sep = " ";

    /// @brief Text emitted in place of skipped values within a row.
    ///
    /// Used by `TensorPrinter` when a row has more values than `col_limit`.
    std::string_view row_skip_val = "...";

    /// @brief Text emitted in place of a skipped matrix row.
    ///
    /// Like `row_skip_val` but in the vertical direction: every field of the
    /// placeholder row is printed as this text.
    std::string_view matrix_row_skip_val = "...";

    /// @brief Number of decimal digits printed for floating point values.
    int float_precision = 3;

    /// @brief Return the default print config with all printing limits lifted.
    ///
    /// Useful for printing the *entire* tensor — beware that this may emit a
    /// lot of data for large tensors!
    static constexpr TensorPrintConfig unlimited()
    {
        constexpr auto no_limit = std::numeric_limits<size_t>::max();

        TensorPrintConfig cfg;
        cfg.col_limit   = no_limit;
        cfg.row_limit   = no_limit;
        cfg.slice_limit = no_limit;
        return cfg;
    }
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// @brief Iterate over a range of values, but limit the number of iterations.
///
/// Iterate over the values `0..n`, but if `n > limit`, only visit the first
/// and last `limit / 2` items (the leading section receives one extra item
/// when `limit` is odd). This can be used to iterate over large ranges in a
/// way that not too many values are visited. It's primarily used when
/// printing tensors so that not all values of a giant tensor are dumped to
/// the user's terminal.
///
/// @tparam ForeachFn Functor type invocable with a `size_t` index.
/// @tparam DelimFn Functor type invocable with the `size_t` skip count.
///
/// @param n The total number of items to iterate over.
/// @param limit The maximum number of items to iterate over. Use even values
/// for best results, as this will lead to the same amount of values in the
/// "begin" and "end" sections.
/// @param f A functor to invoke for each element. The sole parameter is the
/// index.
/// @param delim A functor to invoke between the begin and end sections. This
/// function is only invoked if any items are skipped at all; its parameter
/// is the number of skipped items.
template <typename ForeachFn, typename DelimFn>
void limited_foreach(size_t n, size_t limit, ForeachFn f, DelimFn delim)
{
    if(n <= limit)
    {
        // Everything fits within the limit; just visit every index.
        for(size_t i = 0; i < n; ++i)
            f(i);
        return;
    }

    const auto begin_count = (limit + 1) / 2; // Round up in case `limit` is odd.
    const auto end_count   = limit / 2;
    const auto skip_count  = n - limit;

    for(size_t i = 0; i < begin_count; ++i)
        f(i);

    delim(skip_count);

    for(size_t i = n - end_count; i < n; ++i)
        f(i);
}
|
||||
|
||||
/// @brief Output stream requirements for use with `TensorPrinter`.
|
||||
///
|
||||
/// The `TensorPrinter` does not write to an ostream directly, but rather writes to
|
||||
/// a custom stream object. This is mainly so that the user of `TensorPrinter` can
|
||||
/// get more details than directly with an ostream. Basically, a valid implementation
|
||||
/// of `TensorPrintStream` exposes 3 things:
|
||||
/// - A way to print (stringified) tensor elements.
|
||||
/// - A way to print arbitrary text messages. These are mostly for formatting. This
|
||||
/// should be implemented using varargs which are directly folded into an ostream,
|
||||
/// so that <iomanip> functions can be used.
|
||||
/// - A way to query the max width of any `val` field.
|
||||
///
|
||||
/// @see TensorPrinter for more information.
|
||||
template <typename Stream>
|
||||
concept TensorPrintStream = requires(Stream& stream, std::string_view val) {
|
||||
{ stream.max_width } -> std::convertible_to<size_t>;
|
||||
{ stream.val(val) } -> std::same_as<void>;
|
||||
{ stream.msg() } -> std::same_as<void>;
|
||||
{ stream.msg("msg") } -> std::same_as<void>;
|
||||
{ stream.msg(std::setw(3), std::setfill(4), "msg", val) } -> std::same_as<void>;
|
||||
};
|
||||
|
||||
/// @brief Utility to print tensors.
///
/// This structure implements the main logic for printing tensors to a stream.
/// In order to help with formatting, the `TensorPrinter` abstracts over a custom
/// stream type, see `TensorPrintStream`. This type is actually mostly an internal
/// helper and mainly used by `print_tensor`. It's supposed to be constructed
/// manually, but see the field docs for what is required.
///
/// @tparam DT The data type of the tensor to print.
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to print.
///
/// @see print_tensor
template <DataType DT, size_t RANK>
struct TensorPrinter
{
    /// The name of this tensor. This will be used during printing to add extra
    /// clarity about what the user is seeing.
    std::string_view name;

    /// Configuration details of how to print the tensor. This should be able to
    /// be specified by the user, but the default is good in most cases.
    TensorPrintConfig config;

    /// The lengths of the tensor to print. These values are directly from
    /// `TensorDescriptor::get_lengths()`, stored here to avoid querying them
    /// repeatedly.
    Extent<RANK> lengths;

    /// The strides of the tensor to print. These values are directly from
    /// `TensorDescriptor::get_strides()`, stored here to avoid querying them
    /// repeatedly.
    Extent<RANK> strides;

    /// The tensor's backing buffer. This memory should be host-accessible, for
    /// example by copying it back to the host first.
    const void* h_buffer;

    /// A common stringstream for stringifying tensor values. This is here mostly
    /// so that we can cache the internal allocation.
    std::stringstream ss;

    /// @brief Low-level tensor value stringifying function.
    ///
    /// Print value `value` to the stringstream `ss` (member value). This function
    /// is the actual low-level printing function that prints each element of the
    /// tensor. In order to get a robust printing implementation, the value is written
    /// directly into a stringstream, which is then further processed to be actually
    /// written to the output. This way, the format doesn't depend on the ostream
    /// configuration.
    ///
    /// @param value Pointer to the raw value to print; reinterpreted as the CK
    /// type corresponding to `DT`.
    void stringify_value(const void* value)
    {
        if constexpr(DT == DataType::UNDEFINED_DATA_TYPE)
        {
            // No type info available; print a placeholder instead of bytes.
            ss << "??";
            return;
        }

        using CKType = detail::cpp_type_t<DT>;
        const auto ck_value = *static_cast<const CKType*>(value);

        if constexpr(DT == DataType::I32 || DT == DataType::I8 || DT == DataType::U8)
            ss << ck_value;
        else if constexpr(DT == DataType::FP64 || DT == DataType::FP32)
            ss << std::fixed << std::setprecision(config.float_precision) << ck_value;
        else if constexpr(DT == DataType::FP16 || DT == DataType::BF16 || DT == DataType::FP8 ||
                          DT == DataType::BF8)
            // Small float types are widened to `float` for printing.
            ss << std::fixed
               << std::setprecision(config.float_precision)
               // Note: We are using CK types here (cpp_type_t uses DataTypeToCK), so
               // use CK's type_convert function.
               << ::ck::type_convert<float>(ck_value);
        else
            // TODO: Tuple types? Currently not implemented in DataTypeToCK...
            // NOTE(review): a plain `static_assert(false, ...)` in a discarded
            // `if constexpr` branch is only well-formed since C++23 (P2593);
            // older compilers may require a dependent-false idiom — confirm
            // the minimum supported toolchain.
            static_assert(false, "stringify_value unsupported data type, please implement");
    }

    /// @brief Print the value at an index to a stream.
    ///
    /// This function reads the value at `index` and prints it to `stream` (using
    /// `stream.val(...)`).
    ///
    /// @param stream The stream to print to.
    /// @param index The index in the tensor of the value to print.
    void print_value(TensorPrintStream auto& stream, const Extent<RANK>& index)
    {
        // Translate the multi-dimensional index into a linear byte offset.
        const auto offset = calculate_offset(index, strides);
        const auto* value_ptr =
            &static_cast<const std::byte*>(h_buffer)[offset * data_type_sizeof(DT)];

        // Reset the stream without allocating.
        // ss.str("") allocates...
        ss.clear();
        ss.seekg(0);
        ss.seekp(0);
        stringify_value(value_ptr);
        // ss.view() returns a view of the ENTIRE buffer, which may have
        // lingering data since we used seekp() and seekg() to reset the
        // stream. For some reason std::stringstream works this way...
        // Fortunately tellp() returns how many bytes we've actually
        // written.
        const auto view = ss.view().substr(0, ss.tellp());
        stream.val(view);
    }

    /// @brief Print a 1D row to a stream.
    ///
    /// Print a row of tensor values to the stream. This function is used for both
    /// 1D tensors and for rows of 2D tensors, in which the base coordinate is given
    /// by `index`. Note that the print configuration is taken into account to avoid
    /// flooding the user's terminal with values.
    ///
    /// @param stream The stream to print to.
    /// @param index The index of the row to print. The rightmost index element is
    /// ignored (and overwritten), as that is the index of the value _within_ the row.
    void print_row(TensorPrintStream auto& stream, Extent<RANK>& index)
    {
        // See note in `print_matrix`.
        stream.msg(config.row_prefix);
        limited_foreach(
            lengths[RANK - 1],
            config.col_limit,
            [&](auto i) {
                stream.msg(config.row_field_sep);
                index[RANK - 1] = i;
                print_value(stream, index);
            },
            [&]([[maybe_unused]] auto skip_count) {
                stream.msg(config.row_field_sep);
                // Note: Not using stream.val(...) here because we don't want this
                // field to partake in max_width computation, nor do we want to
                // pad it to the max width.
                stream.msg(config.row_skip_val);
            });

        stream.msg('\n');
    }

    /// @brief Print a 2D matrix to a stream.
    ///
    /// Print a matrix of tensor values to the stream. This function is used for both
    /// 2D and slices of higher-dimensional tensors, in which the base coordinate is
    /// given by `index`. Note that the print configuration is taken into account to
    /// avoid flooding the user's terminal with values.
    ///
    /// @param stream The stream to print to.
    /// @param index The index of the matrix to print. The 2 rightmost index elements
    /// are ignored (and overwritten), as those are the indices of values _within_
    /// the matrix.
    void print_matrix(TensorPrintStream auto& stream, Extent<RANK>& index)
    {
        limited_foreach(
            lengths[RANK - 2],
            config.row_limit,
            [&](auto i) {
                index[RANK - 2] = i;
                print_row(stream, index);
            },
            [&]([[maybe_unused]] auto row_skip_count) {
                // When we encounter a skip row, continue with the same logic
                // as printing 1D tensor rows. Instead of actual values, we will
                // simply print `matrix_row_skip_val` (usually something like "...").
                stream.msg(config.row_prefix);
                limited_foreach(
                    lengths[RANK - 1],
                    config.col_limit,
                    [&]([[maybe_unused]] auto i) {
                        stream.msg(config.row_field_sep);
                        // Note: We're using `stream.val(...)` here because we *do* want this field
                        // to partake in max_width computation, and we *do* want to pad it like
                        // value fields. This is so that these appear the same width as actual
                        // values, so that everything is neatly aligned. This also ensures that if
                        // there are no skip values, then the size of the skip field is not taken
                        // into account.
                        stream.val(config.matrix_row_skip_val);
                    },
                    [&]([[maybe_unused]] auto col_skip_count) {
                        stream.msg(config.row_field_sep);
                        // Note: Not using stream.val(...) here because we don't want this
                        // field to partake in max_width computation, nor do we want to
                        // pad it to the max width.
                        stream.msg(config.row_skip_val);
                    });
                stream.msg('\n');
            });
    }

    /// @brief Print a tensor to a stream.
    ///
    /// This is the main tensor printing function. It calls `print_row` or `print_matrix`
    /// (possibly repeatedly) as required, dispatching on `RANK` at compile time.
    ///
    /// @param stream The stream to print to.
    void print_tensor(TensorPrintStream auto& stream)
    {
        Extent<RANK> zero_coord = {};
        if constexpr(RANK == 0)
        {
            // 0D case: just print the one value
            stream.msg(config.row_prefix);
            stream.msg(config.row_field_sep);
            print_value(stream, zero_coord);
            stream.msg('\n');
        }
        else if constexpr(RANK == 1)
        {
            // 1D case: dump everything on one line
            print_row(stream, zero_coord);
        }
        else if constexpr(RANK == 2)
        {
            // 2D case: print a 2D matrix
            print_matrix(stream, zero_coord);
        }
        else
        {
            // For higher dimensions, print each window as a slice.
            // We want to limit the *total* number of slices using `slice_limit`,
            // not the number in each axis. So flatten the remaining dimensions.
            // This also avoids recursion in this function in general.

            // First get the shape minus the 2 inner dimensions
            Extent<RANK - 2> outer_shape;
            std::copy_n(lengths.begin(), RANK - 2, outer_shape.begin());

            NdIter iter(outer_shape);
            detail::limited_foreach(
                iter.numel(),
                config.slice_limit,
                [&](auto outer_flat_index) {
                    // Now decode the outer index and turn it back into a complete index
                    const auto outer_index = iter(outer_flat_index);
                    Extent<RANK> index = {};
                    std::copy_n(outer_index.begin(), RANK - 2, index.begin());

                    // Print an extra separating line between two slices
                    if(outer_flat_index != 0)
                        stream.msg('\n');

                    // Print an information header about the current slice
                    stream.msg("Tensor \"", name, "\", slice [");
                    for(auto x : outer_index)
                        stream.msg(x, ", ");
                    stream.msg(":, :]\n");

                    // And print it as a matrix
                    print_matrix(stream, index);
                },
                [&](auto skip_count) { stream.msg("\n(skipping ", skip_count, " slices...)\n"); });
        }
    }
};
|
||||
|
||||
/// @brief `TensorPrintStream` implementation that measures the widest value field.
///
/// To produce neatly aligned tensors, where all values of each row appear on
/// the same columns, the maximum printed width of each value field must be
/// known up front. This stream helps with that: it does not print anything,
/// it only records in `max_width` the length of the longest value (not
/// message) it has been asked to print. Values arrive here already
/// stringified (see `TensorPrinter::print_value`), so the width is simply
/// the string length in bytes.
///
/// @see TensorPrintStream
struct MaxFieldWidthStream
{
    // Running maximum over the widths of all `val(...)` calls so far.
    size_t max_width = 0;

    /// @brief "Print" a tensor value to the stream.
    ///
    /// Nothing is emitted; the value's printed width is folded into
    /// `max_width`.
    ///
    /// @param value The stringified value whose width to measure.
    void val(std::string_view value)
    {
        if(value.size() > max_width)
            max_width = value.size();
    }

    /// @brief "Print" a message to the stream.
    ///
    /// Messages never participate in field-width computation, so everything
    /// is discarded.
    ///
    /// @tparam Args the types of the values that would be printed.
    ///
    /// @param args The values to discard.
    template <typename... Args>
    void msg([[maybe_unused]] const Args&... args)
    {
    }
};
|
||||
|
||||
/// @brief Implementation of `TensorPrintStream` which actually prints.
///
/// In contrast to `MaxFieldWidthStream`, this stream actually writes to an
/// ostream, padding each value to the field width that `MaxFieldWidthStream`
/// previously measured so columns line up.
struct OutputStream
{
    /// The underlying output stream all text is written to.
    std::ostream& stream;
    /// The maximum width of each tensor value, used as the pad width.
    size_t max_width;

    /// @brief Print a tensor value to the stream
    ///
    /// Actually print a value into the stream, left-padding it with spaces
    /// to `max_width` (i.e. right-justified) so values align vertically.
    ///
    /// @param value The value to print.
    void val(std::string_view value)
    {
        // `std::setw` takes an `int`; cast explicitly to avoid an implicit
        // narrowing size_t -> int conversion (field widths are tiny in
        // practice, so the cast is safe).
        stream << std::setfill(' ') << std::setw(static_cast<int>(max_width)) << value;
    }

    /// @brief Print a message to the stream.
    ///
    /// This prints a non-value message directly to the ostream, as if
    /// folded via `operator<<`.
    ///
    /// @tparam Args the types of the values to print.
    ///
    /// @param args The values to print.
    template <typename... Args>
    void msg(const Args&... args)
    {
        (stream << ... << args);
    }
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// @brief Print device tensor values to an ostream.
///
/// Print the values of a tensor to an ostream. This function neatly formats
/// the tensor according to `config`, tabulating the values so that they are
/// vertically aligned and skipping values to prevent flooding the terminal.
/// With the default config, this function is good to get a quick overview
/// of what a tensor looks like. For a more complete overview, consider
/// supplying `TensorPrintConfig::unlimited()` to get everything (but beware
/// of flooding the terminal). Tensors are printed with the rightmost dimension
/// as inner dimension; these values appear on the same row in the output.
///
/// @tparam DT The data type of the tensor.
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
///
/// @param name A name for the tensor. This will be used to add some extra identifying
/// information during printing.
/// @param desc The descriptor for the tensor memory layout.
/// @param d_buffer The tensor's actual data buffer. This is expected to be
/// _device accessible_ memory, as it's copied back to the host first.
/// @param config Tensor printing configuration. This allows tweaking some details
/// of the printing process.
/// @param out The ostream to print to, `std::cout` by default.
template <DataType DT, size_t RANK>
void print_tensor(std::string_view name,
                  const TensorDescriptor<DT, RANK>& desc,
                  const void* d_buffer,
                  TensorPrintConfig config = {},
                  std::ostream& out = std::cout)
{
    // Copy memory to the host (printing from device is sketchy).
    // The full element *space* is copied so strided layouts stay addressable.
    const auto space = desc.get_element_space_size_in_bytes();
    std::vector<std::byte> h_buffer(space);
    check_hip(hipMemcpy(h_buffer.data(), d_buffer, space, hipMemcpyDeviceToHost));

    // Create a custom stream with a completely fresh configuration (locale,
    // precision, fill, etc.) so the caller's stream state is untouched. An
    // osyncstream buffers the output while we're at it (it's not likely to
    // help a lot, but why not).
    std::osyncstream stream(out.rdbuf());
    stream.imbue(std::locale(std::locale(), new detail::numpunct{}));

    // Print a header for the entire tensor (regardless of if there are multiple slices).
    stream << "Tensor \"" << name << "\": shape = " << desc.get_lengths() << "\n";

    detail::TensorPrinter<DT, RANK> printer = {
        .name = name,
        .config = config,
        .lengths = desc.get_lengths(),
        .strides = desc.get_strides(),
        .h_buffer = h_buffer.data(),
        .ss = std::stringstream(),
    };

    // We're actually going to print twice: once to figure out the
    // maximum width of the fields, and once to actually print to the stream.

    // Pass 1: discard output, only measure the maximum field width.
    detail::MaxFieldWidthStream max_field_width;
    printer.print_tensor(max_field_width);

    // Pass 2: actually print to the output stream, padded to that width.
    detail::OutputStream tensor_out = {
        .stream = stream,
        .max_width = max_field_width.max_width,
    };
    printer.print_tensor(tensor_out);
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <iosfwd>
|
||||
#include <concepts>
|
||||
#include <algorithm>
|
||||
#include <hip/hip_runtime.h>
|
||||
@@ -123,6 +124,33 @@ struct Extent : std::array<size_t, RANK>
|
||||
template <typename... T>
|
||||
Extent(T...) -> Extent<sizeof...(T)>;
|
||||
|
||||
/// @brief Extent printer
|
||||
///
|
||||
/// This function implements an ostream printing overload for `Extent`, so that
|
||||
/// they can be printed in the usual `stream << extent` fashion.
|
||||
///
|
||||
/// @tparam RANK Rank (number of spatial dimensions) of the extent.
|
||||
///
|
||||
/// @param stream The stream to print the extent to.
|
||||
/// @param extent The extent to print to the stream.
|
||||
template <size_t RANK>
|
||||
std::ostream& operator<<(std::ostream& stream, const Extent<RANK>& extent)
|
||||
{
|
||||
stream << '[';
|
||||
bool first = true;
|
||||
for(const auto x : extent)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
stream << ", ";
|
||||
|
||||
stream << x;
|
||||
}
|
||||
|
||||
return stream << ']';
|
||||
}
|
||||
|
||||
/// @brief Concept for automatically deriving tensor memory layout.
|
||||
///
|
||||
/// A `TensorStridesGenerator` is a type which can be used to automatically
|
||||
|
||||
@@ -18,6 +18,102 @@
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
/// @brief Utility structure for N-dimensional iteration using a flat index
///
/// This structure's main purpose is to "unmerge" a flattened index into a
/// multi-dimensional index, which helps when iterating over multi-dimensional
/// indices without having to write an arbitrary amount of nested for loops.
/// A minimal amount of precomputation must be done to do this efficiently,
/// which is handled in the constructor of this type.
///
/// @details Decoding a flat index into a multi-dimensional index is done by
/// first computing a reverse scan of the shape. These values can then be
/// used to decode the index in the usual way:
///
///     x = flat_idx / (size_y * size_z)
///     y = flat_idx % (size_y * size_z) / size_z
///     z = flat_idx % (size_y * size_z) % size_z
///     etc
///
/// The decode order is such that the innermost dimension (right in
/// the shape extent) changes the fastest.
///
/// @tparam RANK The rank (number of spatial dimensions) of the tensor to
/// iterate.
template <size_t RANK>
struct NdIter
{
    /// @brief Prepare N-dimensional iteration over a particular shape.
    ///
    /// Precompute the shape into a form that can be used to easily decode a
    /// flat index into a multi-dimensional index.
    ///
    /// @param shape The shape to iterate over.
    explicit NdIter(const Extent<RANK>& shape)
    {
        // Precompute shape_scan = [..., shape[-2] * shape[-1], shape[-1], 1]
        // by scanning right-to-left; the element count falls out of the same
        // product.

        numel_ = 1;
        for(int i = RANK; i > 0; --i)
        {
            shape_scan_[i - 1] = numel_;
            numel_ *= shape[i - 1];
        }
    }

    /// @brief Unflatten a flat index into a multi-dimensional index
    ///
    /// This applies the usual multi-dimensional indexing method over the
    /// precomputed shape scan to get back a multi-dimensional index.
    /// The decode order is such that the innermost dimension (right in
    /// the shape extent) changes the fastest.
    ///
    /// @param flat_index The "flattened" (1-dimensional) index of the tensor
    ///
    /// @returns A multi-dimensional index into the tensor
    ///
    /// @pre `0 <= flat_index < numel()` (in other words, the `flat_index` must
    /// be in bounds of the tensor shape that this `NdIter` was made from).
    __host__ __device__ Extent<RANK> operator()(size_t flat_index) const
    {
        Extent<RANK> index = {};
        auto idx = flat_index;
        for(size_t i = 0; i < RANK; ++i)
        {
            // Quotient selects this dimension's coordinate, the remainder is
            // decoded by the remaining (inner) dimensions.
            const auto scanned_dim = shape_scan_[i];
            index[i] = idx / scanned_dim;
            idx %= scanned_dim;
        }

        return index;
    }

    /// @brief Return the total elements to iterate over
    ///
    /// Get the total number of elements in the shape to iterate over. This value
    /// can be used to construct a complete for loop to iterate over all indices
    /// of a tensor, for example:
    ///
    ///     for(size_t i = 0; i < iter.numel(); ++i)
    ///     {
    ///         const auto index = iter(i);
    ///         use(index);
    ///     }
    __host__ __device__ size_t numel() const { return numel_; }

    private:
    /// Reverse (right) scan of the shape to iterate over.
    Extent<RANK> shape_scan_;

    /// The total number of elements in the shape. This value turns out to be almost
    /// always required when iterating over a shape, so just store it in this type
    /// so that it is easily accessible.
    size_t numel_;
};
|
||||
|
||||
template <size_t RANK>
|
||||
NdIter(Extent<RANK>) -> NdIter<RANK>;
|
||||
|
||||
/// @brief Concept for constraining tensor iteration functors.
|
||||
///
|
||||
/// This concept checks that a functor has the correct signature for
|
||||
@@ -50,28 +146,19 @@ constexpr int DEVICE_FOREACH_BLOCK_SIZE = 256;
|
||||
/// @tparam F The type of the callback to invoke. This function must be
|
||||
/// compatible with execution as a __device__ function.
|
||||
///
|
||||
/// @param numel The total number of elements in the tensor.
|
||||
/// @param shape_scan A right-exclusive scan of the shape of the tensor.
|
||||
/// @param iter An NdIter instance to help iterating over the tensor.
|
||||
/// @param f The callback to invoke for each index of the tensor. This
|
||||
/// functor must be eligible for running on the GPU.
|
||||
template <int BLOCK_SIZE, size_t RANK, typename F>
|
||||
requires ForeachFunctor<F, RANK>
|
||||
__global__ __launch_bounds__(BLOCK_SIZE) //
|
||||
void foreach_kernel(const size_t numel, Extent<RANK> shape_scan, F f)
|
||||
void foreach_kernel(NdIter<RANK> iter, F f)
|
||||
{
|
||||
const auto gid = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
for(size_t flat_idx = gid; flat_idx < numel; flat_idx += gridDim.x * BLOCK_SIZE)
|
||||
for(size_t flat_idx = gid; flat_idx < iter.numel(); flat_idx += gridDim.x * BLOCK_SIZE)
|
||||
{
|
||||
// Compute the current index.
|
||||
Extent<RANK> index = {};
|
||||
|
||||
size_t idx = flat_idx;
|
||||
for(size_t i = 0; i < RANK; ++i)
|
||||
{
|
||||
const auto scanned_dim = shape_scan[i];
|
||||
index[i] = idx / scanned_dim;
|
||||
idx %= scanned_dim;
|
||||
}
|
||||
const auto index = iter(flat_idx);
|
||||
|
||||
// Then invoke the callback with the index.
|
||||
f(index);
|
||||
@@ -160,18 +247,12 @@ void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
|
||||
// order in the kernel is from large-to-small. Right layout is the
|
||||
// easiest solution for that.
|
||||
|
||||
Extent<RANK> shape_scan;
|
||||
size_t numel = 1;
|
||||
for(int i = RANK; i > 0; --i)
|
||||
{
|
||||
shape_scan[i - 1] = numel;
|
||||
numel *= shape[i - 1];
|
||||
}
|
||||
NdIter iter(shape);
|
||||
|
||||
// Reset any errors from previous launches.
|
||||
(void)hipGetLastError();
|
||||
|
||||
kernel<<<occupancy * multiprocessors, block_size>>>(numel, shape_scan, f);
|
||||
kernel<<<occupancy * multiprocessors, block_size>>>(iter, f);
|
||||
check_hip(hipGetLastError());
|
||||
}
|
||||
|
||||
@@ -179,7 +260,7 @@ void tensor_foreach(const Extent<RANK>& shape, ForeachFunctor<RANK> auto f)
|
||||
///
|
||||
/// This concept checks that a functor has the correct signature for
|
||||
/// use with the `fill_tensor` function.
|
||||
template <typename F, builder::DataType DT, size_t RANK>
|
||||
template <typename F, DataType DT, size_t RANK>
|
||||
concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
|
||||
{ f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
|
||||
};
|
||||
@@ -199,7 +280,7 @@ concept FillTensorFunctor = requires(const F& f, const Extent<RANK>& index) {
|
||||
/// @param f A functor used to get the value at a particular coordinate.
|
||||
///
|
||||
/// @see FillTensorFunctor
|
||||
template <builder::DataType DT, size_t RANK>
|
||||
template <DataType DT, size_t RANK>
|
||||
void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
|
||||
void* buffer,
|
||||
FillTensorFunctor<DT, RANK> auto f)
|
||||
@@ -218,7 +299,7 @@ void fill_tensor(const TensorDescriptor<DT, RANK>& desc,
|
||||
///
|
||||
/// This concept checks that a functor has the correct signature for
|
||||
/// use with the `fill_tensor_buffer` function.
|
||||
template <typename F, builder::DataType DT>
|
||||
template <typename F, DataType DT>
|
||||
concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
|
||||
{ f(index) } -> std::convertible_to<detail::cpp_type_t<DT>>;
|
||||
};
|
||||
@@ -239,7 +320,7 @@ concept FillTensorBufferFunctor = requires(const F& f, size_t index) {
|
||||
/// @param f A functor used to get the value at a particular index.
|
||||
///
|
||||
/// @see FillTensorBufferFunctor
|
||||
template <builder::DataType DT, size_t RANK>
|
||||
template <DataType DT, size_t RANK>
|
||||
void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
|
||||
void* buffer,
|
||||
FillTensorBufferFunctor<DT> auto f)
|
||||
@@ -247,7 +328,19 @@ void fill_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
|
||||
fill_tensor(desc.get_space_descriptor(), buffer, [f](auto index) { return f(index[0]); });
|
||||
}
|
||||
|
||||
template <builder::DataType DT, size_t RANK>
|
||||
/// @brief Utility for clearing tensor buffers to a particular value.
|
||||
///
|
||||
/// This function initializes all memory backing a particular tensor buffer to
|
||||
/// one specific value, zero by default. Note that this function ignores strides,
|
||||
/// and clears the entire buffer backing the tensor.
|
||||
///
|
||||
/// @tparam DT The tensor element datatype
|
||||
/// @tparam RANK The rank (number of spatial dimensions) of the tensor.
|
||||
///
|
||||
/// @param desc The descriptor of the tensor to initialize.
|
||||
/// @param buffer The memory of the tensor to initialize.
|
||||
/// @param value The value to initialize the tensor buffer with.
|
||||
template <DataType DT, size_t RANK>
|
||||
void clear_tensor_buffer(const TensorDescriptor<DT, RANK>& desc,
|
||||
void* buffer,
|
||||
detail::cpp_type_t<DT> value = detail::cpp_type_t<DT>{0})
|
||||
|
||||
@@ -39,7 +39,7 @@ constexpr size_t data_type_sizeof(DataType data_type)
|
||||
case DataType::FP8: return 1;
|
||||
case DataType::BF8: return 1;
|
||||
case DataType::FP64: return 8;
|
||||
case DataType::INT32: return 4;
|
||||
case DataType::I32: return 4;
|
||||
case DataType::I8: return 1;
|
||||
case DataType::I8_I8: return 2;
|
||||
case DataType::U8: return 1;
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#include "ck_tile/builder/testing/tensor_buffer.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_foreach.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/type_convert.hpp"
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
@@ -24,7 +24,7 @@ enum class DataType
|
||||
FP8,
|
||||
BF8,
|
||||
FP64,
|
||||
INT32,
|
||||
I32,
|
||||
I8,
|
||||
I8_I8,
|
||||
U8
|
||||
@@ -252,8 +252,8 @@ enum class ConvAlgorithmSpecialization
|
||||
REFERENCE // GPU reference implementation for validation
|
||||
};
|
||||
|
||||
// toString methods for enum classes
|
||||
inline std::string_view toString(DataType dt)
|
||||
// to_string methods for enum classes
|
||||
inline std::string_view to_string(DataType dt)
|
||||
{
|
||||
using enum DataType;
|
||||
switch(dt)
|
||||
@@ -267,7 +267,7 @@ inline std::string_view toString(DataType dt)
|
||||
case FP8: return "FP8";
|
||||
case BF8: return "BF8";
|
||||
case FP64: return "FP64";
|
||||
case INT32: return "INT32";
|
||||
case I32: return "I32";
|
||||
case I8: return "I8";
|
||||
case I8_I8: return "I8_I8";
|
||||
case U8: return "U8";
|
||||
@@ -276,7 +276,7 @@ inline std::string_view toString(DataType dt)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(ConvDirection dir)
|
||||
inline std::string_view to_string(ConvDirection dir)
|
||||
{
|
||||
using enum ConvDirection;
|
||||
switch(dir)
|
||||
@@ -288,7 +288,7 @@ inline std::string_view toString(ConvDirection dir)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(ElementwiseOperation op)
|
||||
inline std::string_view to_string(ElementwiseOperation op)
|
||||
{
|
||||
using enum ElementwiseOperation;
|
||||
switch(op)
|
||||
@@ -332,7 +332,7 @@ inline std::string_view toString(ElementwiseOperation op)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(PipelineVersion ver)
|
||||
inline std::string_view to_string(PipelineVersion ver)
|
||||
{
|
||||
using enum PipelineVersion;
|
||||
switch(ver)
|
||||
@@ -347,7 +347,7 @@ inline std::string_view toString(PipelineVersion ver)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(GemmSpecialization spec)
|
||||
inline std::string_view to_string(GemmSpecialization spec)
|
||||
{
|
||||
using enum GemmSpecialization;
|
||||
switch(spec)
|
||||
@@ -372,7 +372,7 @@ inline std::string_view toString(GemmSpecialization spec)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(ConvFwdSpecialization spec)
|
||||
inline std::string_view to_string(ConvFwdSpecialization spec)
|
||||
{
|
||||
using enum ConvFwdSpecialization;
|
||||
switch(spec)
|
||||
@@ -386,7 +386,7 @@ inline std::string_view toString(ConvFwdSpecialization spec)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(ConvBwdDataSpecialization spec)
|
||||
inline std::string_view to_string(ConvBwdDataSpecialization spec)
|
||||
{
|
||||
using enum ConvBwdDataSpecialization;
|
||||
switch(spec)
|
||||
@@ -397,7 +397,7 @@ inline std::string_view toString(ConvBwdDataSpecialization spec)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(ConvBwdWeightSpecialization spec)
|
||||
inline std::string_view to_string(ConvBwdWeightSpecialization spec)
|
||||
{
|
||||
using enum ConvBwdWeightSpecialization;
|
||||
switch(spec)
|
||||
@@ -410,7 +410,7 @@ inline std::string_view toString(ConvBwdWeightSpecialization spec)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(GemmPadding padding)
|
||||
inline std::string_view to_string(GemmPadding padding)
|
||||
{
|
||||
using enum GemmPadding;
|
||||
switch(padding)
|
||||
@@ -435,7 +435,7 @@ inline std::string_view toString(GemmPadding padding)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(PipelineScheduler sched)
|
||||
inline std::string_view to_string(PipelineScheduler sched)
|
||||
{
|
||||
using enum PipelineScheduler;
|
||||
switch(sched)
|
||||
@@ -447,7 +447,7 @@ inline std::string_view toString(PipelineScheduler sched)
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string_view toString(TensorLayout layout)
|
||||
inline std::string_view to_string(TensorLayout layout)
|
||||
{
|
||||
using enum TensorLayout;
|
||||
switch(layout)
|
||||
@@ -503,53 +503,56 @@ inline std::string_view toString(TensorLayout layout)
|
||||
}
|
||||
|
||||
// ostream operator overloads for enum classes
|
||||
inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << toString(dt); }
|
||||
inline std::ostream& operator<<(std::ostream& os, DataType dt) { return os << to_string(dt); }
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, ConvDirection dir) { return os << toString(dir); }
|
||||
inline std::ostream& operator<<(std::ostream& os, ConvDirection dir)
|
||||
{
|
||||
return os << to_string(dir);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, ElementwiseOperation op)
|
||||
{
|
||||
return os << toString(op);
|
||||
return os << to_string(op);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, PipelineVersion ver)
|
||||
{
|
||||
return os << toString(ver);
|
||||
return os << to_string(ver);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, GemmSpecialization spec)
|
||||
{
|
||||
return os << toString(spec);
|
||||
return os << to_string(spec);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, ConvFwdSpecialization spec)
|
||||
{
|
||||
return os << toString(spec);
|
||||
return os << to_string(spec);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, ConvBwdDataSpecialization spec)
|
||||
{
|
||||
return os << toString(spec);
|
||||
return os << to_string(spec);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, ConvBwdWeightSpecialization spec)
|
||||
{
|
||||
return os << toString(spec);
|
||||
return os << to_string(spec);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, GemmPadding padding)
|
||||
{
|
||||
return os << toString(padding);
|
||||
return os << to_string(padding);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, PipelineScheduler sched)
|
||||
{
|
||||
return os << toString(sched);
|
||||
return os << to_string(sched);
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, TensorLayout layout)
|
||||
{
|
||||
return os << toString(layout);
|
||||
return os << to_string(layout);
|
||||
}
|
||||
|
||||
// ostream operator overload for std::variant of convolution specializations
|
||||
|
||||
@@ -83,6 +83,7 @@ add_ck_builder_test(test_ckb_conv_builder
|
||||
unit_tensor_foreach.cpp
|
||||
unit_error.cpp
|
||||
unit_validation.cpp
|
||||
unit_debug.cpp
|
||||
unit_conv_elementwise_op.cpp
|
||||
unit_conv_tensor_layout.cpp
|
||||
unit_conv_tensor_type.cpp
|
||||
|
||||
@@ -22,7 +22,7 @@ TEST(FwdConvInstances,
|
||||
constexpr ConvSignature FwdConvSignature{.spatial_dim = 1,
|
||||
.direction = FORWARD,
|
||||
.data_type = I8,
|
||||
.accumulation_data_type = INT32,
|
||||
.accumulation_data_type = I32,
|
||||
.input = {.config = {.layout = GNWC}},
|
||||
.weight = {.config = {.layout = GKXC}},
|
||||
.output = {.config = {.layout = GNWK}}};
|
||||
|
||||
@@ -27,7 +27,7 @@ TEST(ConvTensorType, Exhaustive)
|
||||
case FP32: EXPECT_TRUE((check_same<FP32, float>)); break;
|
||||
case FP16: EXPECT_TRUE((check_same<FP16, ck::half_t>)); break;
|
||||
case BF16: EXPECT_TRUE((check_same<BF16, ck::bhalf_t>)); break;
|
||||
case INT32: EXPECT_TRUE((check_same<INT32, uint32_t>)); break;
|
||||
case I32: EXPECT_TRUE((check_same<I32, uint32_t>)); break;
|
||||
case FP8: EXPECT_TRUE((check_same<FP8, ck::f8_t>)); break;
|
||||
case I8: EXPECT_TRUE((check_same<I8, int8_t>)); break;
|
||||
case U8: EXPECT_TRUE((check_same<U8, uint8_t>)); break;
|
||||
|
||||
464
experimental/builder/test/unit_debug.cpp
Normal file
464
experimental/builder/test/unit_debug.cpp
Normal file
@@ -0,0 +1,464 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_foreach.hpp"
|
||||
#include "ck_tile/builder/testing/debug.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ck_tile::test::StringEqWithDiff;
|
||||
using ::testing::ElementsAreArray;
|
||||
using ::testing::Eq;
|
||||
using ::testing::Gt;
|
||||
|
||||
TEST(Debug, PrintDescriptor)
|
||||
{
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{10, 11, 12}, ckt::PackedRightLayout{});
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_descriptor("test", desc, ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Descriptor \"test\":\n"
|
||||
" data type: I32\n"
|
||||
" size: 1'320 elements\n"
|
||||
" space: 1'320 elements (5'280 bytes)\n"
|
||||
" lengths: [10, 11, 12]\n"
|
||||
" strides: [132, 12, 1]\n"
|
||||
" packed: yes\n"));
|
||||
|
||||
// Make sure that the stream locale does not leak.
|
||||
ss.str("");
|
||||
ss << 1000;
|
||||
EXPECT_THAT(ss.str(), StringEqWithDiff("1000"));
|
||||
}
|
||||
|
||||
TEST(Debug, LimitedForeach)
|
||||
{
|
||||
{
|
||||
std::vector<size_t> values;
|
||||
size_t delim_count = 0;
|
||||
ckt::detail::limited_foreach(
|
||||
10,
|
||||
2,
|
||||
[&](auto i) { values.push_back(i); },
|
||||
[&](auto skip_count) {
|
||||
++delim_count;
|
||||
EXPECT_THAT(skip_count, Eq(10 - 2));
|
||||
});
|
||||
EXPECT_THAT(values, ElementsAreArray({0, 9}));
|
||||
EXPECT_THAT(delim_count, Eq(1));
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<size_t> values;
|
||||
size_t delim_count = 0;
|
||||
ckt::detail::limited_foreach(
|
||||
100,
|
||||
9,
|
||||
[&](auto i) { values.push_back(i); },
|
||||
[&](auto skip_count) {
|
||||
++delim_count;
|
||||
EXPECT_THAT(skip_count, Eq(100 - 9));
|
||||
});
|
||||
EXPECT_THAT(values, ElementsAreArray({0, 1, 2, 3, 4, 96, 97, 98, 99}));
|
||||
EXPECT_THAT(delim_count, Eq(1));
|
||||
}
|
||||
|
||||
{
|
||||
size_t call_count = 0;
|
||||
size_t delim_count = 0;
|
||||
ckt::detail::limited_foreach(
|
||||
50,
|
||||
100,
|
||||
[&](auto i) {
|
||||
EXPECT_THAT(i, Eq(call_count));
|
||||
++call_count;
|
||||
},
|
||||
[&]([[maybe_unused]] auto skip_count) { ++delim_count; });
|
||||
EXPECT_THAT(call_count, Eq(50));
|
||||
EXPECT_THAT(delim_count, Eq(0));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensor0D)
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), []([[maybe_unused]] size_t i) { return 123; });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("0D", desc, a.get(), {}, ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"0D\": shape = []\n"
|
||||
" 123\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensor1D)
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{44}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i % 7; });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("1D", desc, a.get(), {}, ss);
|
||||
|
||||
// Note: output does not involve the size of the matrix separator fields,
|
||||
// since these are not printed.
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"1D\": shape = [44]\n"
|
||||
" 0 1 2 3 4 ... 4 5 6 0 1\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensor4D)
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{100, 110, 120, 130},
|
||||
ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i; });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("4D",
|
||||
desc,
|
||||
a.get(),
|
||||
{
|
||||
// Reduce default limits to have smaller output here.
|
||||
// That also tests that we can configure these (to some
|
||||
// extent).
|
||||
.col_limit = 4,
|
||||
.row_limit = 4,
|
||||
.slice_limit = 4,
|
||||
},
|
||||
ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"4D\": shape = [100, 110, 120, 130]\n"
|
||||
"Tensor \"4D\", slice [0, 0, :, :]\n"
|
||||
" 0 1 ... 128 129\n"
|
||||
" 130 131 ... 258 259\n"
|
||||
" ... ... ... ... ...\n"
|
||||
" 15340 15341 ... 15468 15469\n"
|
||||
" 15470 15471 ... 15598 15599\n"
|
||||
"\n"
|
||||
"Tensor \"4D\", slice [0, 1, :, :]\n"
|
||||
" 15600 15601 ... 15728 15729\n"
|
||||
" 15730 15731 ... 15858 15859\n"
|
||||
" ... ... ... ... ...\n"
|
||||
" 30940 30941 ... 31068 31069\n"
|
||||
" 31070 31071 ... 31198 31199\n"
|
||||
"\n"
|
||||
"(skipping 10'996 slices...)\n"
|
||||
"\n"
|
||||
"Tensor \"4D\", slice [99, 108, :, :]\n"
|
||||
" 171568800 171568801 ... 171568928 171568929\n"
|
||||
" 171568930 171568931 ... 171569058 171569059\n"
|
||||
" ... ... ... ... ...\n"
|
||||
" 171584140 171584141 ... 171584268 171584269\n"
|
||||
" 171584270 171584271 ... 171584398 171584399\n"
|
||||
"\n"
|
||||
"Tensor \"4D\", slice [99, 109, :, :]\n"
|
||||
" 171584400 171584401 ... 171584528 171584529\n"
|
||||
" 171584530 171584531 ... 171584658 171584659\n"
|
||||
" ... ... ... ... ...\n"
|
||||
" 171599740 171599741 ... 171599868 171599869\n"
|
||||
" 171599870 171599871 ... 171599998 171599999\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorCustomConfig)
|
||||
{
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{10, 10, 10}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i * 101 % 77; });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("CustomConfig",
|
||||
desc,
|
||||
a.get(),
|
||||
{
|
||||
// Reduce default limits to have smaller output here.
|
||||
// That also tests that we can configure these.
|
||||
.col_limit = 4,
|
||||
.row_limit = 2,
|
||||
.slice_limit = 6,
|
||||
// Try with different sizes to make sure that the alignment
|
||||
// is still correct after changing these.
|
||||
.row_prefix = ">>>>",
|
||||
.row_field_sep = "|||||",
|
||||
.row_skip_val = "-------",
|
||||
.matrix_row_skip_val = "&&&&&&&&",
|
||||
},
|
||||
ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"CustomConfig\": shape = [10, 10, 10]\n"
|
||||
"Tensor \"CustomConfig\", slice [0, :, :]\n"
|
||||
">>>>||||| 0||||| 24|||||-------||||| 38||||| 62\n"
|
||||
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
|
||||
">>>>||||| 4||||| 28|||||-------||||| 42||||| 66\n"
|
||||
"\n"
|
||||
"Tensor \"CustomConfig\", slice [1, :, :]\n"
|
||||
">>>>||||| 13||||| 37|||||-------||||| 51||||| 75\n"
|
||||
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
|
||||
">>>>||||| 17||||| 41|||||-------||||| 55||||| 2\n"
|
||||
"\n"
|
||||
"Tensor \"CustomConfig\", slice [2, :, :]\n"
|
||||
">>>>||||| 26||||| 50|||||-------||||| 64||||| 11\n"
|
||||
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
|
||||
">>>>||||| 30||||| 54|||||-------||||| 68||||| 15\n"
|
||||
"\n"
|
||||
"(skipping 4 slices...)\n"
|
||||
"\n"
|
||||
"Tensor \"CustomConfig\", slice [7, :, :]\n"
|
||||
">>>>||||| 14||||| 38|||||-------||||| 52||||| 76\n"
|
||||
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
|
||||
">>>>||||| 18||||| 42|||||-------||||| 56||||| 3\n"
|
||||
"\n"
|
||||
"Tensor \"CustomConfig\", slice [8, :, :]\n"
|
||||
">>>>||||| 27||||| 51|||||-------||||| 65||||| 12\n"
|
||||
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
|
||||
">>>>||||| 31||||| 55|||||-------||||| 69||||| 16\n"
|
||||
"\n"
|
||||
"Tensor \"CustomConfig\", slice [9, :, :]\n"
|
||||
">>>>||||| 40||||| 64|||||-------||||| 1||||| 25\n"
|
||||
">>>>|||||&&&&&&&&|||||&&&&&&&&|||||-------|||||&&&&&&&&|||||&&&&&&&&\n"
|
||||
">>>>||||| 44||||| 68|||||-------||||| 5||||| 29\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorUnlimitedMatrix)
|
||||
{
|
||||
// To limit the output of the test, split the "unlimited" test up into one for the
|
||||
// matrices and one for the slices.
|
||||
|
||||
const ckt::Extent shape = ckt::Extent{12, 12};
|
||||
const ckt::TensorPrintConfig default_config;
|
||||
|
||||
// The shape should be larger than the default, otherwise this test doesn't make
|
||||
// any sense.
|
||||
ASSERT_THAT(shape[1], Gt(default_config.col_limit));
|
||||
ASSERT_THAT(shape[2], Gt(default_config.row_limit));
|
||||
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i ^ 0xF; });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("UnlimitedConfig", desc, a.get(), ckt::TensorPrintConfig::unlimited(), ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"UnlimitedConfig\": shape = [12, 12]\n"
|
||||
" 15 14 13 12 11 10 9 8 7 6 5 4\n"
|
||||
" 3 2 1 0 31 30 29 28 27 26 25 24\n"
|
||||
" 23 22 21 20 19 18 17 16 47 46 45 44\n"
|
||||
" 43 42 41 40 39 38 37 36 35 34 33 32\n"
|
||||
" 63 62 61 60 59 58 57 56 55 54 53 52\n"
|
||||
" 51 50 49 48 79 78 77 76 75 74 73 72\n"
|
||||
" 71 70 69 68 67 66 65 64 95 94 93 92\n"
|
||||
" 91 90 89 88 87 86 85 84 83 82 81 80\n"
|
||||
" 111 110 109 108 107 106 105 104 103 102 101 100\n"
|
||||
" 99 98 97 96 127 126 125 124 123 122 121 120\n"
|
||||
" 119 118 117 116 115 114 113 112 143 142 141 140\n"
|
||||
" 139 138 137 136 135 134 133 132 131 130 129 128\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorUnlimitedSlices)
|
||||
{
|
||||
// To limit the output of the test, split the "unlimited" test up into one for the
|
||||
// matrices and one for the slices.
|
||||
|
||||
const ckt::Extent shape = ckt::Extent{13, 1, 1};
|
||||
const ckt::TensorPrintConfig default_config;
|
||||
|
||||
// The shape should be larger than the default, otherwise this test doesn't make
|
||||
// any sense.
|
||||
ASSERT_THAT(shape[0], Gt(default_config.slice_limit));
|
||||
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return i * 3; });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("UnlimitedConfig", desc, a.get(), ckt::TensorPrintConfig::unlimited(), ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"UnlimitedConfig\": shape = [13, 1, 1]\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [0, :, :]\n"
|
||||
" 0\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [1, :, :]\n"
|
||||
" 3\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [2, :, :]\n"
|
||||
" 6\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [3, :, :]\n"
|
||||
" 9\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [4, :, :]\n"
|
||||
" 12\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [5, :, :]\n"
|
||||
" 15\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [6, :, :]\n"
|
||||
" 18\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [7, :, :]\n"
|
||||
" 21\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [8, :, :]\n"
|
||||
" 24\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [9, :, :]\n"
|
||||
" 27\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [10, :, :]\n"
|
||||
" 30\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [11, :, :]\n"
|
||||
" 33\n"
|
||||
"\n"
|
||||
"Tensor \"UnlimitedConfig\", slice [12, :, :]\n"
|
||||
" 36\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorFP32)
|
||||
{
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return std::pow(1.9999, i); });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("FP32", desc, a.get(), {}, ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"FP32\": shape = [5, 5]\n"
|
||||
" 1.000 2.000 4.000 7.999 15.997\n"
|
||||
" 31.992 63.981 127.955 255.898 511.770\n"
|
||||
" 1023.488 2046.874 4093.543 8186.677 16372.535\n"
|
||||
" 32743.432 65483.590 130960.633 261908.172 523790.156\n"
|
||||
" 1047527.938 2094951.125 4189692.750 8378966.500 16757095.000\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorBF16)
|
||||
{
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::BF16>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(
|
||||
desc, a.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(1.2345678f * i); });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("BF16", desc, a.get(), {}, ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"BF16\": shape = [5, 5]\n"
|
||||
" 0.000 1.234 2.469 3.703 4.938\n"
|
||||
" 6.188 7.406 8.625 9.875 11.125\n"
|
||||
" 12.375 13.562 14.812 16.000 17.250\n"
|
||||
" 18.500 19.750 21.000 22.250 23.500\n"
|
||||
" 24.750 25.875 27.125 28.375 29.625\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorFP8)
|
||||
{
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::FP8>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(
|
||||
desc, a.get(), [](size_t i) { return ck::type_convert<ck::f8_t>(i * 0.1f); });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("FP8", desc, a.get(), {}, ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"FP8\": shape = [5, 5]\n"
|
||||
" 0.000 0.102 0.203 0.312 0.406\n"
|
||||
" 0.500 0.625 0.688 0.812 0.875\n"
|
||||
" 1.000 1.125 1.250 1.250 1.375\n"
|
||||
" 1.500 1.625 1.750 1.750 1.875\n"
|
||||
" 2.000 2.000 2.250 2.250 2.500\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorSpecialFloats)
|
||||
{
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{5, 5}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) {
|
||||
if(i % 8 == 1)
|
||||
return 0.f / 0.f;
|
||||
else if(i % 7 == 1)
|
||||
return std::sqrt(-1.f);
|
||||
else if(i % 6 == 1)
|
||||
return 1.f / 0.f;
|
||||
else if(i % 5 == 1)
|
||||
return -1.f / 0.f;
|
||||
else
|
||||
return static_cast<float>(i);
|
||||
});
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("specials", desc, a.get(), {}, ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"specials\": shape = [5, 5]\n"
|
||||
" 0.000 nan 2.000 3.000 4.000\n"
|
||||
" 5.000 -inf inf -nan nan\n"
|
||||
" 10.000 -inf 12.000 inf 14.000\n"
|
||||
" -nan -inf nan 18.000 inf\n"
|
||||
" 20.000 -inf -nan 23.000 24.000\n"));
|
||||
}
|
||||
|
||||
TEST(Debug, PrintTensorFloatPrecision)
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::FP32>(ckt::Extent{5}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { return std::pow(0.9, i); });
|
||||
|
||||
std::stringstream ss;
|
||||
ckt::print_tensor("FloatPrecision",
|
||||
desc,
|
||||
a.get(),
|
||||
{
|
||||
.float_precision = 10,
|
||||
},
|
||||
ss);
|
||||
|
||||
EXPECT_THAT(ss.str(),
|
||||
StringEqWithDiff( //
|
||||
"Tensor \"FloatPrecision\": shape = [5]\n"
|
||||
" 1.0000000000 0.8999999762 0.8100000024 0.7289999723 0.6560999751\n"));
|
||||
}
|
||||
@@ -6,11 +6,13 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <array>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
|
||||
using ck_tile::test::StringEqWithDiff;
|
||||
using ::testing::ElementsAreArray;
|
||||
using ::testing::Eq;
|
||||
using ::testing::Throws;
|
||||
@@ -76,7 +78,7 @@ TEST(TensorDescriptor, MakeDescriptor)
|
||||
|
||||
// Note: automatic inference of RANK.
|
||||
const auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::INT32>(lengths, ckt::PackedRightLayout{});
|
||||
ckt::make_descriptor<ckb::DataType::I32>(lengths, ckt::PackedRightLayout{});
|
||||
|
||||
EXPECT_THAT(desc.get_lengths(), ElementsAreArray(lengths));
|
||||
EXPECT_THAT(desc.get_strides(),
|
||||
@@ -173,7 +175,7 @@ TEST(TensorDescriptor, ExtentFromVector)
|
||||
|
||||
TEST(TensorDescriptor, IsPacked)
|
||||
{
|
||||
constexpr auto dt = ckb::DataType::INT32; // Irrelevant for this test
|
||||
constexpr auto dt = ckb::DataType::I32; // Irrelevant for this test
|
||||
EXPECT_TRUE(
|
||||
ckt::make_descriptor<dt>(ckt::Extent{101, 43, 25, 662, 654}, ckt::PackedLeftLayout{})
|
||||
.is_packed());
|
||||
@@ -189,3 +191,20 @@ TEST(TensorDescriptor, IsPacked)
|
||||
EXPECT_FALSE(
|
||||
ckt::make_descriptor<dt>(ckt::Extent{30, 20, 10}, ckt::Extent{1, 1, 1}).is_packed());
|
||||
}
|
||||
|
||||
// Checks the stream-insertion operator for Extent: comma-separated values
// inside square brackets, with an empty extent rendered as "[]".
TEST(TensorDescriptor, PrintExtent)
{
    // Helper: format an extent through the stream operator under test.
    const auto render = [](const ckt::Extent& extent) {
        std::stringstream out;
        out << extent;
        return out.str();
    };

    EXPECT_THAT(render(ckt::Extent{6233, 55, 1235, 52, 203}),
                StringEqWithDiff("[6233, 55, 1235, 52, 203]"));
    EXPECT_THAT(render(ckt::Extent{}), StringEqWithDiff("[]"));
}
|
||||
|
||||
@@ -16,6 +16,28 @@ namespace ckt = ck_tile::builder::test;
|
||||
using ::testing::Each;
|
||||
using ::testing::Eq;
|
||||
|
||||
// Exercises NdIter: numel() is the product of the extents, and operator()
// maps a linear index to the corresponding multi-dimensional index with the
// last dimension varying fastest.
TEST(TensorForeach, NdIter)
{
    // Four-dimensional case.
    {
        ckt::NdIter iter(ckt::Extent{523, 345, 123, 601});

        // 523 * 345 * 123 * 601 elements in total.
        EXPECT_THAT(iter.numel(), Eq(13'338'296'505ULL));

        // Linear index 0 is the origin; successive indices advance the
        // innermost dimension first, carrying into the next one.
        EXPECT_THAT(iter(0), Eq(ckt::Extent{0, 0, 0, 0}));
        EXPECT_THAT(iter(1), Eq(ckt::Extent{0, 0, 0, 1}));
        EXPECT_THAT(iter(601), Eq(ckt::Extent{0, 0, 1, 0}));
        EXPECT_THAT(iter(601 * 123), Eq(ckt::Extent{0, 1, 0, 0}));
        EXPECT_THAT(iter(601 * 123 * 10), Eq(ckt::Extent{0, 10, 0, 0}));
        EXPECT_THAT(iter(((34 * 345 + 63) * 123 + 70) * 601 + 5), Eq(ckt::Extent{34, 63, 70, 5}));
    }

    // Rank-0 (scalar) case: exactly one element at the empty index.
    {
        ckt::NdIter iter(ckt::Extent{});

        EXPECT_THAT(iter.numel(), Eq(1));
        EXPECT_THAT(iter(0), Eq(ckt::Extent{}));
    }
}
|
||||
|
||||
TEST(TensorForeach, CalculateOffset)
|
||||
{
|
||||
EXPECT_THAT(ckt::calculate_offset(ckt::Extent{1, 2, 3}, ckt::Extent{100, 10, 1}), Eq(123));
|
||||
@@ -87,8 +109,8 @@ TEST(TensorForeach, VisitsEveryIndex)
|
||||
|
||||
TEST(TensorForeach, FillTensorBuffer)
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::INT32>(ckt::Extent{31, 54, 13},
|
||||
ckt::PackedRightLayout{});
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::I32>(ckt::Extent{31, 54, 13}, ckt::PackedRightLayout{});
|
||||
|
||||
auto buffer = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
@@ -109,7 +131,7 @@ TEST(TensorForeach, FillTensor)
|
||||
// FillTensor with non-packed indices should not write out-of-bounds.
|
||||
const ckt::Extent shape = {4, 23, 35};
|
||||
const ckt::Extent pad = {12, 53, 100};
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{}(pad));
|
||||
const auto strides = desc.get_strides();
|
||||
|
||||
auto size = desc.get_element_space_size();
|
||||
@@ -169,7 +191,7 @@ TEST(TensorForeach, ClearTensorZeros)
|
||||
const ckt::Extent pad = {6, 6, 6, 6, 6, 6, 6, 6};
|
||||
|
||||
const auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::INT32>(shape, ckt::PackedRightLayout{}(pad));
|
||||
ckt::make_descriptor<ckb::DataType::I32>(shape, ckt::PackedRightLayout{}(pad));
|
||||
|
||||
auto buffer = ckt::alloc_tensor_buffer(desc);
|
||||
ckt::clear_tensor_buffer(desc, buffer.get());
|
||||
|
||||
@@ -173,8 +173,8 @@ TEST(ValidationReportTests, MultipleSomeIncorrect)
|
||||
}
|
||||
|
||||
{
|
||||
auto desc = ckt::make_descriptor<ckb::DataType::INT32, 3>({'G', 'P', 'U'},
|
||||
ckt::PackedRightLayout{});
|
||||
auto desc =
|
||||
ckt::make_descriptor<ckb::DataType::I32, 3>({'G', 'P', 'U'}, ckt::PackedRightLayout{});
|
||||
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
auto b = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
Reference in New Issue
Block a user