mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
* enable gfx940 * switch between intrinsic mfma routines on mi100/200 and mi300 * fix mfma_int8 on MI300 * disable 2 int8 examples on MI300 * Update cmake-ck-dev.sh * restore gitignore file * modify Jenkinsfile to the internal repo * Bump rocm-docs-core from 0.24.0 to 0.29.0 in /docs/sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.24.0 to 0.29.0. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.24.0...v0.29.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> * initial enablement of gfx950 * fix clang format * disable examples 31 and 41 int8 on gfx950 * add code * fix build wip * fix xx * now can build * naming * minor fix * wip fix * fix macro for exp2; fix warpgemm a/b in transposedC * unify as tuple_array * Update the required Python version to 3.9 * Update executable name in test scripts * re-structure tuple/array to avoid spill * Merge function templates * Fix format * Add constraint to array<> ctor * Re-use function * Some minor changes * remove wrong code in store_raw() * fix compile issue in transpose * Rename enum Rename 'cood_transform_enum' to 'coord_transform_enum' * let more integral_constant->constant, and formating * make sure thread_buffer can be tuple/array * temp fix buffer_store spill * not using custom data type by default, now we can have ISA-level same code as opt_padding * fix compile error, fp8 not ready now * fix fp8 duplicated move/shift/and/or problem * Default use CK_TILE_FLOAT_TO_FP8_STOCHASTIC rounding mode * fix scratch in fp8 kernel * update some readme * fix merge from upstream * sync with upstream * sync upstream again * sync 22 * remove unused * fix 
clang-format * update README of ck_tile example * fix several issue * let python version to be 3.8 as minimal * remove ck_tile example from default cmake target like all/install/check * remove mistake * 1).support receipe in generate.py 2).use simplified mask type 3).change left/right to pass into karg * fix some bug in group-mode masking and codegen. update README * F8 quantization for FMHA forward (#1224) * Add SAccElementFunction, PComputeElementFunction, OAccElementFunction in pipeline * Add element function to fmha api * Adjust P elementwise function * Fix bug of elementwise op, our elementwise op is not inout * Add some elementwise op, prepare to quantization * Let generate.py can generate different elementwise function * To prevent compiler issue, remove the elementwise function we have not used. * Remove f8 pipeline, we should share the same pipeline even in f8 * Remove remove_cvref_t * Avoid warning * Fix wrong fp8 QK/KV block gemm setting * Check fp8 rounding error in check_err() * Set fp8 rounding error for check_err() * Use CK_TILE_FLOAT_TO_FP8_STANDARD as default fp8 rounding mode * 1. codgen the f8 api and kernel 2. 
f8 host code * prevent warning in filter mode * Remove not-in-use elementwise function kargs * Remove more not-in-use elementwise function kargs * Small refinements in C++ source files * Use conditional_t<> to simplify code * Support heterogeneous argument for binary function types * Re-use already-existing scales<> functor template * Fix wrong value produced by saturating * Generalize the composes<> template * Unify saturates<> implementation * Fix type errors in composes<> * Extend less_equal<> * Reuse the existing template less_equal<> in check_err() * Add equal<float> & equal<double> * Rename check_err() parameter * Rename check_err() parameter * Add FIXME comment for adding new macro in future * Remove unnecessary cast to void * Eliminate duplicated code * Avoid dividing api pool into more than 2 groups * Use more clear variable names * Use affirmative condition in if stmt * Remove blank lines * Donot perfect forwarding in composes<> * To fix compile error, revert generate.py back to4439cc107d* Fix bug of p element function * Add compute element op to host softmax * Remove element function in api interface * Extract user parameter * Rename pscale and oscale variable * rename f8 to fp8 * rename more f8 to fp8 * Add pipeline::operator() without element_functor * 1. Remove deprecated pipeline enum 2. Refine host code parameter * Use quantization range as input * 1. Rename max_dtype to dtype_max. 2. Rename scale to scale_s 3.Add init description * Refine description * prevent early return * unify _squant kernel name in cpp, update README * Adjust the default range. 
* Refine error message and bias range * Add fp8 benchmark and smoke test * fix fp8 swizzle_factor=4 case --------- Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com> Co-authored-by: carlushuang <carlus.huang@amd.com> --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: illsilin <Illia.Silin@amd.com> Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> Co-authored-by: Jing Zhang <jizha@amd.com> Co-authored-by: zjing14 <zhangjing14@gmail.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Po-Yen, Chen <PoYen.Chen@amd.com> Co-authored-by: rocking <ChunYu.Lai@amd.com> [ROCm/composable_kernel commit:db376dd8a4]
395 lines
14 KiB
C++
395 lines
14 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstdlib>
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
#include <iterator>
|
|
#include <limits>
|
|
#include <type_traits>
|
|
#include <vector>
|
|
|
|
#include "ck_tile/core.hpp"
|
|
#include "ck_tile/host/ranges.hpp"
|
|
|
|
namespace ck_tile {
|
|
|
|
// Writes the elements of `v` to `os` as a bracketed, comma-separated list, e.g. "[1, 2, 3]".
// An empty vector is written as "[]". Returns `os` so that insertions can be chained.
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
    os << "[";

    auto cursor      = v.begin();
    const auto limit = v.end();
    if(cursor != limit)
    {
        // The first element carries no leading separator.
        os << *cursor;
        for(++cursor; cursor != limit; ++cursor)
        {
            os << ", ";
            os << *cursor;
        }
    }

    os << "]";
    return os;
}
|
|
|
|
template <typename Range, typename RefRange>
|
|
typename std::enable_if<
|
|
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
|
std::is_floating_point_v<ranges::range_value_t<Range>> &&
|
|
!std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
|
bool>::type CK_TILE_HOST
|
|
check_err(const Range& out,
|
|
const RefRange& ref,
|
|
const std::string& msg = "Error: Incorrect results!",
|
|
double rtol = 1e-5,
|
|
double atol = 3e-6,
|
|
bool allow_infinity_ref = false)
|
|
{
|
|
if(out.size() != ref.size())
|
|
{
|
|
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
|
|
const auto is_infinity_error = [=](auto o, auto r) {
|
|
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
|
const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);
|
|
|
|
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
|
};
|
|
|
|
bool res{true};
|
|
int err_count = 0;
|
|
double err = 0;
|
|
double max_err = std::numeric_limits<double>::min();
|
|
for(std::size_t i = 0; i < ref.size(); ++i)
|
|
{
|
|
const double o = *std::next(std::begin(out), i);
|
|
const double r = *std::next(std::begin(ref), i);
|
|
err = std::abs(o - r);
|
|
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
|
{
|
|
max_err = err > max_err ? err : max_err;
|
|
err_count++;
|
|
if(err_count < 5)
|
|
{
|
|
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
|
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
|
}
|
|
res = false;
|
|
}
|
|
}
|
|
if(!res)
|
|
{
|
|
const float error_percent =
|
|
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
|
std::cerr << "max err: " << max_err;
|
|
std::cerr << ", number of errors: " << err_count;
|
|
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename Range, typename RefRange>
|
|
typename std::enable_if<
|
|
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
|
std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
|
|
bool>::type CK_TILE_HOST
|
|
check_err(const Range& out,
|
|
const RefRange& ref,
|
|
const std::string& msg = "Error: Incorrect results!",
|
|
double rtol = 1e-3,
|
|
double atol = 1e-3,
|
|
bool allow_infinity_ref = false)
|
|
{
|
|
if(out.size() != ref.size())
|
|
{
|
|
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
|
|
const auto is_infinity_error = [=](auto o, auto r) {
|
|
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
|
const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);
|
|
|
|
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
|
};
|
|
|
|
bool res{true};
|
|
int err_count = 0;
|
|
double err = 0;
|
|
// TODO: This is a hack. We should have proper specialization for bf16_t data type.
|
|
double max_err = std::numeric_limits<float>::min();
|
|
for(std::size_t i = 0; i < ref.size(); ++i)
|
|
{
|
|
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
|
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
|
err = std::abs(o - r);
|
|
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
|
{
|
|
max_err = err > max_err ? err : max_err;
|
|
err_count++;
|
|
if(err_count < 5)
|
|
{
|
|
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
|
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
|
}
|
|
res = false;
|
|
}
|
|
}
|
|
if(!res)
|
|
{
|
|
const float error_percent =
|
|
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
|
std::cerr << "max err: " << max_err;
|
|
std::cerr << ", number of errors: " << err_count;
|
|
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename Range, typename RefRange>
|
|
typename std::enable_if<
|
|
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
|
std::is_same_v<ranges::range_value_t<Range>, half_t>,
|
|
bool>::type CK_TILE_HOST
|
|
check_err(const Range& out,
|
|
const RefRange& ref,
|
|
const std::string& msg = "Error: Incorrect results!",
|
|
double rtol = 1e-3,
|
|
double atol = 1e-3,
|
|
bool allow_infinity_ref = false)
|
|
{
|
|
if(out.size() != ref.size())
|
|
{
|
|
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
|
|
const auto is_infinity_error = [=](auto o, auto r) {
|
|
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
|
const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);
|
|
|
|
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
|
};
|
|
|
|
bool res{true};
|
|
int err_count = 0;
|
|
double err = 0;
|
|
double max_err = static_cast<double>(std::numeric_limits<ranges::range_value_t<Range>>::min());
|
|
for(std::size_t i = 0; i < ref.size(); ++i)
|
|
{
|
|
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
|
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
|
err = std::abs(o - r);
|
|
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
|
{
|
|
max_err = err > max_err ? err : max_err;
|
|
err_count++;
|
|
if(err_count < 5)
|
|
{
|
|
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
|
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
|
}
|
|
res = false;
|
|
}
|
|
}
|
|
if(!res)
|
|
{
|
|
const float error_percent =
|
|
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
|
std::cerr << "max err: " << max_err;
|
|
std::cerr << ", number of errors: " << err_count;
|
|
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename Range, typename RefRange>
|
|
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
|
std::is_integral_v<ranges::range_value_t<Range>> &&
|
|
!std::is_same_v<ranges::range_value_t<Range>, bf16_t>)
|
|
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
|
|| std::is_same_v<ranges::range_value_t<Range>, int4_t>
|
|
#endif
|
|
,
|
|
bool>
|
|
CK_TILE_HOST check_err(const Range& out,
|
|
const RefRange& ref,
|
|
const std::string& msg = "Error: Incorrect results!",
|
|
double = 0,
|
|
double atol = 0)
|
|
{
|
|
if(out.size() != ref.size())
|
|
{
|
|
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
|
|
bool res{true};
|
|
int err_count = 0;
|
|
int64_t err = 0;
|
|
int64_t max_err = std::numeric_limits<int64_t>::min();
|
|
for(std::size_t i = 0; i < ref.size(); ++i)
|
|
{
|
|
const int64_t o = *std::next(std::begin(out), i);
|
|
const int64_t r = *std::next(std::begin(ref), i);
|
|
err = std::abs(o - r);
|
|
|
|
if(err > atol)
|
|
{
|
|
max_err = err > max_err ? err : max_err;
|
|
err_count++;
|
|
if(err_count < 5)
|
|
{
|
|
std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
|
|
<< std::endl;
|
|
}
|
|
res = false;
|
|
}
|
|
}
|
|
if(!res)
|
|
{
|
|
const float error_percent =
|
|
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
|
|
std::cerr << "max err: " << max_err;
|
|
std::cerr << ", number of errors: " << err_count;
|
|
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename Range, typename RefRange>
|
|
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
|
std::is_same_v<ranges::range_value_t<Range>, fp8_t>),
|
|
bool>
|
|
CK_TILE_HOST check_err(const Range& out,
|
|
const RefRange& ref,
|
|
const std::string& msg = "Error: Incorrect results!",
|
|
unsigned max_rounding_point_distance = 1,
|
|
double atol = 1e-1,
|
|
bool allow_infinity_ref = false)
|
|
{
|
|
if(out.size() != ref.size())
|
|
{
|
|
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
|
|
const auto is_infinity_error = [=](auto o, auto r) {
|
|
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
|
const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);
|
|
|
|
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
|
};
|
|
|
|
static const auto get_rounding_point_distance = [](fp8_t o, fp8_t r) -> unsigned {
|
|
static const auto get_sign_bit = [](fp8_t v) -> bool {
|
|
return 0x80 & bit_cast<uint8_t>(v);
|
|
};
|
|
|
|
if(get_sign_bit(o) ^ get_sign_bit(r))
|
|
{
|
|
return std::numeric_limits<unsigned>::max();
|
|
}
|
|
else
|
|
{
|
|
return std::abs(bit_cast<int8_t>(o) - bit_cast<int8_t>(r));
|
|
}
|
|
};
|
|
|
|
bool res{true};
|
|
int err_count = 0;
|
|
double err = 0;
|
|
double max_err = std::numeric_limits<float>::min();
|
|
for(std::size_t i = 0; i < ref.size(); ++i)
|
|
{
|
|
const fp8_t o_fp8 = *std::next(std::begin(out), i);
|
|
const fp8_t r_fp8 = *std::next(std::begin(ref), i);
|
|
const double o_fp64 = type_convert<float>(o_fp8);
|
|
const double r_fp64 = type_convert<float>(r_fp8);
|
|
err = std::abs(o_fp64 - r_fp64);
|
|
if(!(less_equal<double>{}(err, atol) ||
|
|
get_rounding_point_distance(o_fp8, r_fp8) <= max_rounding_point_distance) ||
|
|
is_infinity_error(o_fp64, r_fp64))
|
|
{
|
|
max_err = err > max_err ? err : max_err;
|
|
err_count++;
|
|
if(err_count < 5)
|
|
{
|
|
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
|
<< "] != ref[" << i << "]: " << o_fp64 << " != " << r_fp64 << std::endl;
|
|
}
|
|
res = false;
|
|
}
|
|
}
|
|
if(!res)
|
|
{
|
|
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename Range, typename RefRange>
|
|
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
|
|
std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
|
|
bool>
|
|
CK_TILE_HOST check_err(const Range& out,
|
|
const RefRange& ref,
|
|
const std::string& msg = "Error: Incorrect results!",
|
|
double rtol = 1e-3,
|
|
double atol = 1e-3,
|
|
bool allow_infinity_ref = false)
|
|
{
|
|
if(out.size() != ref.size())
|
|
{
|
|
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
|
|
const auto is_infinity_error = [=](auto o, auto r) {
|
|
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
|
|
const bool both_infinite_and_same = std::isinf(o) && std::isinf(r) && (o == r);
|
|
|
|
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
|
|
};
|
|
|
|
bool res{true};
|
|
int err_count = 0;
|
|
double err = 0;
|
|
double max_err = std::numeric_limits<float>::min();
|
|
for(std::size_t i = 0; i < ref.size(); ++i)
|
|
{
|
|
const double o = type_convert<float>(*std::next(std::begin(out), i));
|
|
const double r = type_convert<float>(*std::next(std::begin(ref), i));
|
|
err = std::abs(o - r);
|
|
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
|
|
{
|
|
max_err = err > max_err ? err : max_err;
|
|
err_count++;
|
|
if(err_count < 5)
|
|
{
|
|
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
|
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
|
}
|
|
res = false;
|
|
}
|
|
}
|
|
if(!res)
|
|
{
|
|
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
} // namespace ck_tile
|