Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-05 06:01:23 +00:00)
* Improve random number generation
  * use a different seed for each input (Q, K, V, ...);
  * use deterministic generation of:
    * seqstart_q/k (for group mode);
    * block_table (for paged-kvcache);
    * cache_batch_idx (for kvcache);
* Extract arg_parser-related code from the run functions so the run functions can be used as tests
* Split examples into main programs and fmha runners, build instances separately
* Add dummy tests that use instances and runners
* Fix a missed corner case of f32->f8 conversion: when a value is < min f8 denormal but > min f8 denormal / 2, it must be rounded to the min f8 denormal (i.e. 0b1), not to 0 (see the sketch after this list)
* Fix incorrect fp8 scales for P and O in validation code: DataTypeConfig was incorrectly compared with fp8_t
* Add host generation of dropout random values and use it for validation: previously host validation (reference_batched_dropout) used the random numbers generated by the kernel's BlockDropout, so incorrect generation on device (bad distribution, repeated numbers, too many zeros, etc.) would not trigger any validation errors
* Implement tests from smoke_test_bwd.sh
  * Return the result as an enum to distinguish failure from a missing instance
  * Add tests for bwd features: bias, alibi, dropout
* Implement tests from smoke_test_fwd.sh
  * Pass seqlen_q/k as vectors to the fwd and bwd runners
  * Add tests for fwd features: bias, alibi, dropout
  * Add tests for pagedkv and splitkv
* Fix the conditions for when to use the splitkv and pagedkv kernels: splitkv was executed only when use_kvcache, which == (need_append_kvcache || use_cache_batch_idx || 0 < page_block_size). In the SplitKV tests the regular fwd kernel was executed if use_cache_batch_idx was not requested, even when num_splitkv > 1. In the AppendKV tests the pagedkv kernel was executed but it often failed to find an instance
* Add tests for appendkv
* Use is_v_rowmajor = true because there are no instances with column layout anymore
* Split public and private compile options for instances: tests and examples only need to know about CK_TILE_FMHA_FWD_*_API
* Improve parsing validation of bias and mask
  * Pass bias as a string for consistency with mask
  * Catch parsing and other exceptions
* Add a bwd test for the deterministic flag
* Initialize fp8 tensors (-init=ufq) similarly to uf
* Fix splitkv/pagedkv invocation: use padded sk when seqlen_k_ptr is not null. seqlen_k cannot be used to determine padding when seqlen_k_ptr is provided; the actual seqlen_k is taken from seqlen_k_ptr[b]. Even seqlen_k values (% bn0 == 0) use the padded seqlen_k while seqlen_k_ptr may contain arbitrary values. In the example or tests this produces incorrect results with appendkv (for example, -d=32 -s=1 -s_k=64 -s_knew=7 -vlayout=c -b=8)
* Fix the use_pagedkv value when kvcache = true but page_block_size = 0: in this case block_table_ptr is nullptr and is accessed in the kernel
* Clean up bwd tests
* Unify fwd tests for f16/bf16 and fp8
* Use a better explicit instantiation declaration for fmha_bwd<2>
* Use the same seed for all tests; allow overriding it with an environment variable
* Undo clang-format of one irrelevant file (for some reason my local clang-format-18 and the one in CI behave differently)
* Do not build instances and tests on unsupported archs
* Build instance libraries as an OBJECT library
* CI: Enable sccache for HIP; there are source files with LANGUAGE HIP, which need -DCMAKE_HIP_COMPILER_LAUNCHER=sccache
* Add tests to REGRESSION_TESTS
* Fix OOB accesses in deterministic bwd due to an incorrectly assumed kN0: the runner assumes kN0 = (hdim_q <= 128) ? 128 : 64, but there are smaller tiles (for tr_load or fp32), which can create a too-small dq_acc_buf
* Pass CK_TILE_FMHA_FWD_*_API as INTERFACE compile options: the instances don't actually depend on them, only examples and tests do. Passing these definitions as INTERFACE allows changing FMHA_FWD_ENABLE_APIS without recompiling instances that are already in ccache
* Fix formatting and names
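
The f32->f8 rounding rule above can be sketched as follows; the smallest-denormal constant assumes an e4m3-style fp8 format (2^-9), and the helper name is illustrative rather than the library's actual conversion routine:

#include <cstdint>

// Assumed smallest positive fp8 (e4m3) subnormal, raw bit pattern 0b1.
constexpr float min_f8_denormal = 0.001953125f; // 2^-9

// Illustrative rounding of a tiny positive f32 value (0 <= x < min_f8_denormal) to
// raw fp8 bits; sign, larger magnitudes, saturation and NaN are not handled here.
uint8_t round_tiny_positive_to_f8_bits(float x)
{
    // At or below half of the smallest denormal: rounds down to zero.
    if(x <= min_f8_denormal / 2)
        return 0b0;
    // Strictly between half of the smallest denormal and the denormal itself:
    // must round up to the smallest denormal (0b1), not be flushed to zero.
    return 0b1;
}
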
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

using ::testing::Bool;
using ::testing::Combine;
using ::testing::TestWithParam;
using ::testing::Values;
using ::testing::ValuesIn;

// Random seed used for initializing input tensors. 0 for a non-deterministic seed.
CK_TILE_DECLARE_ENV_VAR(CK_TILE_TEST_SEED, uint64_t, 123456)

// Whether to run the long tests (from smoke_test_bwd.sh)
CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_FMHA_LONG_TESTS)
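
// CHECK_RESULT skips the test (rather than failing it) when no kernel instance exists
// for the requested parameters, and otherwise requires the run to succeed.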
#define CHECK_RESULT(result)                                      \
    do                                                            \
    {                                                             \
        if(result == bwd_result::no_instance)                     \
            GTEST_SKIP() << "No instance for current parameters"; \
        ASSERT_EQ(result, bwd_result::success);                   \
    } while(0)

const ck_tile::stream_config stream_config{
    nullptr, // stream_id_
    false,   // time_kernel_
    1,       // log_level_
    0,       // cold_niters_
    1,       // nrepeat_
    true,    // is_gpu_timer_
    false,   // flush_cache_
    1,       // rotating_count_
};
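
// COMMON_ARGS bundles the trailing fmha_bwd_run arguments shared by every test below,
// including the init method and the random seed taken from CK_TILE_TEST_SEED.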
#define COMMON_ARGS                                                                            \
    init_method, static_cast<uint32_t>(ck_tile::EnvValue(CK_TILE_ENV(CK_TILE_TEST_SEED))), 1, \
        stream_config
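
// When the condition is false this returns an empty value list, so Combine() below
// generates no test instances; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST keeps
// gtest from reporting the uninstantiated suite as an error.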
auto EnableTestIf(bool condition)
{
    return ValuesIn(condition ? std::vector<bool>{true} : std::vector<bool>{});
}

class AllLong : public TestWithParam<std::tuple<bool,
                                                std::tuple<int, int>,
                                                bool,
                                                mode_enum,
                                                std::string,
                                                float,
                                                std::tuple<int, int, int, int, int, std::string>>>
{
};

GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AllLong);

// Test cases from example/ck_tile/01_fmha/script/smoke_test_bwd.sh
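// The dims tuples are (batch, nhead, nhead_k, seqlen_q, seqlen_k, mask); bias ("n", "a",
// "e:...") and mask ("0", "1", "2", "t:...", "b:...") are passed as strings, matching the
// encoding used by the 01_fmha example command line.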
INSTANTIATE_TEST_SUITE_P(
    TestCkTileFmhaBwd,
    AllLong,
    Combine(EnableTestIf(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_FMHA_LONG_TESTS))),
            HDimValues,
            Bool(),
            ModeValues,
            Values("n", "a"),
            Values(0.0f, 0.2f),
            Values(std::tuple{1, 4, 2, 259, -1, "0"},
                   std::tuple{2, 2, -1, 516, 253, "0"},
                   std::tuple{1, 4, 1, 500, 251, "1"},
                   std::tuple{1, 2, -1, 900, 258, "2"},
                   std::tuple{2, 1, -1, 987, 219, "t:128,30"},
                   std::tuple{2, 3, 1, 244, 499, "b:4,35"})));

TEST_P(AllLong, Test)
{
    auto [_, hdims, perm, mode, bias_str, p_drop, dims_mask] = GetParam();
    auto [hdim_q, hdim_v] = hdims;
    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;

    auto result = fmha_bwd_run<DataTypeConfig>(mode,
                                               batch,
                                               nhead,
                                               nhead_k,
                                               {seqlen_q},
                                               {seqlen_k},
                                               hdim_q,
                                               hdim_v,
                                               perm,     // i_perm
                                               perm,     // o_perm
                                               0,        // scale
                                               bias_str, // bias_str
                                               false,    // use_dbias
                                               p_drop,   // p_drop
                                               123,      // drop_seed
                                               1024,     // drop_offset
                                               true,     // drop_prefs
                                               mask_str, // mask_str
                                               false,    // deterministic
                                               COMMON_ARGS);
    CHECK_RESULT(result);
}
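
// Head-dimension combinations that are not covered by the standard tile sizes, exercising hdim padding.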
class HDimPadding
    : public TestWithParam<std::tuple<std::tuple<int, int>,
                                      bool,
                                      mode_enum,
                                      std::tuple<int, int, int, int, int, std::string>>>
{
};

INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
                         HDimPadding,
                         Combine(Values(std::tuple{24, 48},
                                        std::tuple{120, 160},
                                        std::tuple{256, 108},
                                        std::tuple{40, 64}),
                                 Bool(),
                                 ModeValues,
                                 Values(std::tuple{1, 4, 2, 480, -1, "0"},
                                        std::tuple{2, 2, -1, 300, 400, "t:64,64"},
                                        std::tuple{1, 4, 1, 512, 201, "1"},
                                        std::tuple{1, 2, -1, 900, 256, "0"},
                                        std::tuple{2, 1, -1, 256, 256, "1"})));

TEST_P(HDimPadding, Test)
{
    auto [hdims, perm, mode, dims_mask] = GetParam();
    auto [hdim_q, hdim_v] = hdims;
    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;

    auto result = fmha_bwd_run<DataTypeConfig>(mode,
                                               batch,
                                               nhead,
                                               nhead_k,
                                               {seqlen_q},
                                               {seqlen_k},
                                               hdim_q,
                                               hdim_v,
                                               perm,     // i_perm
                                               perm,     // o_perm
                                               0,        // scale
                                               "n",      // bias_str
                                               false,    // use_dbias
                                               0.0f,     // p_drop
                                               0,        // drop_seed
                                               0,        // drop_offset
                                               false,    // drop_prefs
                                               mask_str, // mask_str
                                               false,    // deterministic
                                               COMMON_ARGS);
    CHECK_RESULT(result);
}
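
// Elementwise bias ("e:...") with and without dbias.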
class ElementwiseBias
    : public TestWithParam<std::tuple<std::tuple<int, int>,
                                      bool,
                                      mode_enum,
                                      std::string,
                                      bool,
                                      std::tuple<int, int, int, int, int, std::string>>>
{
};

INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
                         ElementwiseBias,
                         Combine(HDimValues,
                                 Bool(), // layouts of bias and dbias are controlled by i_perm
                                 ModeValues,
                                 Values("e:0", "e:1", "e:2"),
                                 Bool(),
                                 Values(std::tuple{1, 4, 2, 1024, 100, "0"},
                                        std::tuple{3, 2, -1, 128, 256, "2"},
                                        std::tuple{2, 2, -1, 130, 499, "t:50,64"})));

TEST_P(ElementwiseBias, Test)
{
    auto [hdims, i_perm, mode, bias_str, use_dbias, dims_mask] = GetParam();
    auto [hdim_q, hdim_v] = hdims;
    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;

    auto result = fmha_bwd_run<DataTypeConfig>(mode,
                                               batch,
                                               nhead,
                                               nhead_k,
                                               {seqlen_q},
                                               {seqlen_k},
                                               hdim_q,
                                               hdim_v,
                                               i_perm,    // i_perm
                                               false,     // o_perm
                                               0,         // scale
                                               bias_str,  // bias_str
                                               use_dbias, // use_dbias
                                               0.0f,      // p_drop
                                               123,       // drop_seed
                                               1024,      // drop_offset
                                               true,      // drop_prefs
                                               mask_str,  // mask_str
                                               false,     // deterministic
                                               COMMON_ARGS);
    CHECK_RESULT(result);
}
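
// ALiBi bias ("a:0", "a:1") combined with various mask settings.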
class Alibi : public TestWithParam<std::tuple<std::tuple<int, int>,
                                              mode_enum,
                                              std::string,
                                              std::tuple<int, int, int, int, int>,
                                              std::string>>
{
};

INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
                         Alibi,
                         Combine(HDimValues,
                                 ModeValues,
                                 Values("a:0", "a:1"),
                                 Values(std::tuple{1, 3, 3, 1024, 1000},
                                        std::tuple{3, 5, 5, 128, 256},
                                        std::tuple{2, 8, 4, 130, 320}),
                                 Values("0", "t", "b", "t:50,64", "b:32,40")));

TEST_P(Alibi, Test)
{
    auto [hdims, mode, bias_str, dims, mask_str] = GetParam();
    auto [hdim_q, hdim_v] = hdims;
    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k] = dims;

    auto result = fmha_bwd_run<DataTypeConfig>(mode,
                                               batch,
                                               nhead,
                                               nhead_k,
                                               {seqlen_q},
                                               {seqlen_k},
                                               hdim_q,
                                               hdim_v,
                                               true,     // i_perm
                                               true,     // o_perm
                                               0,        // scale
                                               bias_str, // bias_str
                                               false,    // use_dbias
                                               0.0f,     // p_drop
                                               0,        // drop_seed
                                               0,        // drop_offset
                                               false,    // drop_prefs
                                               mask_str, // mask_str
                                               false,    // deterministic
                                               COMMON_ARGS);
    CHECK_RESULT(result);
}
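
// Dropout with different probabilities and seed/offset configurations.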
class Dropout : public TestWithParam<std::tuple<std::tuple<int, int>,
                                                mode_enum,
                                                float,
                                                std::tuple<uint64_t, uint64_t, bool>,
                                                std::tuple<int, int, int, int, int, std::string>>>
{
};

INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
                         Dropout,
                         Combine(HDimValues,
                                 ModeValues,
                                 Values(0.123f, 0.5f),
                                 Values(std::tuple{10, 123, false},
                                        std::tuple{34534564645, 7876878876864, true}),
                                 Values(std::tuple{2, 6, 2, 180, 512, "0"},
                                        std::tuple{3, 2, 2, 256, 128, "1"},
                                        std::tuple{4, 2, 1, 100, 768, "2"})));

TEST_P(Dropout, Test)
{
    auto [hdims, mode, p_drop, drop_seed_offset_prefs, dims_mask] = GetParam();
    auto [hdim_q, hdim_v] = hdims;
    auto [drop_seed, drop_offset, drop_prefs] = drop_seed_offset_prefs;
    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;

    auto result = fmha_bwd_run<DataTypeConfig>(mode,
                                               batch,
                                               nhead,
                                               nhead_k,
                                               {seqlen_q},
                                               {seqlen_k},
                                               hdim_q,
                                               hdim_v,
                                               true,        // i_perm
                                               true,        // o_perm
                                               0.1f,        // scale
                                               "n",         // bias_str
                                               false,       // use_dbias
                                               p_drop,      // p_drop
                                               drop_seed,   // drop_seed
                                               drop_offset, // drop_offset
                                               drop_prefs,  // drop_prefs
                                               mask_str,    // mask_str
                                               false,       // deterministic
                                               COMMON_ARGS);
    CHECK_RESULT(result);
}
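
// Backward pass with the deterministic flag enabled.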
class Deterministic
    : public TestWithParam<std::tuple<std::tuple<int, int>,
                                      bool,
                                      mode_enum,
                                      std::tuple<int, int, int, int, int, std::string>>>
{
};

INSTANTIATE_TEST_SUITE_P(TestCkTileFmhaBwd,
                         Deterministic,
                         Combine(HDimValues,
                                 Bool(),
                                 ModeValues,
                                 Values(std::tuple{2, 6, 2, 180, 512, "0"},
                                        std::tuple{3, 3, 1, 256, 128, "1"},
                                        std::tuple{4, 2, 2, 768, 100, "2"})));

TEST_P(Deterministic, Test)
{
    auto [hdims, i_perm, mode, dims_mask] = GetParam();
    auto [hdim_q, hdim_v] = hdims;
    auto [batch, nhead, nhead_k, seqlen_q, seqlen_k, mask_str] = dims_mask;

    auto result = fmha_bwd_run<DataTypeConfig>(mode,
                                               batch,
                                               nhead,
                                               nhead_k,
                                               {seqlen_q},
                                               {seqlen_k},
                                               hdim_q,
                                               hdim_v,
                                               i_perm,   // i_perm
                                               true,     // o_perm
                                               0,        // scale
                                               "n",      // bias_str
                                               false,    // use_dbias
                                               0.0f,     // p_drop
                                               0,        // drop_seed
                                               0,        // drop_offset
                                               false,    // drop_prefs
                                               mask_str, // mask_str
                                               true,     // deterministic
                                               COMMON_ARGS);
    CHECK_RESULT(result);
}