mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK_TILE] add tf32 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Proposed changes TF32 is added in CK on gfx942 and gfx950. This PR initiates tf32 support in CK_TILE on gfx942 and gfx950. ## Checklist Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask. - [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally - [ ] I have added the test to REGRESSION_TESTS list defined at the top of CMakeLists.txt in tests/CMakeLists.txt, **IF** the test takes more than 30 seconds to run. - [ ] I have added inline documentation which enables the maintainers with understanding the motivation - [ ] I have removed the stale documentation which is no longer relevant after this pull request - [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request - [x] I have run on all changed files - [ ] Any dependent changes have been merged ## Discussion
384 lines
15 KiB
C++
384 lines
15 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
|
|
#include "ck_tile/core.hpp"
|
|
#include "ck_tile/host/device_memory.hpp"
|
|
#include "ck_tile/host/hip_check_error.hpp"
|
|
#include "ck_tile/host/host_tensor.hpp"
|
|
#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
|
|
#include "ck_tile/ops/elementwise.hpp"
|
|
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
|
|
#include "ck_tile/ops/common/tensor_layout.hpp"
|
|
|
|
#include <algorithm>
|
|
#include <array>
|
|
#include <iostream>
|
|
#include <utility>
|
|
#include <vector>
|
|
#include <hip/hip_runtime.h>
|
|
|
|
namespace ck_tile {
|
|
|
|
// Selects which scaling variant of the CShuffleEpilogue the test kernel exercises.
enum class ScaleType
{
    None,   // no scaling applied
    RowCol, // per-row (M) and per-column (N) scale vectors, passed as tile windows
    Tensor  // one scalar scale per tensor, passed by value
};
|
|
|
|
// Simple test kernel to invoke the CShuffleEpilogue.
//
// Loads one kMPerBlock x kNPerBlock tile of accumulator data from `input_data`
// using the same tile distribution a BlockGemm would produce, then hands it to
// CShuffleEpilogue, which shuffles it through shared memory and stores it to
// `output_data`.
//
// Template parameters:
//   Problem  - CShuffleEpilogueProblem describing tile sizes and data types.
//   M, N     - global output tensor dimensions (block tile must fit; asserted).
//   Scale    - which scaling variant of the epilogue to exercise.
//
// m_scale / n_scale are read only for Scale == RowCol (as per-row / per-column
// vectors) or Scale == Tensor (dereferenced as single scalars); they may be
// null for Scale == None.
template <typename Problem, index_t M, index_t N, ScaleType Scale>
__global__ void
test_cshuffle_epilogue_kernel(const typename Problem::AccDataType* __restrict__ input_data,
                              typename Problem::ODataType* __restrict__ output_data,
                              float* m_scale,
                              float* n_scale)
{
    using Epilogue = CShuffleEpilogue<Problem>;
    using AccDataType = typename Epilogue::AccDataType;

    static_assert(Problem::kMPerBlock <= M && Problem::kNPerBlock <= N,
                  "Block size must fit in tensor dimensions");

    // Allocate shared memory for epilogue
    __shared__ char smem[Epilogue::GetSmemSize()];

    // Create accumulator tile with GEMM accumulator distribution (matches BlockGemm)
    using WG = ck_tile::WarpGemmDispatcher<typename Epilogue::ATypeToUse,
                                           typename Epilogue::BTypeToUse,
                                           typename Problem::AccDataType,
                                           Problem::MPerXdl,
                                           Problem::NPerXdl,
                                           Problem::KPerXdl,
                                           Problem::isCTransposed>;

    // Number of warp-level GEMM tiles each warp iterates over in M and N.
    constexpr index_t MIterPerWarp = Problem::kMPerBlock / (Problem::MWave * Problem::MPerXdl);
    constexpr index_t NIterPerWarp = Problem::kNPerBlock / (Problem::NWave * Problem::NPerXdl);

    // Block-level (outer) distribution: iterations x waves per dimension.
    // The warp-level encoding from WG is embedded into it below.
    constexpr auto c_block_outer_dstr_encoding = ck_tile::tile_distribution_encoding<
        ck_tile::sequence<>,
        ck_tile::tuple<ck_tile::sequence<MIterPerWarp, Problem::MWave>,
                       ck_tile::sequence<NIterPerWarp, Problem::NWave>>,
        ck_tile::tuple<ck_tile::sequence<1, 2>>,
        ck_tile::tuple<ck_tile::sequence<1, 1>>,
        ck_tile::sequence<1, 2>,
        ck_tile::sequence<0, 0>>{};

    constexpr auto acc_distribution_encode = ck_tile::detail::make_embed_tile_distribution_encoding(
        c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});

    constexpr auto acc_distribution = make_static_tile_distribution(acc_distribution_encode);
    auto acc_tile = make_static_distributed_tensor<AccDataType>(acc_distribution);

    // Create input tensor view for loading from global memory
    // Note: cast away const since buffer_view infrastructure doesn't support const pointers,
    // but the input_data is only read, never written
    // Use runtime values for dimensions to avoid issues with constant buffer size types
    constexpr index_t kMPerBlock = Problem::kMPerBlock;
    constexpr index_t kNPerBlock = Problem::kNPerBlock;
    auto input_tensor_view = make_naive_tensor_view<address_space_enum::global>(
        const_cast<AccDataType*>(input_data),
        make_tuple(kMPerBlock, kNPerBlock),
        make_tuple(kNPerBlock, 1), // row-major strides
        number<1>{},
        number<1>{});

    // Create tile window using the correct accumulator distribution
    auto input_tile_window =
        make_tile_window(input_tensor_view,
                         make_tuple(number<Problem::kMPerBlock>{}, number<Problem::kNPerBlock>{}),
                         {0, 0},
                         acc_distribution); // Use GEMM acc distribution, not LDS distribution

    // Load input data from global memory into acc_tile
    load_tile(acc_tile, input_tile_window);

    // Create output tensor view
    auto output_tensor_view =
        make_naive_tensor_view<address_space_enum::global>(output_data,
                                                           make_tuple(M, N),
                                                           make_tuple(N, 1),
                                                           number<Epilogue::GetVectorSizeC()>{},
                                                           number<1>{});

    // Create output tile window
    auto output_tile_window =
        make_tile_window(output_tensor_view,
                         make_tuple(number<Problem::kMPerBlock>{}, number<Problem::kNPerBlock>{}),
                         {0, 0});

    // Create empty D tensors tuple (we're ignoring ds_dram_windows for this test)
    auto empty_ds = make_tuple();

    // Call the epilogue
    if constexpr(Scale == ScaleType::RowCol)
    {
        // The 1-D scale vectors are presented as 2-D views with a zero stride in
        // the broadcast dimension: m_scale varies along M only (strides {1, 0}),
        // n_scale varies along N only (strides {0, 1}).
        const auto m_scale_window = make_tile_window(
            make_naive_tensor_view<address_space_enum::global>(
                m_scale, make_tuple(M, N), make_tuple(1, 0), number<1>{}, number<1>{}),
            make_tuple(number<Problem::kMPerBlock>{}, number<Problem::kNPerBlock>{}),
            {0, 0});
        const auto n_scale_window = make_tile_window(
            make_naive_tensor_view<address_space_enum::global>(
                n_scale, make_tuple(M, N), make_tuple(0, 1), number<1>{}, number<1>{}),
            make_tuple(number<Problem::kMPerBlock>{}, number<Problem::kNPerBlock>{}),
            {0, 0});
        Epilogue{}(output_tile_window, acc_tile, empty_ds, smem, m_scale_window, n_scale_window);
    }
    else if constexpr(Scale == ScaleType::Tensor)
    {
        // Scalar scales are dereferenced here, so m_scale/n_scale must be valid
        // single-element device pointers for this path.
        Epilogue{}(output_tile_window, acc_tile, empty_ds, smem, *m_scale, *n_scale);
    }
    else
    {
        Epilogue{}(output_tile_window, acc_tile, empty_ds, smem);
    }
}
|
|
|
|
// Test configuration helper
//
// Convenience alias instantiating CShuffleEpilogueProblem with the fixed
// choices used throughout these tests: no extra D tensors, row-major E layout,
// PassThrough elementwise op, and non-transposed C. Only the data types and
// tile/wave geometry remain as parameters.
template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename ODataType,
          index_t kM,
          index_t kN,
          index_t MWave,
          index_t NWave,
          index_t MPerXdl,
          index_t NPerXdl,
          index_t KPerXdl>
using SimpleCShuffleEpilogueProblem =
    CShuffleEpilogueProblem<ADataType,
                            BDataType,
                            ck_tile::tuple<>, // Empty Ds datatype tuple
                            AccDataType,
                            ODataType,
                            ck_tile::tuple<>, // Empty Ds layout
                            tensor_layout::gemm::RowMajor, // ELayout
                            ck_tile::element_wise::PassThrough, // CDElementwise
                            kM,
                            kN,
                            MWave,
                            NWave,
                            MPerXdl,
                            NPerXdl,
                            KPerXdl,
                            false // isCTransposed
                            >;
|
|
|
|
// Launch kernel with RowCol scaling
|
|
template <typename Problem, index_t M, index_t N>
|
|
void launch_kernel_with_rowcol_scale(const typename Problem::AccDataType* device_input,
|
|
typename Problem::ODataType* device_output,
|
|
dim3 gridSize,
|
|
dim3 blockSize)
|
|
{
|
|
HostTensor<float> h_m_scale({M});
|
|
HostTensor<float> h_n_scale({N});
|
|
for(index_t i = 0; i < M; ++i)
|
|
{
|
|
h_m_scale.mData[i] = 1.0F;
|
|
}
|
|
for(index_t i = 0; i < N; ++i)
|
|
{
|
|
h_n_scale.mData[i] = 1.0F;
|
|
}
|
|
h_n_scale.mData[1] = 2.0F;
|
|
|
|
DeviceMem m_scale_buf(h_m_scale.get_element_space_size_in_bytes());
|
|
DeviceMem n_scale_buf(h_n_scale.get_element_space_size_in_bytes());
|
|
m_scale_buf.ToDevice(h_m_scale.data());
|
|
n_scale_buf.ToDevice(h_n_scale.data());
|
|
|
|
test_cshuffle_epilogue_kernel<Problem, M, N, ScaleType::RowCol>
|
|
<<<gridSize, blockSize>>>(device_input,
|
|
device_output,
|
|
static_cast<float*>(m_scale_buf.GetDeviceBuffer()),
|
|
static_cast<float*>(n_scale_buf.GetDeviceBuffer()));
|
|
HIP_CHECK_ERROR(hipGetLastError());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
}
|
|
|
|
// Launch kernel with Tensor scaling
|
|
template <typename Problem, index_t M, index_t N>
|
|
void launch_kernel_with_tensor_scale(const typename Problem::AccDataType* device_input,
|
|
typename Problem::ODataType* device_output,
|
|
dim3 gridSize,
|
|
dim3 blockSize)
|
|
{
|
|
HostTensor<float> h_m_scale({1});
|
|
HostTensor<float> h_n_scale({1});
|
|
h_m_scale.mData[0] = 2.0F;
|
|
h_n_scale.mData[0] = 1.0F;
|
|
|
|
DeviceMem m_scale_buf(h_m_scale.get_element_space_size_in_bytes());
|
|
DeviceMem n_scale_buf(h_n_scale.get_element_space_size_in_bytes());
|
|
m_scale_buf.ToDevice(h_m_scale.data());
|
|
n_scale_buf.ToDevice(h_n_scale.data());
|
|
|
|
test_cshuffle_epilogue_kernel<Problem, M, N, ScaleType::Tensor>
|
|
<<<gridSize, blockSize>>>(device_input,
|
|
device_output,
|
|
static_cast<float*>(m_scale_buf.GetDeviceBuffer()),
|
|
static_cast<float*>(n_scale_buf.GetDeviceBuffer()));
|
|
HIP_CHECK_ERROR(hipGetLastError());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
}
|
|
|
|
// Launch kernel without scaling
|
|
template <typename Problem, index_t M, index_t N>
|
|
void launch_kernel_without_scale(const typename Problem::AccDataType* device_input,
|
|
typename Problem::ODataType* device_output,
|
|
dim3 gridSize,
|
|
dim3 blockSize)
|
|
{
|
|
test_cshuffle_epilogue_kernel<Problem, M, N, ScaleType::None>
|
|
<<<gridSize, blockSize>>>(device_input, device_output, nullptr, nullptr);
|
|
HIP_CHECK_ERROR(hipGetLastError());
|
|
HIP_CHECK_ERROR(hipDeviceSynchronize());
|
|
}
|
|
|
|
/// Generate N unique fp16 bit patterns from the normal range.
/// Positive normal encodings (0x0400-0x7BFF) are produced first, followed by
/// negative normal encodings starting at 0x8400.
/// Static asserts if N > 61440 (max unique normal fp16 values).
template <size_t N>
constexpr std::array<uint16_t, N> generate_fp16_bit_patterns()
{
    static_assert(N <= 61440, "N exceeds available unique normal fp16 values");

    constexpr uint16_t kFirstPositive = 0x0400; // smallest positive normal encoding
    constexpr uint16_t kFirstNegative = 0x8400; // smallest-magnitude negative normal
    constexpr size_t kPositiveCount = 30720;    // number of positive normal encodings

    std::array<uint16_t, N> patterns{};
    size_t idx = 0;
    // Consecutive positive normals first...
    for(; idx < N && idx < kPositiveCount; ++idx)
    {
        patterns[idx] = static_cast<uint16_t>(kFirstPositive + idx);
    }
    // ...then spill over into the negative normals.
    for(; idx < N; ++idx)
    {
        patterns[idx] = static_cast<uint16_t>(kFirstNegative + (idx - kPositiveCount));
    }
    return patterns;
}
|
|
|
|
/// Convert fp16 bit patterns to float values.
|
|
/// Performs: uint16_t -> half_t (bit_cast) -> float
|
|
template <size_t N>
|
|
std::array<float, N> convert_fp16_bits(const std::array<uint16_t, N>& bits)
|
|
{
|
|
std::array<float, N> result;
|
|
for(size_t i = 0; i < N; ++i)
|
|
{
|
|
half_t h = bit_cast<half_t>(bits[i]);
|
|
result[i] = type_convert<float>(h);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/// Generate unique fp16 values as a HostTensor for permutation testing.
|
|
/// Uses layered architecture: bit patterns -> type conversion -> HostTensor.
|
|
template <typename AccDataType, index_t Rows, index_t Cols>
|
|
HostTensor<AccDataType> generate_unique_fp16_input()
|
|
{
|
|
constexpr size_t N = static_cast<size_t>(Rows * Cols);
|
|
|
|
constexpr auto bits = generate_fp16_bit_patterns<N>();
|
|
auto values = convert_fp16_bits(bits);
|
|
|
|
HostTensor<AccDataType> host_input({Rows, Cols});
|
|
for(index_t m = 0; m < Rows; ++m)
|
|
{
|
|
for(index_t n = 0; n < Cols; ++n)
|
|
{
|
|
host_input(m, n) = static_cast<AccDataType>(values[static_cast<size_t>(m * Cols + n)]);
|
|
}
|
|
}
|
|
return host_input;
|
|
}
|
|
|
|
template <typename Problem, index_t M, index_t N>
|
|
auto run_cshuffle_epilogue_test(ScaleType scale = ScaleType::None)
|
|
{
|
|
using AccDataType = typename Problem::AccDataType;
|
|
using ODataType = typename Problem::ODataType;
|
|
|
|
constexpr index_t kMPerBlock = Problem::kMPerBlock;
|
|
constexpr index_t kNPerBlock = Problem::kNPerBlock;
|
|
const index_t kBlockSize = ck_tile::is_wave32() ? Problem::kBlockSize / 2 : Problem::kBlockSize;
|
|
|
|
std::cout << "Running CShuffleEpilogue test with M=" << M << ", N=" << N
|
|
<< ", MPerBlock=" << kMPerBlock << ", NPerBlock=" << kNPerBlock
|
|
<< ", BlockSize=" << kBlockSize << std::endl;
|
|
|
|
HostTensor<AccDataType> host_input =
|
|
generate_unique_fp16_input<AccDataType, kMPerBlock, kNPerBlock>();
|
|
|
|
// Allocate device input and copy from host
|
|
DeviceMem device_input_buf(host_input.get_element_space_size_in_bytes());
|
|
device_input_buf.ToDevice(host_input.data());
|
|
auto* device_input = static_cast<const AccDataType*>(device_input_buf.GetDeviceBuffer());
|
|
|
|
// Allocate host output memory
|
|
HostTensor<ODataType> host_output({M, N});
|
|
host_output.SetZero();
|
|
|
|
// Allocate device output memory
|
|
DeviceMem device_output_buf(host_output.get_element_space_size_in_bytes());
|
|
device_output_buf.ToDevice(host_output.data());
|
|
ODataType* device_output = static_cast<ODataType*>(device_output_buf.GetDeviceBuffer());
|
|
|
|
// Launch kernel with appropriate scale configuration
|
|
dim3 gridSize(1, 1, 1);
|
|
dim3 blockSize(kBlockSize, 1, 1);
|
|
|
|
switch(scale)
|
|
{
|
|
case ScaleType::RowCol:
|
|
launch_kernel_with_rowcol_scale<Problem, M, N>(
|
|
device_input, device_output, gridSize, blockSize);
|
|
break;
|
|
case ScaleType::Tensor:
|
|
launch_kernel_with_tensor_scale<Problem, M, N>(
|
|
device_input, device_output, gridSize, blockSize);
|
|
break;
|
|
case ScaleType::None:
|
|
launch_kernel_without_scale<Problem, M, N>(
|
|
device_input, device_output, gridSize, blockSize);
|
|
break;
|
|
}
|
|
|
|
// Copy results back
|
|
device_output_buf.FromDevice(host_output.data());
|
|
|
|
return host_output;
|
|
}
|
|
|
|
// Convert output values to sorted float vector for verification
|
|
// Uses float as intermediate to preserve precision for floating-point comparison
|
|
template <typename ODataType>
|
|
std::vector<float> convert_and_sort_output(const HostTensor<ODataType>& output)
|
|
{
|
|
std::vector<float> result;
|
|
result.reserve(output.get_element_size());
|
|
for(size_t i = 0; i < output.get_element_size(); ++i)
|
|
{
|
|
result.push_back(type_convert<float>(output.mData[i]));
|
|
}
|
|
std::sort(result.begin(), result.end());
|
|
return result;
|
|
}
|
|
|
|
// Run both unscaled and scaled tests for comparison
|
|
// Returns pair of (unscaled_output, scaled_output) host tensors
|
|
template <typename Problem, index_t M, index_t N, ScaleType ScaleMode>
|
|
auto run_scale_comparison_test()
|
|
{
|
|
auto unscaled_output = run_cshuffle_epilogue_test<Problem, M, N>(ScaleType::None);
|
|
auto scaled_output = run_cshuffle_epilogue_test<Problem, M, N>(ScaleMode);
|
|
|
|
return std::make_pair(std::move(unscaled_output), std::move(scaled_output));
|
|
}
|
|
|
|
} // namespace ck_tile
|