mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 06:01:23 +00:00
[rocm-libraries] ROCm/rocm-libraries#4302 (commit e62bd8a)
[CK_TILE] add tf32 support

## Proposed changes

TF32 is already supported in CK on gfx942 and gfx950. This PR initiates TF32 support in CK_TILE on gfx942 and gfx950.

## Checklist

Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask.

- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally
- [ ] I have added the test to the REGRESSION_TESTS list defined at the top of CMakeLists.txt in tests/CMakeLists.txt, **IF** the test takes more than 30 seconds to run.
- [ ] I have added inline documentation which helps the maintainers understand the motivation
- [ ] I have removed the stale documentation which is no longer relevant after this pull request
- [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request
- [x] I have run `clang-format` on all changed files
- [ ] Any dependent changes have been merged

## Discussion
This commit is contained in:
committed by
assistant-librarian[bot]
parent
652d3456ca
commit
d460ab35b6
@@ -46,8 +46,12 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
|
||||
add_gtest_executable(test_ck_tile_gemm_pipeline_comp_async test_gemm_pipeline_comp_async.cpp)
|
||||
target_compile_options(test_ck_tile_gemm_pipeline_comp_async PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS})
|
||||
|
||||
add_gtest_executable(test_ck_tile_gemm_pipeline_tf32_mem test_gemm_pipeline_tf32_mem.cpp)
|
||||
target_compile_options(test_ck_tile_gemm_pipeline_tf32_mem PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
|
||||
list(APPEND CK_TILE_GEMM_TEST_TARGETS
|
||||
test_ck_tile_gemm_pipeline_comp_async
|
||||
test_ck_tile_gemm_pipeline_tf32_mem
|
||||
)
|
||||
|
||||
add_gtest_executable(test_ck_tile_gemm_pipeline_comp_async_eight_waves test_gemm_pipeline_comp_async_eight_waves.cpp)
|
||||
|
||||
@@ -320,4 +320,14 @@ using KernelTypesPersistentWmma = ::testing::Types<
|
||||
std::tuple< Row, Col, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV3, NonPersistent>
|
||||
>;
|
||||
|
||||
// TF32 (gfx950 only): 3x bf16 MFMA emulation, uses float buffers with tf32_t compute type
|
||||
// Tile: 128x128x64, Warp tile: 32x32x16
|
||||
// Type list for the TF32 "Mem" pipeline typed tests. It holds four configurations:
// A is always Row; B is Row or Col; each layout pair is listed once with the
// Intrawave scheduler and once with the Interwave scheduler. All entries use
// TF32 for A/B, F32 for the accumulator and C, and the Mem pipeline type.
using KernelTypesTf32Mem = ::testing::Types<
|
||||
// ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, M_BlockSize, N_BlockSize, K_BlockSize, M_TileSize, N_TileSize, K_TileSize, Scheduler, PipelineType
|
||||
std::tuple< Row, Row, Row, TF32, TF32, F32, F32, I128, I128, I64, I32, I32, I16, Intrawave, Mem>,
|
||||
std::tuple< Row, Row, Row, TF32, TF32, F32, F32, I128, I128, I64, I32, I32, I16, Interwave, Mem>,
|
||||
std::tuple< Row, Col, Row, TF32, TF32, F32, F32, I128, I128, I64, I32, I32, I16, Intrawave, Mem>,
|
||||
std::tuple< Row, Col, Row, TF32, TF32, F32, F32, I128, I128, I64, I32, I32, I16, Interwave, Mem>
|
||||
>;
|
||||
|
||||
// clang-format on
|
||||
|
||||
@@ -13,3 +13,5 @@ using BF16 = ck_tile::bf16_t;
|
||||
using BF8 = ck_tile::bf8_t;
|
||||
|
||||
using I4 = ck_tile::pk_int4_t;
|
||||
|
||||
using TF32 = ck_tile::tf32_t;
|
||||
|
||||
22
test/ck_tile/gemm/test_gemm_pipeline_tf32_mem.cpp
Normal file
22
test/ck_tile/gemm/test_gemm_pipeline_tf32_mem.cpp
Normal file
@@ -0,0 +1,22 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "test_gemm_pipeline_kernel_types.hpp"
|
||||
#include "test_gemm_pipeline_util.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Typed-test fixture for the TF32 Mem-pipeline GEMM tests.
// T is one kernel-configuration tuple from the type list the suite is
// instantiated with. The fixture derives from TestCkTileGemmPipeline and passes
// itself as the second template argument (CRTP).
template <typename T>
|
||||
class TestCkTileGemmPipelineTf32Mem
|
||||
: public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineTf32Mem<T>>
|
||||
{
|
||||
public:
|
||||
// Unconditionally returns true.
// NOTE(review): presumably a hook the base fixture queries to decide whether a
// type combination should be run — confirm against test_gemm_pipeline_util.hpp.
static constexpr bool check_data_type() { return true; }
|
||||
};
|
||||
|
||||
#define TEST_SUITE_NAME TestCkTileGemmPipelineTf32Mem
|
||||
|
||||
TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesTf32Mem);
|
||||
|
||||
#include "test_gemm_pipeline_ut_cases.inc"
|
||||
|
||||
#undef TEST_SUITE_NAME
|
||||
@@ -135,6 +135,10 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
static constexpr bool Persistent =
|
||||
ck_tile::tuple_element_or_default_t<Tuple, 15, std::false_type>::value;
|
||||
|
||||
// TF32 uses tf32_t as compute type but float as buffer/storage type
|
||||
using ADataTypeBuf = ck_tile::if_select_t<ADataType, ck_tile::tf32_t, float, ADataType>;
|
||||
using BDataTypeBuf = ck_tile::if_select_t<BDataType, ck_tile::tf32_t, float, BDataType>;
|
||||
|
||||
protected:
|
||||
template <bool PadM, bool PadN, bool PadK, bool Preshuffle>
|
||||
void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
|
||||
@@ -183,12 +187,16 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
NumWaveGroup,
|
||||
preshuffle>;
|
||||
|
||||
using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
GemmUniversalTraits,
|
||||
Scheduler>;
|
||||
using UniversalGemmProblem =
|
||||
ck_tile::UniversalGemmPipelineProblem<ADataTypeBuf,
|
||||
BDataTypeBuf,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
GemmUniversalTraits,
|
||||
Scheduler,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ADataType>;
|
||||
|
||||
using GemmPipeline =
|
||||
typename GemmPipelineTypeSelector<PipelineType, UniversalGemmProblem>::pipeline;
|
||||
@@ -304,24 +312,23 @@ class TestCkTileGemmPipeline : public ::testing::Test
|
||||
ck_tile::index_t stride_C =
|
||||
ck_tile::get_default_stride(M, N, StrideC, is_row_major(CLayout{}));
|
||||
|
||||
ck_tile::HostTensor<ADataType> a_m_k(
|
||||
ck_tile::HostTensor<ADataTypeBuf> a_m_k(
|
||||
ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<BDataType> b_k_n(
|
||||
ck_tile::HostTensor<BDataTypeBuf> b_k_n(
|
||||
ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(BLayout{})));
|
||||
ck_tile::HostTensor<CDataType> c_m_n_dev_result(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
|
||||
ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5, 11939}(a_m_k);
|
||||
ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5, 11940}(b_k_n);
|
||||
ck_tile::FillUniformDistributionIntegerValue<ADataTypeBuf>{-5, 5, 11939}(a_m_k);
|
||||
ck_tile::FillUniformDistributionIntegerValue<BDataTypeBuf>{-5, 5, 11940}(b_k_n);
|
||||
|
||||
ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
|
||||
|
||||
if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
|
||||
if constexpr(std::is_same_v<BDataTypeBuf, ck_tile::pk_int4_t>)
|
||||
{
|
||||
// Permute vector pk_i4x4 data for device implementation
|
||||
ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
|
||||
ck_tile::HostTensor<BDataTypeBuf> b_k_n_dev = b_k_n;
|
||||
permute_vectors_i4x4_b(b_k_n_dev);
|
||||
b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user