mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 05:31:24 +00:00
[CK_TILE] Migrate CK Tile examples to Tests to autorun on CI (#2421)
[CK_TILE] Add new ck tile unit test * Add new ck tile unit test smoke-gemm-universal * Add new ck tile unit test smoke-gemm-basic * Add new ck tile unit test topk_softmax * Add new ck tile unit test add_rmsnorm2d_rdquant_fwd
This commit is contained in:
53
test/ck_tile/layernorm2d/CMakeLists.txt
Normal file
53
test/ck_tile/layernorm2d/CMakeLists.txt
Normal file
@@ -0,0 +1,53 @@
|
||||
function(create_tile_layernorm2d_fwd SUFFIX)
|
||||
set(TEST_CK_TILE_LAYERNORM2D_FWD "test_ck_tile_layernorm2d_fwd_${SUFFIX}")
|
||||
|
||||
message(DEBUG "adding example ${TEST_CK_TILE_LAYERNORM2D_FWD}")
|
||||
add_test_executable(${TEST_CK_TILE_LAYERNORM2D_FWD} layernorm2d_fwd_${SUFFIX}.cpp)
|
||||
target_include_directories(${TEST_CK_TILE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_sources(${TEST_CK_TILE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})
|
||||
|
||||
set(TEST_CK_TILE_LAYERNORM2D_FWD_COMPILE_OPTIONS)
|
||||
|
||||
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
|
||||
list(APPEND TEST_CK_TILE_LAYERNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal --offload-compress)
|
||||
|
||||
target_compile_options(${TEST_CK_TILE_LAYERNORM2D_FWD} PRIVATE ${TEST_CK_TILE_LAYERNORM2D_FWD_COMPILE_OPTIONS})
|
||||
endfunction()
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
set(LAYERNORM2D_FWD_KNOWN_APIS "fwd;bwd")
|
||||
set(LAYERNORM2D_FWD_ENABLE_APIS "fwd" CACHE STRING
|
||||
"semicolon-separated list of APIs to generate (${LAYERNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
|
||||
if(LAYERNORM2D_FWD_ENABLE_APIS STREQUAL "all")
|
||||
set(LAYERNORM2D_FWD_ENABLE_APIS ${LAYERNORM2D_FWD_KNOWN_APIS})
|
||||
endif()
|
||||
|
||||
# generate a list of kernels, but not actually emit files at config sta
|
||||
execute_process(
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
|
||||
--api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --list_blobs
|
||||
RESULT_VARIABLE ret
|
||||
)
|
||||
if(ret AND NOT ret EQUAL 0)
|
||||
message( FATAL_ERROR "Fail to generate kernels via Python. ${ret}")
|
||||
endif()
|
||||
|
||||
file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/layernorm2d_fwd_blobs.txt LAYERNORM2D_FWD_GEN_BLOBS)
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${LAYERNORM2D_FWD_GEN_BLOBS}
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
|
||||
--api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs
|
||||
)
|
||||
|
||||
create_tile_layernorm2d_fwd("fp16")
|
||||
create_tile_layernorm2d_fwd("bf16")
|
||||
|
||||
# TODO: we have to turn off this global prop, otherwise the progress bar generated
|
||||
# by cmake will print too many files, execvp: /bin/sh: Argument list too long
|
||||
# however, this property may affect global
|
||||
# TODO: consider codegen a makefile by us
|
||||
set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
|
||||
else()
|
||||
message(DEBUG "Skipping ck tile add_rmsnorm2d_rdquant_fwd tests for current target")
|
||||
endif()
|
||||
730
test/ck_tile/layernorm2d/generate.py
Normal file
730
test/ck_tile/layernorm2d/generate.py
Normal file
@@ -0,0 +1,730 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# generate kernel instances to speed up compilation
|
||||
|
||||
import argparse
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import List, Optional, Any
|
||||
import functools
|
||||
import itertools
|
||||
import copy
|
||||
from dataclasses import dataclass
|
||||
|
||||
def get_if_str(idx, total, lase_else = True):
|
||||
if idx == 0:
|
||||
return 'if'
|
||||
elif idx < total - 1:
|
||||
return 'else if'
|
||||
else:
|
||||
if lase_else:
|
||||
return 'else'
|
||||
else:
|
||||
return 'else if'
|
||||
|
||||
XBIAS_ENUM_STR_MAP = [
|
||||
'no',
|
||||
'xbias'] # pre-norm add bias
|
||||
|
||||
FUSED_ADD_ENUM_STR_MAP = [
|
||||
'no',
|
||||
'pras', # pre-norm
|
||||
'pra' ] # post-norm
|
||||
|
||||
FUSED_FUSED_SWEEP_STR_MAP = [
|
||||
'no',
|
||||
'dquant' ]
|
||||
|
||||
DATA_TYPE_MAP = {'fp32' : 'float',
|
||||
'fp16' : 'ck_tile::fp16_t',
|
||||
'bf16' : 'ck_tile::bf16_t',
|
||||
'int8' : 'ck_tile::int8_t',
|
||||
'fp8' : 'ck_tile::fp8_t'}
|
||||
|
||||
def BOOL_MAP(b_) -> str:
|
||||
if b_:
|
||||
return 'true'
|
||||
else:
|
||||
return 'false'
|
||||
|
||||
class layernorm_fwd_codegen:
|
||||
API_TRAITS_DEFINE = """
|
||||
// this is used to pattern-match internl kernel implementation, not to instantiate kernel
|
||||
template <typename XDataType_,
|
||||
typename YDataType_,
|
||||
typename SmoothScaleDataType_,
|
||||
typename YScaleDataType_,
|
||||
ck_tile::index_t Repeat_M_, // each thread repeat along M
|
||||
ck_tile::index_t Repeat_N_, // each thread repeat along N
|
||||
ck_tile::index_t ThreadPerBlock_M_, // num threads along M
|
||||
ck_tile::index_t ThreadPerBlock_N_, // num threads along N
|
||||
ck_tile::index_t Vector_N_, // vector size along N
|
||||
bool kPadN_,
|
||||
bool kSaveMeanInvStd_,
|
||||
bool kFastFDiv_,
|
||||
bool kWelford_,
|
||||
bool kTwoPass_,
|
||||
ck_tile::index_t kXbias_ = 0,
|
||||
ck_tile::index_t kFusedAdd_ = 0,
|
||||
ck_tile::index_t kFusedQuant_ = 0>
|
||||
struct layernorm2d_fwd_traits_
|
||||
{
|
||||
using XDataType = ck_tile::remove_cvref_t<XDataType_>;
|
||||
using YDataType = ck_tile::remove_cvref_t<YDataType_>;
|
||||
using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
|
||||
using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;
|
||||
|
||||
static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
|
||||
static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
|
||||
static constexpr ck_tile::index_t total_warps =
|
||||
(ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
|
||||
|
||||
// num of warps along m
|
||||
static constexpr ck_tile::index_t BlockWarps_M = []() {
|
||||
if constexpr(is_warp_per_row)
|
||||
{
|
||||
static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
|
||||
return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
|
||||
}
|
||||
else
|
||||
{
|
||||
// static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
|
||||
return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
|
||||
}
|
||||
}();
|
||||
|
||||
// num of warps along n
|
||||
static constexpr ck_tile::index_t BlockWarps_N = []() {
|
||||
if constexpr(is_warp_per_row)
|
||||
{
|
||||
static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
|
||||
return ThreadPerBlock_N_ / ck_tile::get_warp_size();
|
||||
}
|
||||
}();
|
||||
|
||||
static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
|
||||
static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
|
||||
|
||||
static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
|
||||
static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
|
||||
|
||||
static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
|
||||
static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
|
||||
|
||||
using BlockTile = ck_tile::sequence<Block_M, Block_N>;
|
||||
using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
|
||||
using WarpTile = ck_tile::sequence<Warp_M, Warp_N>;
|
||||
using Vector = ck_tile::sequence<1, Vector_N_>;
|
||||
|
||||
using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
|
||||
|
||||
static constexpr bool kPadN = kPadN_;
|
||||
static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
|
||||
static constexpr bool kFastFDiv = kFastFDiv_;
|
||||
static constexpr bool kWelford = kWelford_;
|
||||
static constexpr bool kTwoPass = kTwoPass_;
|
||||
static constexpr ck_tile::index_t kXbias = kXbias_;
|
||||
static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
|
||||
static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
|
||||
};
|
||||
|
||||
template <typename XDataType_,
|
||||
typename YDataType_,
|
||||
typename SmoothScaleDataType_,
|
||||
typename YScaleDataType_,
|
||||
ck_tile::index_t Repeat_M_, // each thread repeat along M
|
||||
ck_tile::index_t Repeat_N_, // each thread repeat along N
|
||||
ck_tile::index_t ThreadPerBlock_M_, // num threads along M
|
||||
ck_tile::index_t ThreadPerBlock_N_, // num threads along N
|
||||
ck_tile::index_t Vector_N_, // vector size along N
|
||||
bool kPadN_,
|
||||
bool kSaveMeanInvStd_,
|
||||
bool kFastFDiv_,
|
||||
bool kWelford_,
|
||||
bool kTwoPass_,
|
||||
int kXbias_,
|
||||
int kFusedAdd_,
|
||||
int kFusedQuant_>
|
||||
using traits_ = layernorm2d_fwd_traits_<XDataType_,
|
||||
YDataType_,
|
||||
SmoothScaleDataType_,
|
||||
YScaleDataType_,
|
||||
Repeat_M_,
|
||||
Repeat_N_,
|
||||
ThreadPerBlock_M_,
|
||||
ThreadPerBlock_N_,
|
||||
Vector_N_,
|
||||
kPadN_,
|
||||
kSaveMeanInvStd_,
|
||||
kFastFDiv_,
|
||||
kWelford_,
|
||||
kTwoPass_,
|
||||
kXbias_,
|
||||
kFusedAdd_,
|
||||
kFusedQuant_>;
|
||||
"""
|
||||
API_COMMON_HEADER = """
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <ck_tile/core.hpp>
|
||||
#include "layernorm2d_fwd.hpp"
|
||||
#include <ck_tile/ops/epilogue.hpp>
|
||||
#include <iostream>
|
||||
|
||||
#pragma once
|
||||
|
||||
using S = ck_tile::stream_config;
|
||||
using A = layernorm2d_fwd_args;
|
||||
|
||||
{F_traits_define}
|
||||
|
||||
template <typename Traits_>
|
||||
float layernorm2d_fwd_(const S& s, A a)
|
||||
{{
|
||||
using XDataType = typename Traits_::XDataType;
|
||||
using YDataType = typename Traits_::YDataType;
|
||||
using SmoothScaleDataType = typename Traits_::SmoothScaleDataType;
|
||||
using YScaleDataType = typename Traits_::YScaleDataType;
|
||||
using ComputeDataType = typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType;
|
||||
|
||||
using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
|
||||
Traits_::kSaveMeanInvStd,
|
||||
Traits_::kFastFDiv,
|
||||
Traits_::kWelford,
|
||||
Traits_::kTwoPass,
|
||||
static_cast<ck_tile::Layernorm2dXBiasEnum>(Traits_::kXbias),
|
||||
static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
|
||||
static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
|
||||
using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::XDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::XBiasDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::GammaDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::BetaDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::MeanDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::InvStdDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::SmoothScaleDataType,
|
||||
typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YScaleDataType,
|
||||
typename Traits_::Shape,
|
||||
PipelineTraits>;
|
||||
|
||||
using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass<PipelineProblem>;
|
||||
using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
|
||||
using Pipeline = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
|
||||
|
||||
using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, true>;
|
||||
using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
|
||||
|
||||
static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1;
|
||||
static constexpr bool UseRawStore = sizeof(YDataType) == 4;
|
||||
using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, SmoothScaleDataType, YScaleDataType, YDataType, typename Traits_::Shape,
|
||||
ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, UseRawStore, true/*max3*/>>;
|
||||
|
||||
using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
|
||||
|
||||
using Epilogue = std::conditional_t<Traits_::kFusedQuant == 1, DynamicQuantEpilogue, Default2DEpilogue>;
|
||||
|
||||
using Kernel = ck_tile::Layernorm2dFwd<Pipeline, Epilogue>;
|
||||
|
||||
const dim3 grids = Kernel::GridSize(a);
|
||||
constexpr dim3 blocks = Kernel::BlockSize();
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
|
||||
auto kargs = Kernel::MakeKargs(a);
|
||||
if(s.log_level_ > 0)
|
||||
std::cout << ", " << Kernel::GetName() << std::flush;
|
||||
|
||||
return ck_tile::launch_kernel(
|
||||
s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
|
||||
}}
|
||||
|
||||
"""
|
||||
|
||||
API_BASE = """
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <ck_tile/core.hpp>
|
||||
#include "layernorm2d_fwd.hpp"
|
||||
|
||||
{F_traits_define}
|
||||
|
||||
// Note: this internal API only declare, not define here, otherwise will block `make -j`
|
||||
template <typename Traits_>
|
||||
float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a);
|
||||
|
||||
float layernorm2d_fwd(layernorm2d_fwd_traits t,
|
||||
layernorm2d_fwd_args a,
|
||||
const ck_tile::stream_config& s)
|
||||
{{
|
||||
float r = -1;
|
||||
{F_dispatch}
|
||||
return r;
|
||||
}}
|
||||
|
||||
"""
|
||||
|
||||
API_PER_DTYPE=""" {F_if}(t.prec_i == \"{F_i_type}\" && t.prec_o == \"{F_o_type}\"){{
|
||||
{F_per_n_case}
|
||||
}}
|
||||
"""
|
||||
API_PER_N_CASE=""" {F_if} {F_N_COND} {{
|
||||
{F_inner_dispatch}
|
||||
}}
|
||||
"""
|
||||
API_INNER_CASE=""" {F_if} {F_VEC_COND}
|
||||
r={F_instance_func}(s, a);
|
||||
"""
|
||||
|
||||
INSTANCE_BASE = """
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "layernorm2d_fwd_api_common.hpp"
|
||||
|
||||
// clang-format off
|
||||
// prec_i prec_o prec_sy rm rn tm tn vn pd mv rpcf welford 2p xbias add sweep
|
||||
{F_instance_def}
|
||||
// clang-format on
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, working_path, kernel_filter):
|
||||
self.working_path = working_path
|
||||
self.kernel_filter = kernel_filter
|
||||
|
||||
class k_xbias_enum(IntEnum):
|
||||
F_NO_XBIAS = 0
|
||||
F_ADD_XBIAS = 1
|
||||
|
||||
class k_fuesd_add_enum(IntEnum):
|
||||
F_NO_ADD = 0
|
||||
F_PRE_ADD = 1
|
||||
F_PRE_ADD_STORE_RESIDUAL = 2
|
||||
|
||||
class k_fused_sweep_enum(IntEnum):
|
||||
F_NO_SWEEP = 0
|
||||
F_RENORM = 1
|
||||
F_DYNAMIC_QUANT = 2
|
||||
|
||||
@dataclass
|
||||
class k_traits:
|
||||
F_kPadN : bool
|
||||
F_kSaveMeanInvStd : bool
|
||||
F_kTwoPass : bool
|
||||
F_kXbias : Any #: layernorm_fwd_codegen.k_bias_enum
|
||||
F_kFusedAdd : Any #: layernorm_fwd_codegen.k_fuesd_add_enum
|
||||
F_kFusedQuant : Any #: layernorm_fwd_codegen.k_fused_sweep_enum
|
||||
|
||||
@dataclass
|
||||
class k_shape:
|
||||
F_BlockTile : List[int]
|
||||
F_WarpPerBlock : List[int]
|
||||
F_WarpTile : List[int]
|
||||
F_Vector_ : List[int]
|
||||
@property
|
||||
def F_BlockSize(self) -> int:
|
||||
return functools.reduce(lambda a, b: a*b, self.F_WarpTile)
|
||||
|
||||
@dataclass
|
||||
class k_problem:
|
||||
F_XDataType : str
|
||||
F_XBiasDataType : str
|
||||
F_GammaDataType : str
|
||||
F_BetaDataType : str
|
||||
F_ComputeDataType : str
|
||||
F_YDataType : str
|
||||
F_MeanDataType : str
|
||||
F_InvStdDataType : str
|
||||
F_BlockShape : str
|
||||
F_Traits : Any #k_traits
|
||||
|
||||
@dataclass
|
||||
class k_pipeline_one_pass:
|
||||
F_Problem : Any #k_problem
|
||||
|
||||
@dataclass
|
||||
class k_pipeline_two_pass:
|
||||
F_Problem : Any #k_problem
|
||||
|
||||
@dataclass
|
||||
class default_2d_epilogue_problem:
|
||||
F_AccDataType : str
|
||||
F_ODataType : str
|
||||
F_kPadM : bool
|
||||
F_kPadN : bool
|
||||
|
||||
@dataclass
|
||||
class default_2d_epilogue:
|
||||
F_problem : Any
|
||||
|
||||
@dataclass
|
||||
class k_kernel:
|
||||
F_pipeline : Any
|
||||
F_epilogue : Any
|
||||
|
||||
@dataclass
|
||||
class h_traits:
|
||||
F_XDataType : str
|
||||
F_YDataType : str
|
||||
F_SmoothScaleDataType : str
|
||||
F_YScaleDataType : str
|
||||
F_Repeat_M : int
|
||||
F_Repeat_N : int
|
||||
F_ThreadPerBlock_M : int
|
||||
F_ThreadPerBlock_N : int
|
||||
F_Vector_N : int
|
||||
F_kPadN : bool
|
||||
F_kSaveMeanInvStd_ : bool
|
||||
F_kFastFDiv_ : bool
|
||||
F_kWelford_ : bool
|
||||
F_kTwoPass_ : bool
|
||||
F_kXbias_ : int
|
||||
F_kFusedAdd : int
|
||||
F_kFusedQuant : int
|
||||
|
||||
@property
|
||||
def trait_name(self) ->str:
|
||||
t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_SmoothScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
|
||||
t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}, {BOOL_MAP(self.F_kWelford_):5}'
|
||||
t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kXbias:4}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
|
||||
return t_
|
||||
|
||||
# string when calling this kernel
|
||||
@property
|
||||
def call_name(self) -> str:
|
||||
return f'layernorm2d_fwd_<traits_<{self.trait_name}>>'
|
||||
|
||||
# string when define this kernel
|
||||
@property
|
||||
def def_name(self) -> str:
|
||||
return f'template float layernorm2d_fwd_<traits_<{self.trait_name}>>(const S&, A);'
|
||||
|
||||
# this class hold kernel under same source file
|
||||
@dataclass
|
||||
class h_instance:
|
||||
F_DataTypePair : str
|
||||
F_N : str
|
||||
F_xbias : int
|
||||
F_add : int
|
||||
F_sweep : int
|
||||
instance_list : List[Any] # List[h_traits]
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
prec_i, prec_o = self.F_DataTypePair.split(',')
|
||||
dtype_str = f'{prec_i}' if prec_i == prec_o else f'{prec_i}_{prec_o}'
|
||||
nnn = f'layernorm2d_fwd_{dtype_str}_n{self.F_N}'
|
||||
if self.F_xbias != 0:
|
||||
nnn = nnn + '_' + XBIAS_ENUM_STR_MAP[self.F_xbias]
|
||||
if self.F_add != 0:
|
||||
nnn = nnn + '_' + FUSED_ADD_ENUM_STR_MAP[self.F_add]
|
||||
if self.F_sweep != 0:
|
||||
nnn = nnn + '_' + FUSED_FUSED_SWEEP_STR_MAP[self.F_sweep]
|
||||
return nnn
|
||||
|
||||
@property
|
||||
def instance_name(self) ->str:
|
||||
return self.name
|
||||
|
||||
@property
|
||||
def content(self) ->str:
|
||||
instance_defs = ''
|
||||
for ins in self.instance_list:
|
||||
instance_defs += ins.def_name + '\n'
|
||||
return layernorm_fwd_codegen.INSTANCE_BASE.format(F_instance_def=instance_defs)
|
||||
|
||||
@property
|
||||
def name_api(self) -> str:
|
||||
return 'layernorm2d_fwd_api'
|
||||
|
||||
@property
|
||||
def name_common_header(self) -> str:
|
||||
return 'layernorm2d_fwd_api_common'
|
||||
|
||||
def content_api(self, args) -> str:
|
||||
# 1 sort based on dtype
|
||||
t_dtype_dict = dict()
|
||||
blobs = self.get_blobs(args)
|
||||
for blob in blobs:
|
||||
if blob.F_DataTypePair not in t_dtype_dict:
|
||||
t_dtype_dict[blob.F_DataTypePair] = {}
|
||||
if blob.F_N not in t_dtype_dict[blob.F_DataTypePair]:
|
||||
t_dtype_dict[blob.F_DataTypePair][blob.F_N] = []
|
||||
t_dtype_dict[blob.F_DataTypePair][blob.F_N].append(blob)
|
||||
|
||||
d_str = ''
|
||||
for i_d, dtype_ in enumerate(t_dtype_dict):
|
||||
blob_per_t = t_dtype_dict[dtype_]
|
||||
n_str = ''
|
||||
for i_n, n_ in enumerate(blob_per_t):
|
||||
blob_per_n = blob_per_t[n_]
|
||||
inner_str = ""
|
||||
for i_b, b_ in enumerate(blob_per_n):
|
||||
# generate single kernel instance file
|
||||
#vec_str = ""
|
||||
for i_ins, ins in enumerate(b_.instance_list):
|
||||
idx_in_n = i_b * len(b_.instance_list) + i_ins
|
||||
len_in_n = len(blob_per_n) * len(b_.instance_list)
|
||||
# _if = 'if' if i_ins == 0 else 'else if'
|
||||
if ins.F_kFusedQuant == 0:
|
||||
_sweep_cond = 't.fused_quant == {f_fused_sweep}'.format(f_fused_sweep = ins.F_kFusedQuant)
|
||||
elif ins.F_kFusedQuant == 1:
|
||||
_sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sm == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format(
|
||||
f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_SmoothScaleDataType, f_sy_type=ins.F_YScaleDataType)
|
||||
elif ins.F_kFusedQuant == 2:
|
||||
_sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format(
|
||||
f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType)
|
||||
_cond = '((a.n % {f_vec_n} == 0) && (t.xbias == {f_xbias}) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
|
||||
f_vec_n = ins.F_Vector_N, f_xbias = ins.F_kXbias, f_fused_add = ins.F_kFusedAdd,
|
||||
f_sweep_cond = _sweep_cond)
|
||||
inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False),
|
||||
F_VEC_COND = _cond, F_instance_func=ins.call_name)
|
||||
#inner_str = inner_str + vec_str
|
||||
n_cnd = f'(a.n <= {n_})' if isinstance(n_, int) else ''
|
||||
n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t), not isinstance(n_, int)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
|
||||
prec_i, prec_o = dtype_.split(',')
|
||||
d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str)
|
||||
|
||||
api_base = self.API_BASE.format(F_traits_define=self.API_TRAITS_DEFINE, F_dispatch=d_str)
|
||||
return api_base
|
||||
|
||||
@property
|
||||
def content_common_header(self) -> str:
|
||||
return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE)
|
||||
|
||||
def get_blobs(self, args):
|
||||
h_traits = layernorm_fwd_codegen.h_traits
|
||||
h_instance = layernorm_fwd_codegen.h_instance
|
||||
|
||||
dynamic_quant_out_dtype = ['int8', 'fp8']
|
||||
# some predefined support range
|
||||
# (prec_i,prec_o) for simplicity this string will be used as key for dict
|
||||
scale_list = [('fp32,fp32')]
|
||||
dtype_list = [('fp16,fp16'), ('bf16,bf16'),
|
||||
('fp16,int8'), ('bf16,int8'),
|
||||
('fp16,fp8'), ('bf16,fp8')] # NOTE: only fused-dynamic-quant use int8 or fp8 out
|
||||
types_8bit = ('int8', 'fp8')
|
||||
types_16bit = ('int16', 'fp16', 'bf16')
|
||||
#fused_add_list = [0, 1, 2]
|
||||
#fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant
|
||||
xbias_list = [0, 1]
|
||||
fused_add_list = [0, 1]
|
||||
fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
|
||||
# rm rn tm tn vn pd mv fdiv welford 2p xbias add sweep
|
||||
h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 8, 8, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'1024' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, True, True, False, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
|
||||
'big' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1,1024, 8, True, False, True, True, True, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, True, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 12, 1, 256, 2, True, False, True, True, True, 0, 0, 0),
|
||||
h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, True, 0, 0, 0)]}
|
||||
total_blob = list()
|
||||
for hs_key in h_trait_dict:
|
||||
hs = h_trait_dict[hs_key]
|
||||
current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
|
||||
for dtype, scale_type, xbias, fused_add, fused_quant in itertools.product(dtype_list, scale_list, xbias_list, fused_add_list, fused_sweep_list):
|
||||
prec_i, prec_o = dtype.split(',')
|
||||
scale_sm, scale_y = scale_type.split(',')
|
||||
if prec_o in dynamic_quant_out_dtype and fused_quant != 1:
|
||||
continue # skip non dynamic quant case
|
||||
if fused_quant == 1 and hs_key == 'big':
|
||||
continue
|
||||
current_hs = list()
|
||||
for chs_ in hs:
|
||||
h_ = copy.copy(chs_) # copy the base instance out
|
||||
h_.F_XDataType = prec_i
|
||||
h_.F_YDataType = prec_o
|
||||
h_.F_SmoothScaleDataType = scale_sm
|
||||
h_.F_YScaleDataType = scale_y
|
||||
h_.F_kXbias = xbias
|
||||
h_.F_kFusedAdd = fused_add
|
||||
h_.F_kFusedQuant = fused_quant
|
||||
# disable welford update for 8bit and 16 bit smallN
|
||||
if not h_.F_kTwoPass_:
|
||||
#disable 16 bit when set args disable_16b_welford
|
||||
if args.disable_16b_welford and prec_i in types_16bit:
|
||||
h_.F_kWelford_ = False
|
||||
#disable 8bit by default
|
||||
elif prec_i in types_8bit or prec_o in types_8bit:
|
||||
h_.F_kWelford_ = False
|
||||
#disable 16bit small N
|
||||
elif prec_i in types_16bit and hs_key == '64':
|
||||
h_.F_kWelford_ = False
|
||||
current_hs.append(h_) # + "\n"
|
||||
#f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
|
||||
current_n_str = 'big' if hs_key == 'big' else current_n
|
||||
total_blob.append(h_instance(dtype, current_n_str, xbias, fused_add, fused_quant, current_hs))
|
||||
return total_blob
|
||||
|
||||
def list_blobs(self, args) -> None:
|
||||
w_p = Path(self.working_path)
|
||||
list_p = w_p / 'layernorm2d_fwd_blobs.txt'
|
||||
blobs = self.get_blobs(args)
|
||||
with list_p.open('w') as list_f:
|
||||
# api related file
|
||||
list_f.write(str(w_p / (self.name_api + ".cpp")) + "\n")
|
||||
list_f.write(str(w_p / (self.name_common_header + ".hpp")) + "\n")
|
||||
# kernel instance file
|
||||
for b in blobs:
|
||||
list_f.write(str(w_p / (b.name + ".cpp")) + "\n")
|
||||
|
||||
def gen_blobs(self, args) -> None:
|
||||
w_p = Path(self.working_path)
|
||||
w_str = self.content_api(args)
|
||||
(w_p / (self.name_api + ".cpp")).write_text(w_str)
|
||||
(w_p / (self.name_common_header + ".hpp")).write_text(self.content_common_header)
|
||||
blobs = self.get_blobs(args)
|
||||
for b in blobs:
|
||||
(w_p / (b.name + ".cpp")).write_text(b.content)
|
||||
|
||||
def list_blobs(args):
|
||||
api_list = args.api.split(',')
|
||||
for api in api_list:
|
||||
if api == 'fwd':
|
||||
layernorm_fwd_codegen(args.working_path, args.filter).list_blobs(args)
|
||||
|
||||
|
||||
def gen_blobs(args):
|
||||
api_list = args.api.split(',')
|
||||
for api in api_list:
|
||||
if api == 'fwd':
|
||||
layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs(args)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="generate",
|
||||
description="gen API for CK layernorm kernel",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--api",
|
||||
default='fwd[all]',
|
||||
required=False,
|
||||
help="supply API(s) to generate (default: fwd). separated by comma."
|
||||
)
|
||||
|
||||
# the directory for list_blobs/gen_blobs to write files into
|
||||
parser.add_argument(
|
||||
"-w",
|
||||
"--working_path",
|
||||
default="./",
|
||||
required=False,
|
||||
help="the path where all the blobs are going to be generated"
|
||||
)
|
||||
|
||||
# this script have 2 modes
|
||||
# 1) list_blobs mode, will generate a txt file with all the files going to be generated.
|
||||
# this is useful in build system like cmake to construct source code dependency, by
|
||||
# reading the content out of this file
|
||||
# 2) gen_blobs mode, will generate the actuall kernel instance and api. If in framework
|
||||
# like FA, only need to use this mode
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--list_blobs",
|
||||
action='store_true',
|
||||
help="list all the kernels to a file, "
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-g",
|
||||
"--gen_blobs",
|
||||
action='store_true',
|
||||
help="generate all kernels into different tile"
|
||||
)
|
||||
|
||||
# TODO: if using filter, must apply same value to output_dir and list_blobs
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--filter",
|
||||
required=False,
|
||||
help="filter out kernels that need to generate, using fnmatch module"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--traits",
|
||||
default="all",
|
||||
required=False,
|
||||
help="enable/disable some feature. default generate all"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--receipt",
|
||||
default=0,
|
||||
required=False,
|
||||
help="codegen receipt."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--disable_16b_welford",
|
||||
default=False,
|
||||
required=False,
|
||||
help="enable/disable welford for 16bit datatype n > 64"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# print(f'{args.list_blobs}-{args.gen_blobs}')
|
||||
if (args.gen_blobs and args.list_blobs) or ((not args.gen_blobs) and (not args.list_blobs)):
|
||||
print('gen_blobs/list_blobs must specify only one option')
|
||||
sys.exit()
|
||||
|
||||
p = Path(args.working_path)
|
||||
if not p.exists():
|
||||
p.mkdir()
|
||||
|
||||
if args.list_blobs:
|
||||
list_blobs(args)
|
||||
else:
|
||||
gen_blobs(args)
|
||||
70
test/ck_tile/layernorm2d/layernorm2d_fwd.hpp
Normal file
70
test/ck_tile/layernorm2d/layernorm2d_fwd.hpp
Normal file
@@ -0,0 +1,70 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/layernorm2d.hpp"
|
||||
#include <string>
|
||||
|
||||
template <typename InType,
|
||||
typename OutType,
|
||||
typename SmoothSScaleDataType_,
|
||||
typename YScaleDataType_>
|
||||
struct LayerNormTypeConfig;
|
||||
|
||||
template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
|
||||
struct LayerNormTypeConfig<ck_tile::half_t, OutType, SmoothScaleDataType_, YScaleDataType_>
|
||||
{
|
||||
using XDataType = ck_tile::half_t;
|
||||
using YDataType = OutType;
|
||||
using XBiasDataType = ck_tile::half_t;
|
||||
using GammaDataType = ck_tile::half_t;
|
||||
using BetaDataType = ck_tile::half_t;
|
||||
using MeanDataType = ck_tile::half_t;
|
||||
using InvStdDataType = ck_tile::half_t;
|
||||
using ComputeDataType = float;
|
||||
using SmoothScaleDataType = SmoothScaleDataType_;
|
||||
using YScaleDataType = YScaleDataType_;
|
||||
};
|
||||
|
||||
template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
|
||||
struct LayerNormTypeConfig<ck_tile::bf16_t, OutType, SmoothScaleDataType_, YScaleDataType_>
|
||||
{
|
||||
using XDataType = ck_tile::bf16_t;
|
||||
using YDataType = OutType;
|
||||
using XBiasDataType = ck_tile::bf16_t;
|
||||
using GammaDataType = ck_tile::bf16_t;
|
||||
using BetaDataType = ck_tile::bf16_t;
|
||||
using MeanDataType = ck_tile::bf16_t;
|
||||
using InvStdDataType = ck_tile::bf16_t;
|
||||
using ComputeDataType = float;
|
||||
using SmoothScaleDataType = SmoothScaleDataType_;
|
||||
using YScaleDataType = YScaleDataType_;
|
||||
};
|
||||
|
||||
// runtime args
|
||||
struct layernorm2d_fwd_args : public ck_tile::Layernorm2dFwdHostArgs
|
||||
{
|
||||
};
|
||||
|
||||
// This is the public API, will be generated by script
|
||||
struct layernorm2d_fwd_traits
|
||||
{
|
||||
std::string prec_i; // input precision
|
||||
std::string prec_o; // output precision
|
||||
|
||||
// if fused_quant == 1, need set prec_sm/prec_sy to proper string, otherwise can set
|
||||
// arbitrary(will skip check) if fused_quant == 2, need set prec_sy to proper string, otherwise
|
||||
// can set arbitrary(will skip check)
|
||||
std::string prec_sm; // x-scale, used for [1*N] input smooth quant
|
||||
std::string prec_sy; // y-scale, used for [M*1] output for next layer
|
||||
|
||||
bool save_mean_var; //
|
||||
int xbias; // 0:no-bias, 1:add bias
|
||||
int fused_add; // 0:no-add, 1:pre-add-store, 2:pre-add
|
||||
int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
|
||||
};
|
||||
|
||||
float layernorm2d_fwd(layernorm2d_fwd_traits, layernorm2d_fwd_args, const ck_tile::stream_config&);
|
||||
566
test/ck_tile/layernorm2d/layernorm2d_fwd.inc
Normal file
566
test/ck_tile/layernorm2d/layernorm2d_fwd.inc
Normal file
@@ -0,0 +1,566 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "layernorm2d_fwd.hpp"
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
|
||||
// different threshold for different dtype
|
||||
template <typename DataType>
|
||||
auto get_elimit()
|
||||
{
|
||||
double rtol = 1e-2;
|
||||
double atol = 1e-2;
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
template <>
|
||||
auto get_elimit<ck_tile::bf16_t>()
|
||||
{
|
||||
double rtol = 1e-2;
|
||||
double atol = 1e-2;
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
template <>
|
||||
auto get_elimit<ck_tile::int8_t>()
|
||||
{
|
||||
double rtol = 1e-2;
|
||||
double atol = 1.0;
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "3328", "m dimension")
|
||||
.insert("n", "4096", "n dimension")
|
||||
.insert("x_stride", "-1", "x row_stride, if -1 then equal to n")
|
||||
.insert("xr_stride", "-1", "x residule row_stride, if -1 then equal to n")
|
||||
.insert("y_stride", "-1", "y row_stride, if -1 then equal to n")
|
||||
.insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n")
|
||||
.insert("e", "1e-5", "epsilon")
|
||||
.insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("kname", "1", "print kernel name or not")
|
||||
.insert("prec_i", "fp16", "input precision")
|
||||
.insert("prec_o", "auto", "output precision, set auto will be the same as input")
|
||||
.insert("prec_sm",
|
||||
"auto",
|
||||
"output quant scale type, set auto will use fp32. used when fquant=1")
|
||||
.insert("prec_sy",
|
||||
"auto",
|
||||
"output quant scale type, set auto will use fp32. used when fquant=1 or 2")
|
||||
.insert("xbias", "0", "add bias, 0:no add, 1:add bias before fadd")
|
||||
.insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
|
||||
.insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
|
||||
.insert("warmup", "5", "cold iter")
|
||||
.insert("repeat", "20", "hot iter");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
template <typename InDataType,
|
||||
typename OutDataType,
|
||||
typename SmoothScaleDataType,
|
||||
typename YScaleDataType,
|
||||
bool SaveMeanVar>
|
||||
bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
ck_tile::index_t m = arg_parser.get_int("m");
|
||||
ck_tile::index_t n = arg_parser.get_int("n");
|
||||
ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
|
||||
if(x_stride < 0)
|
||||
x_stride = n;
|
||||
ck_tile::index_t xr_stride = arg_parser.get_int("xr_stride");
|
||||
if(xr_stride < 0)
|
||||
xr_stride = n;
|
||||
ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
|
||||
if(y_stride < 0)
|
||||
y_stride = n;
|
||||
ck_tile::index_t yr_stride = arg_parser.get_int("yr_stride");
|
||||
if(yr_stride < 0)
|
||||
yr_stride = n;
|
||||
float epsilon = arg_parser.get_float("e");
|
||||
std::string prec_i = arg_parser.get_str("prec_i");
|
||||
std::string prec_o = arg_parser.get_str("prec_o");
|
||||
std::string prec_sm = arg_parser.get_str("prec_sm");
|
||||
std::string prec_sy = arg_parser.get_str("prec_sy");
|
||||
if(prec_o == "auto")
|
||||
{
|
||||
prec_o = prec_i;
|
||||
}
|
||||
if(prec_sm == "auto")
|
||||
{
|
||||
prec_sm = "fp32";
|
||||
}
|
||||
if(prec_sy == "auto")
|
||||
{
|
||||
prec_sy = "fp32";
|
||||
}
|
||||
|
||||
int kname = arg_parser.get_int("kname");
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
int warmup = arg_parser.get_int("warmup");
|
||||
int repeat = arg_parser.get_int("repeat");
|
||||
int xbias = arg_parser.get_int("xbias");
|
||||
int fused_add = arg_parser.get_int("fadd");
|
||||
int fused_quant = arg_parser.get_int("fquant");
|
||||
if(fused_quant == 1 && prec_o != "int8" && prec_o != "fp8")
|
||||
{
|
||||
std::cout
|
||||
<< "if fused_quant is 1 or 2, only support \"-prec_o=int8\" or \"-prec_o=fp8\" cases."
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(x_stride >= n);
|
||||
|
||||
using TypeConfig =
|
||||
LayerNormTypeConfig<InDataType, OutDataType, SmoothScaleDataType, YScaleDataType>;
|
||||
|
||||
using XDataType = typename TypeConfig::XDataType;
|
||||
using YDataType = typename TypeConfig::YDataType;
|
||||
using XBiasDataType = typename TypeConfig::XBiasDataType;
|
||||
using GammaDataType = typename TypeConfig::GammaDataType;
|
||||
using BetaDataType = typename TypeConfig::BetaDataType;
|
||||
using XResidualDataType = XDataType;
|
||||
using YResidualDataType = XDataType;
|
||||
|
||||
using MeanDataType =
|
||||
std::conditional_t<SaveMeanVar, typename TypeConfig::MeanDataType, ck_tile::null_type>;
|
||||
using InvStdDataType =
|
||||
std::conditional_t<SaveMeanVar, typename TypeConfig::InvStdDataType, ck_tile::null_type>;
|
||||
|
||||
using ComputeDataType = typename TypeConfig::ComputeDataType;
|
||||
|
||||
// host verify
|
||||
ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
|
||||
ck_tile::HostTensor<XBiasDataType> x_bias_host({n});
|
||||
ck_tile::HostTensor<GammaDataType> gamma_host({n});
|
||||
ck_tile::HostTensor<BetaDataType> beta_host({n});
|
||||
|
||||
ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {xr_stride, 1});
|
||||
ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {yr_stride, 1});
|
||||
|
||||
ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {y_stride, 1});
|
||||
ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {y_stride, 1});
|
||||
|
||||
ck_tile::HostTensor<MeanDataType> mean_host_ref({m});
|
||||
ck_tile::HostTensor<InvStdDataType> invStd_host_ref({m});
|
||||
ck_tile::HostTensor<YScaleDataType> y_scale_host_ref({m});
|
||||
ck_tile::HostTensor<YScaleDataType> y_scale_host_dev({m});
|
||||
|
||||
ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host({n});
|
||||
ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host_dev({n});
|
||||
|
||||
ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
|
||||
ck_tile::FillUniformDistribution<XResidualDataType>{-.5f, .5f}(x_residual_host);
|
||||
ck_tile::FillUniformDistribution<SmoothScaleDataType>{-1.f, 1.f}(sm_scale_host);
|
||||
ck_tile::FillUniformDistribution<XBiasDataType>{-.5f, .5f}(x_bias_host);
|
||||
ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
|
||||
ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
|
||||
|
||||
ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem x_bias_buf(x_bias_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem y_scale_buf(y_scale_host_dev.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem sm_scale_buf(sm_scale_host_dev.get_element_space_size_in_bytes());
|
||||
|
||||
ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes());
|
||||
|
||||
x_buf.ToDevice(x_host.data());
|
||||
x_bias_buf.ToDevice(x_bias_host.data());
|
||||
gamma_buf.ToDevice(gamma_host.data());
|
||||
beta_buf.ToDevice(beta_host.data());
|
||||
x_residual_buf.ToDevice(x_residual_host.data());
|
||||
sm_scale_buf.ToDevice(sm_scale_host.data());
|
||||
|
||||
auto prec_str = [&]() {
|
||||
auto base_str = prec_i;
|
||||
if(prec_i != prec_o)
|
||||
{
|
||||
base_str += "|" + prec_o;
|
||||
}
|
||||
if(fused_quant == 1)
|
||||
{
|
||||
base_str += std::string("(") + prec_sy + ")";
|
||||
}
|
||||
return base_str;
|
||||
}();
|
||||
|
||||
std::cout << "[" << prec_str << "]"
|
||||
<< " m:" << m << ", n:" << n << ", x_stride:" << x_stride
|
||||
<< ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
|
||||
<< ", yr_stride:" << yr_stride << std::flush;
|
||||
|
||||
layernorm2d_fwd_traits traits{
|
||||
prec_i, prec_o, prec_sm, prec_sy, SaveMeanVar, xbias, fused_add, fused_quant};
|
||||
|
||||
layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
|
||||
fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
|
||||
fused_quant == 1 ? sm_scale_buf.GetDeviceBuffer() : nullptr,
|
||||
x_bias_buf.GetDeviceBuffer(),
|
||||
gamma_buf.GetDeviceBuffer(),
|
||||
beta_buf.GetDeviceBuffer(),
|
||||
|
||||
y_buf.GetDeviceBuffer(),
|
||||
fused_add == 1 ? y_residual_buf.GetDeviceBuffer() : nullptr,
|
||||
fused_quant != 0 ? y_scale_buf.GetDeviceBuffer() : nullptr,
|
||||
nullptr, // p_mean, unsupported yet
|
||||
nullptr, // p_invStd, unsupported yet
|
||||
|
||||
epsilon,
|
||||
m,
|
||||
n,
|
||||
x_stride, // x row_stride
|
||||
xr_stride, // x residule row stride
|
||||
y_stride, // y row stride
|
||||
yr_stride}; // y residule row stride
|
||||
|
||||
float ave_time = layernorm2d_fwd(
|
||||
traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
|
||||
|
||||
if(ave_time < 0)
|
||||
{
|
||||
std::cout << " not supported!" << std::endl << std::flush;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(XBiasDataType) * n +
|
||||
sizeof(GammaDataType) * n + sizeof(BetaDataType) * n +
|
||||
sizeof(YDataType) * m * n;
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_validation)
|
||||
{
|
||||
// reference
|
||||
if(xbias != 0)
|
||||
{
|
||||
// add bias before fadd
|
||||
int M = x_host.mDesc.get_lengths()[0];
|
||||
int N = x_host.mDesc.get_lengths()[1];
|
||||
for(int idx_m = 0; idx_m < M; ++idx_m)
|
||||
{
|
||||
for(int idx_n = 0; idx_n < N; ++idx_n)
|
||||
{
|
||||
x_host(idx_m, idx_n) = ck_tile::type_convert<XDataType>(
|
||||
ck_tile::type_convert<ComputeDataType>(x_host(idx_m, idx_n)) +
|
||||
ck_tile::type_convert<ComputeDataType>(x_bias_host(idx_n)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(fused_add != 0)
|
||||
{
|
||||
// fused pre_add/pre_add_store
|
||||
// TODO we accumulate directly to x_host for simplcity here...
|
||||
|
||||
std::transform(x_host.mData.cbegin(),
|
||||
x_host.mData.cend(),
|
||||
x_residual_host.mData.cbegin(),
|
||||
x_host.mData.begin(),
|
||||
[](auto x_, auto r_) {
|
||||
auto o_ = ck_tile::type_convert<ComputeDataType>(x_) +
|
||||
ck_tile::type_convert<ComputeDataType>(r_);
|
||||
return ck_tile::type_convert<XDataType>(o_);
|
||||
});
|
||||
}
|
||||
ck_tile::reference_layernorm2d_fwd<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
MeanDataType,
|
||||
InvStdDataType>(
|
||||
x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
|
||||
|
||||
if(fused_quant != 0)
|
||||
{
|
||||
auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
|
||||
int N_ = acc_.mDesc.get_lengths()[1];
|
||||
if(fused_quant == 1)
|
||||
{
|
||||
for(int n_ = 0; n_ < N_; n_++)
|
||||
{
|
||||
// input smooth outlier
|
||||
acc_(m_, n_) = acc_(m_, n_) *
|
||||
ck_tile::type_convert<ComputeDataType>(sm_scale_host(n_));
|
||||
}
|
||||
}
|
||||
ComputeDataType absmax = static_cast<ComputeDataType>(0);
|
||||
for(int n_ = 0; n_ < N_; n_++)
|
||||
{
|
||||
const auto a = ck_tile::abs(acc_(m_, n_));
|
||||
absmax = a > absmax ? a : absmax;
|
||||
}
|
||||
// printf("cpu:absmax:%f\n", absmax);
|
||||
constexpr ComputeDataType kMaxY =
|
||||
std::is_same<YDataType, ck_tile::fp8_t>::value ? 240.0
|
||||
: std::is_same<YDataType, ck_tile::int8_t>::value ? 127.0
|
||||
: 0.0;
|
||||
ComputeDataType y_scale = absmax / kMaxY;
|
||||
y_scale_host_ref(m_) = ck_tile::type_convert<YScaleDataType>(y_scale);
|
||||
for(int n_ = 0; n_ < N_; n_++)
|
||||
{
|
||||
o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
|
||||
}
|
||||
};
|
||||
|
||||
ck_tile::reference_layernorm2d_fwd<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
MeanDataType,
|
||||
InvStdDataType>(x_host,
|
||||
gamma_host,
|
||||
beta_host,
|
||||
y_host_ref,
|
||||
mean_host_ref,
|
||||
invStd_host_ref,
|
||||
epsilon,
|
||||
dquant_functor);
|
||||
}
|
||||
else
|
||||
{
|
||||
ck_tile::reference_layernorm2d_fwd<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
MeanDataType,
|
||||
InvStdDataType>(
|
||||
x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
|
||||
}
|
||||
|
||||
y_buf.FromDevice(y_host_dev.data());
|
||||
|
||||
ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {yr_stride, 1});
|
||||
if(fused_add == 1)
|
||||
{
|
||||
y_residual_buf.FromDevice(y_residual_host_dev.data());
|
||||
}
|
||||
|
||||
auto [rtol, atol] = get_elimit<OutDataType>();
|
||||
|
||||
if(x_stride == n)
|
||||
{
|
||||
pass = ck_tile::check_err(
|
||||
y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
|
||||
if(fused_add == 1)
|
||||
{
|
||||
pass &= ck_tile::check_err(y_residual_host_dev,
|
||||
x_host,
|
||||
std::string("ADD Error: Incorrect results!"),
|
||||
rtol,
|
||||
atol);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i_r = 0; i_r < m; i_r++)
|
||||
{
|
||||
std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * y_stride,
|
||||
y_host_dev.begin() + i_r * y_stride + n);
|
||||
std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * y_stride,
|
||||
y_host_ref.begin() + i_r * y_stride + n);
|
||||
pass &= ck_tile::check_err(y_host_dev_row,
|
||||
y_host_ref_row,
|
||||
std::string("OUT[") + std::to_string(i_r) +
|
||||
std::string("] Error: Incorrect results!"),
|
||||
rtol,
|
||||
atol);
|
||||
if(fused_add == 1)
|
||||
{
|
||||
std::vector<YResidualDataType> y_residual_host_dev_row(
|
||||
y_residual_host_dev.begin() + i_r * yr_stride,
|
||||
y_residual_host_dev.begin() + i_r * yr_stride + n);
|
||||
std::vector<YResidualDataType> y_residual_host_ref_row(
|
||||
x_host.begin() + i_r * yr_stride, x_host.begin() + i_r * yr_stride + n);
|
||||
pass &= ck_tile::check_err(y_residual_host_dev_row,
|
||||
y_residual_host_ref_row,
|
||||
std::string("ADD[") + std::to_string(i_r) +
|
||||
std::string("] Error: Incorrect results!"),
|
||||
rtol,
|
||||
atol);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(fused_quant == 1)
|
||||
{
|
||||
y_scale_buf.FromDevice(y_scale_host_dev.data());
|
||||
pass &= ck_tile::check_err(y_scale_host_dev,
|
||||
y_scale_host_ref,
|
||||
std::string("SCALE Error: Incorrect results!"),
|
||||
rtol,
|
||||
atol);
|
||||
}
|
||||
|
||||
std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
bool dispatch_by_type(int argc, char* argv[])
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return false;
|
||||
|
||||
std::string prec_i = arg_parser.get_str("prec_i");
|
||||
std::string prec_o = arg_parser.get_str("prec_o");
|
||||
std::string prec_sm = arg_parser.get_str("prec_sm");
|
||||
std::string prec_sy = arg_parser.get_str("prec_sy");
|
||||
|
||||
if(prec_o == "auto")
|
||||
{
|
||||
prec_o = prec_i;
|
||||
}
|
||||
if(prec_sm == "auto")
|
||||
{
|
||||
prec_sm = "fp32";
|
||||
}
|
||||
if(prec_sy == "auto")
|
||||
{
|
||||
prec_sy = "fp32";
|
||||
}
|
||||
int save_mv = arg_parser.get_int("save_mv");
|
||||
|
||||
// no dynamic quant case
|
||||
if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" && save_mv)
|
||||
{
|
||||
return run<ck_tile::half_t, ck_tile::half_t, float, float, true>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" &&
|
||||
!save_mv)
|
||||
{
|
||||
return run<ck_tile::half_t, ck_tile::half_t, float, float, false>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
|
||||
save_mv)
|
||||
{
|
||||
return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
|
||||
!save_mv)
|
||||
{
|
||||
return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
|
||||
// dynamic quant case, only in inference
|
||||
else if(prec_i == "fp16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
|
||||
!save_mv)
|
||||
{
|
||||
return run<ck_tile::half_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(prec_i == "bf16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
|
||||
!save_mv)
|
||||
{
|
||||
return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(prec_i == "fp16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
|
||||
!save_mv)
|
||||
{
|
||||
return run<ck_tile::half_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(prec_i == "bf16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
|
||||
!save_mv)
|
||||
{
|
||||
return run<ck_tile::bf16_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
int run_layernorm2d_fwd_combinations(std::string const& data_type)
|
||||
{
|
||||
constexpr size_t PARAM_COUNT = 20;
|
||||
char bufs[PARAM_COUNT][64];
|
||||
char* argv[PARAM_COUNT];
|
||||
|
||||
for(std::size_t i = 0; i < PARAM_COUNT; i++)
|
||||
{
|
||||
argv[i] = bufs[i];
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::string>> fquant = {
|
||||
{}, {"-fquant=1", "-prec_o=int8"}, {"-fquant=1", "-prec_o=fp8"}};
|
||||
|
||||
std::vector<std::string> fadd = {"-fadd=0", "-fadd=1"};
|
||||
|
||||
std::vector<std::vector<std::string>> params = {
|
||||
{"-m=99", "-n=13"},
|
||||
{"-m=17", "-n=16"},
|
||||
{"-m=1", "-n=100"},
|
||||
{"-m=4", "-n=128"},
|
||||
{"-m=80", "-n=127"},
|
||||
{"-m=22", "-n=255 -stride=256"},
|
||||
{"-m=7", "-n=599"},
|
||||
{"-m=19", "-n=512"},
|
||||
{"-m=33", "-n=313 -stride=1000"},
|
||||
{"-m=11", "-n=510"},
|
||||
{"-m=171", "-n=676 -stride=818"},
|
||||
{"-m=91", "-n=636"},
|
||||
{"-m=12", "-n=768 -stride=800"},
|
||||
{"-m=100", "-n=766 -stride=812"},
|
||||
{"-m=31", "-n=1024"},
|
||||
{"-m=64", "-n=1000 -stride=1004"},
|
||||
{"-m=8", "-n=1501"},
|
||||
{"-m=3", "-n=1826"},
|
||||
{"-m=5", "-n=2040"},
|
||||
{"-m=7", "-n=2734"},
|
||||
{"-m=1", "-n=3182"},
|
||||
{"-m=9", "-n=4096"},
|
||||
{"-m=3", "-n=8192"},
|
||||
{"-m=3", "-n=9120"},
|
||||
{"-m=1", "-n=10547"},
|
||||
};
|
||||
|
||||
bool result = true;
|
||||
int argc = 0;
|
||||
std::vector<int> argc_stack;
|
||||
std::string pr_i = "-prec_i=" + data_type;
|
||||
strncpy(bufs[argc++], "layernorm2d_fwd", 64);
|
||||
strncpy(bufs[argc++], pr_i.c_str(), 64);
|
||||
argc_stack.push_back(argc);
|
||||
for(size_t fquant_idx = 0; fquant_idx < fquant.size(); fquant_idx++)
|
||||
{
|
||||
argc = argc_stack.back();
|
||||
for(size_t j = 0; j < fquant[fquant_idx].size(); j++)
|
||||
{
|
||||
strncpy(bufs[argc++], fquant[fquant_idx][j].c_str(), 64);
|
||||
}
|
||||
argc_stack.push_back(argc);
|
||||
for(size_t fadd_idx = 0; fadd_idx < fadd.size(); fadd_idx++)
|
||||
{
|
||||
argc = argc_stack.back();
|
||||
strncpy(bufs[argc++], fadd[fadd_idx].c_str(), 64);
|
||||
argc_stack.push_back(argc);
|
||||
for(size_t param_idx = 0; param_idx < params.size(); param_idx++)
|
||||
{
|
||||
argc = argc_stack.back();
|
||||
for(size_t j = 0; j < params[param_idx].size(); j++)
|
||||
{
|
||||
strncpy(bufs[argc++], params[param_idx][j].c_str(), 64);
|
||||
}
|
||||
|
||||
result = dispatch_by_type(argc, argv) && result;
|
||||
}
|
||||
argc_stack.pop_back();
|
||||
}
|
||||
argc_stack.pop_back();
|
||||
}
|
||||
argc_stack.pop_back();
|
||||
return result ? 0 : -1;
|
||||
}
|
||||
6
test/ck_tile/layernorm2d/layernorm2d_fwd_bf16.cpp
Normal file
6
test/ck_tile/layernorm2d/layernorm2d_fwd_bf16.cpp
Normal file
@@ -0,0 +1,6 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "layernorm2d_fwd.inc"
|
||||
|
||||
int main() { return run_layernorm2d_fwd_combinations("bf16"); }
|
||||
6
test/ck_tile/layernorm2d/layernorm2d_fwd_fp16.cpp
Normal file
6
test/ck_tile/layernorm2d/layernorm2d_fwd_fp16.cpp
Normal file
@@ -0,0 +1,6 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "layernorm2d_fwd.inc"
|
||||
|
||||
int main() { return run_layernorm2d_fwd_combinations("fp16"); }
|
||||
Reference in New Issue
Block a user