diff --git a/CHANGELOG.md b/CHANGELOG.md index 44d0837b40..1223b63be0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/). -## Composable Kernel 1.1.0 for ROCm 7.2.0 +## Composable Kernel 1.2.0 for ROCm 7.2.0 ### Added * Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 03bde86421..a9ae0b2a6a 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -105,7 +105,7 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -list(APPEND gpu_list_tf32 gfx942) +list(APPEND gpu_list_tf32 gfx942 gfx950) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0) diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index 4f174bfcbb..791d81e264 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -21,7 +21,7 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -list(APPEND gpu_list_tf32 gfx942) +list(APPEND gpu_list_tf32 gfx942 gfx950) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0) diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp index d82b56ec00..2a972c13eb 100644 --- a/example/09_convnd_fwd/convnd_fwd_common.hpp +++ b/example/09_convnd_fwd/convnd_fwd_common.hpp @@ -77,7 +77,7 @@ inline __host__ __device__ constexpr double get_atol() { if constexpr(std::is_same_v && std::is_same_v) { - return 1e-2; + return 1e-3; } else if constexpr(std::is_same_v) { diff --git a/example/15_grouped_gemm/CMakeLists.txt b/example/15_grouped_gemm/CMakeLists.txt index 20cbc5fdca..20d9bab7e1 100644 --- a/example/15_grouped_gemm/CMakeLists.txt +++ b/example/15_grouped_gemm/CMakeLists.txt @@ -33,3 +33,13 @@ if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp) add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4) endif() + +list(APPEND gpu_list_tf32 gfx942 gfx950) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0) + add_example_executable(example_grouped_gemm_xdl_fp32_tf32 grouped_gemm_xdl_fp32_tf32.cpp) + add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32_tf32) + set(target 1) + endif() +endforeach() diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp new file mode 100644 index 0000000000..c9a3ede151 --- /dev/null +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#define EXAMPLE_WITH_COMPUTE_DATATYPE + +template +using S = ck::Sequence; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; +using ComputeDataType = ck::tf32_t; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl + // clang-format off +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4, ck::LoopScheduler::Default, ComputeDataType>; +// clang-format on + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } + +#undef EXAMPLE_WITH_COMPUTE_DATATYPE diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 87ccebc3c4..62f0f3673d 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -3,6 +3,11 @@ #pragma once +// use macro to minimize code change +#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE +using ComputeDataType = AccDataType; +#endif + struct ProblemSize final { std::vector Ms; @@ -231,7 +236,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co AccDataType, AElementOp, BElementOp, - CDEElementOp>; + CDEElementOp, + ComputeDataType>; for(std::size_t i = 0; i < gemm_descs.size(); i++) { @@ -253,7 +259,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]); #else - pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]); + pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]); #endif } } diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp index 9155b27dba..cf05abd51c 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.cpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp @@ -47,7 +47,7 @@ static constexpr inline auto is_row_major(Layout layout_) // mfma_type, 0:32x32, 1:16x16 template -auto shuffle_b(const ck_tile::HostTensor& t) +auto shuffle_b_v0(const ck_tile::HostTensor& t) { assert(t.get_lengths().size() == 2); int n_ = t.get_lengths()[1]; diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index 69bf39f670..4063fe284e 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -103,7 +103,7 @@ int run_flatmm_example_with_layouts(int argc, } else { - return shuffle_b(b_origin_host); + return shuffle_b_v0(b_origin_host); } }(); ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes()); diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp index 0c4f056a46..53f4c27399 100644 --- a/include/ck/host_utility/device_prop.hpp +++ b/include/ck/host_utility/device_prop.hpp @@ -129,7 +129,10 @@ inline bool is_wmma_supported() return is_gfx103_supported() || is_gfx11_supported() || is_gfx12_supported(); } -inline bool is_tf32_supported() { return (ck::get_device_name() == "gfx942") ? true : false; } +inline bool is_tf32_supported() +{ + return ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"; +} } // namespace ck #endif diff --git a/include/ck/library/utility/check_err.hpp b/include/ck/library/utility/check_err.hpp index 3637053e14..fccd5c8e75 100644 --- a/include/ck/library/utility/check_err.hpp +++ b/include/ck/library/utility/check_err.hpp @@ -168,8 +168,8 @@ typename std::enable_if< check_err(const Range& out, const RefRange& ref, const std::string& msg = "Error: Incorrect results!", - double rtol = 1e-5, - double atol = 3e-5) + double rtol = 5e-4, + double atol = 5e-4) { if(out.size() != ref.size()) { diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp index 52632785bd..cf9992942e 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp @@ -94,7 +94,8 @@ template + typename CElementwiseOperation, + typename ComputeDataType = ADataType> struct DeviceGroupedGemm : public BaseOperator { static constexpr index_t NumDTensor = DsDataType::Size(); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp index 02639dbf3e..9d8f7e3bf7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp @@ -795,7 +795,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK + LoopScheduler LoopSched = make_default_loop_scheduler(), + typename ComputeDataType = ADataType> struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm + CDEElementwiseOperation, + ComputeDataType> { using DeviceOp = DeviceGroupedGemm_Xdl; GET_NXDL_PER_WAVE_IMPL @@ -233,8 +235,6 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm; using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); - using ComputeDataType = ADataType; - // GridwiseGemm template using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle< diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp index 4fcf717e80..f775f99a65 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp @@ -1145,6 +1145,22 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3 InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set> __device__ static bool constexpr IsValidCompilationParameter() { + enum struct Arch : bool + { +#if defined(__gfx950__) + is_gfx950_build = true, +#else + is_gfx950_build = false, +#endif + }; + + // skip building the instances with K1>=32 on pre-gfx950 + if constexpr((static_cast(Arch::is_gfx950_build) == false) && + (AK1Number >= 32 || BK1Number >= 32)) + { + return false; + } + constexpr bool valid = ck::tensor_operation::device::IsValidGemmCompilationParameter< BlockSize, MPerBlock, diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index ce2d9299f9..0817cf9856 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -80,8 +80,10 @@ enum struct MfmaInstr mfma_f32_16x16x128f8f6f4, mfma_scale_f32_32x32x64f8f6f4, mfma_scale_f32_16x16x128f8f6f4, - mfma_f32_16x16x8xf32, // tf32 - mfma_f32_32x32x4xf32, + mfma_f32_16x16x8xf32, // tf32 on gfx942 + mfma_f32_32x32x4xf32, // tf32 on gfx942 + mfma_f32_16x16x32xf32, // bf16x3 simulate tf32 on gfx950 + mfma_f32_32x32x16xf32, // bf16x3 simulate tf32 on gfx950 // gfx11 wmma_f32_16x16x16_f16, wmma_f32_16x16x16_bf16, @@ -1015,6 +1017,51 @@ struct mfma_type } }; +template <> +struct mfma_type +{ + // gfx950 specific: use bf16x3 simulate tf32 + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 8; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x16xf32::Run(a, b, reg_c); + } +}; +template <> +struct mfma_type +{ + // gfx950 specific: use bf16x3 simulate tf32 + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 8; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x32xf32::Run(a, b, reg_c); + } +}; + // gfx11 struct mfma_type_gfx11_base { @@ -1275,12 +1322,14 @@ struct MfmaSelector } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() { #if defined(__gfx12__) return MfmaInstr::wmma_unsupport_16x16_gfx12; #elif defined(__gfx11__) return MfmaInstr::wmma_unsupport_16x16_gfx11; +#elif defined(__gfx950__) + return MfmaInstr::mfma_f32_32x32x16xf32; #elif defined(__gfx942__) return MfmaInstr::mfma_f32_32x32x4xf32; #else @@ -1289,12 +1338,14 @@ struct MfmaSelector } template <> - constexpr auto GetMfma() + constexpr auto GetMfma() { #if defined(__gfx12__) return MfmaInstr::wmma_unsupport_16x16_gfx12; #elif defined(__gfx11__) return MfmaInstr::wmma_unsupport_16x16_gfx11; +#elif defined(__gfx950__) + return MfmaInstr::mfma_f32_16x16x32xf32; #elif defined(__gfx942__) return MfmaInstr::mfma_f32_16x16x8xf32; #else @@ -2185,6 +2236,10 @@ struct XdlopsGemm (is_same::value && KPack <= 8) || ((is_same::value || is_same::value) && KPack < 32) || is_same::value) +#if defined(__gfx950__) + // tf32 on gfx950 is implemented as bf16x3, so it should be treated as bf16. + || (is_same::value && KPack <= 4) +#endif ? true : false; static constexpr auto mfma = MfmaSelector +__device__ __forceinline__ void +convert_float_to_bf16_pairs(const vector_type& reg_f32, + vector_type& reg_bf16_big, + vector_type& reg_bf16_small) +{ + static_for<0, VecSize, 1>{}([&](auto k) { + using IK = Number; + reg_bf16_big.template AsType()(k) = + type_convert(reg_f32.template AsType()[IK{}]); + reg_bf16_small.template AsType()(k) = type_convert( + reg_f32.template AsType()[IK{}] - + type_convert(reg_bf16_big.template AsType()[IK{}])); + }); +} +/* */ + // fp32 template struct intrin_mfma_f32_32x32x1f32; @@ -1636,7 +1655,7 @@ struct intrin_mfma_f32_16x16x32bf8f8<16, 16> } }; -/******************* tf32 *************************************/ +/******************* tf32 on gfx942 *************************************/ template struct intrin_mfma_f32_16x16x8xf32; @@ -1646,7 +1665,7 @@ struct intrin_mfma_f32_16x16x8xf32<16, 16> template __device__ static void Run(const float2_t& reg_a, const float2_t& reg_b, FloatC& reg_c) { -#if defined(__gfx94__) +#if defined(__gfx942__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x8_xf32( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); #else @@ -1666,7 +1685,7 @@ struct intrin_mfma_f32_32x32x4xf32<32, 32> template __device__ static void Run(const float2_t& reg_a, const float2_t& reg_b, FloatC& reg_c) { -#if defined(__gfx94__) +#if defined(__gfx942__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x4_xf32( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); #else @@ -1677,4 +1696,102 @@ struct intrin_mfma_f32_32x32x4xf32<32, 32> } }; +/******************* tf32/xf32 on gfx950 ********************************/ +/* bf16x3 simulate tf32/xf32: input/output/accumulator are all float; */ +/* step: */ +/* 1. separate one input to 2 bf16 registers: */ +/* in_bf16_big = f32_to_bf16(in_f32) */ +/* in_bf16_small = in_f32 - in_bf16_big */ +/* 2. run 3 xdlops gemm: the accumulator of each gemm is the same. */ +/* out_f32 = A_bf16_big * B_bf16_big */ +/* out_f32 += A_bf16_small * B_bf16_big */ +/* out_f32 += A_bf16_big * B_bf16_small */ +/************************************************************************/ +template +struct intrin_mfma_f32_16x16x32xf32; + +template <> +struct intrin_mfma_f32_16x16x32xf32<16, 16> +{ + template + __device__ static void Run(const float8_t& reg_a, const float8_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + using I0 = Number<0>; + vector_type reg_a_v(reg_a); + vector_type reg_b_v(reg_b); + + vector_type v_reg_a_bf16_big; + vector_type v_reg_a_bf16_small; + vector_type v_reg_b_bf16_big; + vector_type v_reg_b_bf16_small; + + convert_float_to_bf16_pairs(reg_a_v, v_reg_a_bf16_big, v_reg_a_bf16_small); + convert_float_to_bf16_pairs(reg_b_v, v_reg_b_bf16_big, v_reg_b_bf16_small); + + // Run 3 times: big*big, small*big, big*small + intrin_mfma_f32_16x16x32bf16<16, 16>::Run( + v_reg_a_bf16_small.template AsType()[I0{}], + v_reg_b_bf16_big.template AsType()[I0{}], + reg_c); + intrin_mfma_f32_16x16x32bf16<16, 16>::Run( + v_reg_a_bf16_big.template AsType()[I0{}], + v_reg_b_bf16_small.template AsType()[I0{}], + reg_c); + intrin_mfma_f32_16x16x32bf16<16, 16>::Run( + v_reg_a_bf16_big.template AsType()[I0{}], + v_reg_b_bf16_big.template AsType()[I0{}], + reg_c); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif // defined(__gfx950__) + } +}; + +template +struct intrin_mfma_f32_32x32x16xf32; + +template <> +struct intrin_mfma_f32_32x32x16xf32<32, 32> +{ + template + __device__ static void Run(const float8_t& reg_a, const float8_t& reg_b, FloatC& reg_c) + { +#if defined(__gfx950__) + using I0 = Number<0>; + vector_type reg_a_v(reg_a); + vector_type reg_b_v(reg_b); + + vector_type v_reg_a_bf16_big; + vector_type v_reg_a_bf16_small; + vector_type v_reg_b_bf16_big; + vector_type v_reg_b_bf16_small; + + convert_float_to_bf16_pairs(reg_a_v, v_reg_a_bf16_big, v_reg_a_bf16_small); + convert_float_to_bf16_pairs(reg_b_v, v_reg_b_bf16_big, v_reg_b_bf16_small); + + // Run 3 times: big*big, small*big, big*small + intrin_mfma_f32_32x32x16bf16<32, 32>::Run( + v_reg_a_bf16_small.template AsType()[I0{}], + v_reg_b_bf16_big.template AsType()[I0{}], + reg_c); + intrin_mfma_f32_32x32x16bf16<32, 32>::Run( + v_reg_a_bf16_big.template AsType()[I0{}], + v_reg_b_bf16_small.template AsType()[I0{}], + reg_c); + intrin_mfma_f32_32x32x16bf16<32, 32>::Run( + v_reg_a_bf16_big.template AsType()[I0{}], + v_reg_b_bf16_big.template AsType()[I0{}], + reg_c); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif // defined(__gfx950__) + } +}; + +/******************* tf32/xf32 on gfx950 end ************************************/ } // namespace ck diff --git a/include/ck_tile/core/tensor/tensor_adaptor_coordinate.hpp b/include/ck_tile/core/tensor/tensor_adaptor_coordinate.hpp index 0d398d4237..d7b9a466ef 100644 --- a/include/ck_tile/core/tensor/tensor_adaptor_coordinate.hpp +++ b/include/ck_tile/core/tensor/tensor_adaptor_coordinate.hpp @@ -12,6 +12,7 @@ #include "ck_tile/core/container/multi_index.hpp" #include "ck_tile/core/numeric/math.hpp" #include "ck_tile/core/utility/type_traits.hpp" +#include "ck_tile/core/utility/print.hpp" namespace ck_tile { @@ -254,4 +255,115 @@ CK_TILE_HOST_DEVICE constexpr bool adaptor_coordinate_is_valid(const Adaptor& ad adaptor_coordinate_is_valid_assuming_top_index_is_valid(adaptor, coord); } +namespace detail { +template , typename SUFFIX = str_literal<>> +struct CK_PRINT_X_; + +template +struct CK_PRINT_X_, str_literal> +{ + template + struct detail; + template + struct detail< + tensor_adaptor_coordinate> + { + using coord_t = + tensor_adaptor_coordinate; + + template + CK_TILE_HOST_DEVICE static constexpr auto get_hidden_format_i() + { + constexpr bool is_bottom = + sequence_any_of(BottomDimensionHiddenIds{}, [](auto b) { return b == I; }); + constexpr bool is_top = + sequence_any_of(TopDimensionHiddenIds{}, [](auto t) { return t == I; }); + constexpr auto d = make_str_literal("%d"); + if constexpr(is_bottom && is_top) + return make_str_literal("_^") + d; + else if constexpr(is_bottom) + return make_str_literal("_") + d; + else if constexpr(is_top) + return make_str_literal("^") + d; + else + return d; + } + template + CK_TILE_HOST_DEVICE static constexpr auto get_hidden_format() + { + constexpr auto sep = make_str_literal(" "); + if constexpr(N == 0) + return str_literal<>{}; + else + return get_hidden_format() + sep + get_hidden_format_i(); + } + CK_TILE_HOST_DEVICE static constexpr auto get_format() + { + constexpr auto d = make_str_literal("%d"); + constexpr auto sep = make_str_literal(" "); + constexpr auto bottom_fmt = + d.template duplicate_n(sep); + constexpr auto top_fmt = d.template duplicate_n(sep); + constexpr auto hidden_fmt = get_hidden_format(); + return make_str_literal("[ __") + bottom_fmt + make_str_literal("__ | ^^") + top_fmt + + make_str_literal("^^ | ") + hidden_fmt + make_str_literal(" ]"); + } + CK_TILE_HOST_DEVICE static constexpr index_t get_num_values() + { + return BottomDimensionHiddenIds::size() + TopDimensionHiddenIds::size() + NDimHidden; + } + + CK_TILE_HOST_DEVICE static constexpr auto get_values(const coord_t& coord) + { + return container_concat( + coord.get_bottom_index(), coord.get_top_index(), coord.get_hidden_index()); + } + }; + + CK_TILE_HOST_DEVICE static constexpr auto get_prefix() + { + constexpr auto fmt_tid = make_str_literal("tid %03d: "); + if constexpr(sizeof...(PREFIXChars) == 0) + return fmt_tid; + else + return fmt_tid + make_str_literal(" ") + str_literal{}; + } + CK_TILE_HOST_DEVICE static constexpr auto get_suffix() + { + constexpr auto lf = make_str_literal("\n"); + if constexpr(sizeof...(SUFFIXChars) == 0) + return lf; + else + return str_literal{} + lf; + } + + template + CK_TILE_HOST_DEVICE void impl(str_literal, + const TArgs& targs, + std::integer_sequence, + Args&&... args) const + { + constexpr auto fmt_wrap_v = get_prefix() + str_literal{} + get_suffix(); +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wformat-nonliteral" + printf(fmt_wrap_v.data, get_thread_id(), args..., targs.at(number())...); +#pragma clang diagnostic pop + } + template + CK_TILE_HOST_DEVICE void operator()(T&& x, Args&&... args) const + { + using detail_t = detail>; + impl(detail_t::get_format(), + detail_t::get_values(std::forward(x)), + std::make_integer_sequence{}, + std::forward(args)...); + } +}; +} // namespace detail + +template +CK_TILE_HOST_DEVICE void print(const tensor_adaptor_coordinate& coord) +{ + detail::CK_PRINT_X_<>{}(coord); +} } // namespace ck_tile diff --git a/include/ck_tile/core/tensor/tensor_coordinate.hpp b/include/ck_tile/core/tensor/tensor_coordinate.hpp index 9b8fe731fd..a51da9f844 100644 --- a/include/ck_tile/core/tensor/tensor_coordinate.hpp +++ b/include/ck_tile/core/tensor/tensor_coordinate.hpp @@ -89,4 +89,9 @@ CK_TILE_HOST_DEVICE constexpr bool coordinate_has_valid_offset(const TensorDesc& return adaptor_coordinate_is_valid(tensor_desc, coord); } +template +CK_TILE_HOST_DEVICE void print(const tensor_coordinate& coord) +{ + print(static_cast::Base>(coord)); +} } // namespace ck_tile diff --git a/include/ck_tile/core/utility/debug.hpp b/include/ck_tile/core/utility/debug.hpp index 9f0f931bc8..581b095383 100644 --- a/include/ck_tile/core/utility/debug.hpp +++ b/include/ck_tile/core/utility/debug.hpp @@ -7,6 +7,8 @@ #include #include "ck_tile/core/numeric/integer.hpp" +#include "ck_tile/core/utility/print.hpp" +#include "ck_tile/core/arch/arch.hpp" namespace ck_tile { template @@ -18,48 +20,6 @@ template { } -template -struct str_literal -{ - static constexpr const char data[] = {Xs..., '\0'}; - static constexpr const size_t size = sizeof...(Xs); - - template - CK_TILE_HOST_DEVICE constexpr auto operator+(str_literal /*rhs*/) const - { - return str_literal{}; - } - - template - CK_TILE_HOST_DEVICE static constexpr auto duplicate_n(const str_literal sep) - { - if constexpr(N == 0) - return str_literal<>{}; - else if constexpr(N == 1) - return str_literal{}; - else - return duplicate_n(sep) + str_literal{}; - } -}; - -#define make_str_literal(lit_) \ - std::apply([](auto... indices) { return str_literal<(lit_)[decltype(indices)::value]...>{}; }, \ - makeTuple(std::make_index_sequence())) - -template -constexpr std::tuple...> -makeTuple(std::index_sequence) noexcept -{ - return {}; -} -constexpr size_t constexpr_strlen(const char* c) -{ - size_t t = 0; - while(*c++) - ++t; - return t; -} - template struct static_distributed_tensor; @@ -79,17 +39,29 @@ struct CK_PRINTF> { template - CK_TILE_HOST_DEVICE static constexpr auto default_format() + CK_TILE_HOST_DEVICE static constexpr auto default_format_and_type() { if constexpr(std::is_same_v) - return make_str_literal("%8.3f"); + return std::make_tuple(make_str_literal("%8.3f"), T{}); else if constexpr(std::is_same_v) - return make_str_literal("%5d"); + return std::make_tuple(make_str_literal("%5d"), T{}); else if constexpr(std::is_same_v) - return make_str_literal("%5u"); + return std::make_tuple(make_str_literal("%5u"), T{}); + else if constexpr(sizeof(T) == 1) + return std::make_tuple(make_str_literal("0x%02hhx"), uint8_t{}); + else if constexpr(sizeof(T) == 2) + return std::make_tuple(make_str_literal("0x%04hx"), uint16_t{}); + else if constexpr(sizeof(T) == 4) + return std::make_tuple(make_str_literal("0x%08x"), uint32_t{}); else - return make_str_literal("0x%08x"); + static_assert(false, "Unsupported type"); } + template + using default_format_t = + std::remove_reference_t(default_format_and_type()))>; + template + using default_type_t = + std::remove_reference_t(default_format_and_type()))>; CK_TILE_HOST_DEVICE static constexpr auto get_prefix() { @@ -108,49 +80,58 @@ struct CK_PRINTF{} + lf; } - template + template CK_TILE_HOST_DEVICE void impl(const thread_buffer& buf, - std::integer_sequence) const + std::integer_sequence, + Args&&... args) const { - using FMT1 = std::conditional_t()), - str_literal>; + using FMT1 = std:: + conditional_t, str_literal>; constexpr auto fmt_v = FMT1::template duplicate_n(make_str_literal(" ")); constexpr auto fmt_wrap_v = get_prefix() + fmt_v + get_suffix(); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" - printf(fmt_wrap_v.data, get_thread_id(), N, type_convert(buf[Is])...); + printf(fmt_wrap_v.data, + get_thread_id(), + N, + args..., + bit_cast>(type_convert(buf[Is]))...); #pragma clang diagnostic pop } - template - CK_TILE_HOST_DEVICE void operator()(const thread_buffer& buf) const + template + CK_TILE_HOST_DEVICE void operator()(const thread_buffer& buf, Args&&... args) const { using ConvertTo_ = std::conditional_t, T, ConvertTo>; - impl(buf, std::make_integer_sequence{}); + impl( + buf, std::make_integer_sequence{}, std::forward(args)...); } - template - CK_TILE_HOST_DEVICE void operator()(const static_distributed_tensor& tensor) const + template + CK_TILE_HOST_DEVICE void operator()(const static_distributed_tensor& tensor, + Args&&... args) const { - return operator()(tensor.get_thread_buffer()); + return operator()(tensor.get_thread_buffer(), std::forward(args)...); } }; -template , - typename PREFIX = str_literal<>, - typename SUFFIX = str_literal<>> -struct CK_PRINTF_WARP0 : public CK_PRINTF +template +CK_TILE_HOST_DEVICE void print_warp0(T&& x) { - using base_t = CK_PRINTF; + if(get_thread_id() < get_warp_size()) + print(std::forward(x)); +} +template +struct CK_PRINTF_WARP0 : public CK_PRINTF +{ + using base_t = CK_PRINTF; - template - CK_TILE_HOST_DEVICE void operator()(const T& buf) const + template + CK_TILE_HOST_DEVICE void operator()(const T& buf, Args&&... args) const { if(get_thread_id() < get_warp_size()) - base_t::operator()(buf); + base_t::operator()(buf, std::forward(args)...); } }; diff --git a/include/ck_tile/core/utility/print.hpp b/include/ck_tile/core/utility/print.hpp index 04635959af..b7279a1ef2 100644 --- a/include/ck_tile/core/utility/print.hpp +++ b/include/ck_tile/core/utility/print.hpp @@ -7,6 +7,51 @@ namespace ck_tile { +namespace str_literal_detail { +template +constexpr std::tuple...> +makeTuple(std::index_sequence) noexcept +{ + return {}; +} +constexpr size_t constexpr_strlen(const char* c) +{ + size_t t = 0; + while(*c++) + ++t; + return t; +} +} // namespace str_literal_detail + +template +struct str_literal +{ + static constexpr const char data[] = {Xs..., '\0'}; + static constexpr const size_t size = sizeof...(Xs); + + template + CK_TILE_HOST_DEVICE constexpr auto operator+(str_literal /*rhs*/) const + { + return str_literal{}; + } + + template + CK_TILE_HOST_DEVICE static constexpr auto duplicate_n(const str_literal sep) + { + if constexpr(N == 0) + return str_literal<>{}; + else if constexpr(N == 1) + return str_literal{}; + else + return duplicate_n(sep) + str_literal{}; + } +}; + +#define make_str_literal(lit_) \ + std::apply([](auto... indices) { return str_literal<(lit_)[decltype(indices)::value]...>{}; }, \ + str_literal_detail::makeTuple( \ + std::make_index_sequence())) + /// Declare a ck_tile::print() interface that gets specialized in each header file for types that /// can be printed. template diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp index a53a4a499e..7523acc080 100644 --- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp @@ -662,17 +662,21 @@ struct FlatmmKernel const auto scale_m_view = make_naive_tensor_view( kargs.scale_m_ptr.ptr, - make_tuple( - kargs.M / ScaleGranularityM, - ScaleGranularityKA == 0 ? 1 : splitk_batch_offset.splitted_k / ScaleGranularityKA), + make_tuple(kargs.M / ScaleGranularityM, + ScaleGranularityKA == 0 + ? 1 + : splitk_batch_offset.splitted_k / + (ScaleGranularityKA != 0 ? ScaleGranularityKA : 1)), make_tuple(scale_stride_m, 0), number < ScaleGranularityM == 1 ? FlatmmPipeline::GetVectorSizeA() : 1 > {}, number<1>{}); const auto scale_n_view = make_naive_tensor_view( kargs.scale_n_ptr.ptr, - make_tuple( - ScaleGranularityKB == 0 ? 1 : (splitk_batch_offset.splitted_k / ScaleGranularityKB), - kargs.N / ScaleGranularityN), + make_tuple(ScaleGranularityKB == 0 + ? 1 + : (splitk_batch_offset.splitted_k / + (ScaleGranularityKB != 0 ? ScaleGranularityKB : 1)), + kargs.N / ScaleGranularityN), make_tuple(0, scale_stride_n), number < ScaleGranularityN == 1 ? FlatmmPipeline::GetVectorSizeB() : 1 > {}, number<1>{}); diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp index 573571bc07..f47ce05cac 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp @@ -14,6 +14,8 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/host_utility/device_prop.hpp" + #include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/fill.hpp" @@ -92,7 +94,8 @@ struct ReferenceConvFwd : public device::BaseOperator in_right_pads_{input_right_pads}, in_element_op_{in_element_op}, wei_element_op_{wei_element_op}, - out_element_op_{out_element_op} + out_element_op_{out_element_op}, + device_name_{ck::get_device_name()} { } @@ -112,6 +115,7 @@ struct ReferenceConvFwd : public device::BaseOperator InElementwiseOperation in_element_op_; WeiElementwiseOperation wei_element_op_; OutElementwiseOperation out_element_op_; + ::std::string device_name_; // the device which this conv is compared with }; struct Invoker : public device::BaseInvoker @@ -251,10 +255,39 @@ struct ReferenceConvFwd : public device::BaseOperator x); if constexpr(is_same_v) { - v_acc += ck::type_convert( - ck::type_convert(v_in)) * - ck::type_convert( - ck::type_convert(v_wei)); + if(arg.device_name_ == "gfx942") + { + v_acc += ck::type_convert( + ck::type_convert(v_in)) * + ck::type_convert( + ck::type_convert(v_wei)); + } + else if(arg.device_name_ == "gfx950") + { + ck::bhalf_t v_in_bf16_big = + ck::type_convert(v_in); + ck::bhalf_t v_in_bf16_small = + ck::type_convert( + v_in - type_convert(v_in_bf16_big)); + ck::bhalf_t v_wei_bf16_big = + ck::type_convert(v_wei); + ck::bhalf_t v_wei_bf16_small = + ck::type_convert( + v_wei - type_convert(v_wei_bf16_big)); + + v_acc += ck::type_convert(v_in_bf16_big) * + ck::type_convert(v_wei_bf16_small) + + ck::type_convert(v_in_bf16_small) * + ck::type_convert(v_wei_bf16_big) + + ck::type_convert(v_in_bf16_big) * + ck::type_convert(v_wei_bf16_big); + } + else + { + throw std::runtime_error( + "Unsupported device: " + arg.device_name_ + + " for tf32 computation"); + } } else { @@ -350,10 +383,41 @@ struct ReferenceConvFwd : public device::BaseOperator x); if constexpr(is_same_v) { - v_acc += ck::type_convert( - ck::type_convert(v_in)) * - ck::type_convert( - ck::type_convert(v_wei)); + if(arg.device_name_ == "gfx942") + { + v_acc += ck::type_convert( + ck::type_convert(v_in)) * + ck::type_convert( + ck::type_convert(v_wei)); + } + else if(arg.device_name_ == "gfx950") + { + ck::bhalf_t v_in_bf16_big = + ck::type_convert(v_in); + ck::bhalf_t v_in_bf16_small = + ck::type_convert( + v_in - type_convert(v_in_bf16_big)); + ck::bhalf_t v_wei_bf16_big = + ck::type_convert(v_wei); + ck::bhalf_t v_wei_bf16_small = + ck::type_convert( + v_wei - + type_convert(v_wei_bf16_big)); + + v_acc += + ck::type_convert(v_in_bf16_big) * + ck::type_convert(v_wei_bf16_small) + + ck::type_convert(v_in_bf16_small) * + ck::type_convert(v_wei_bf16_big) + + ck::type_convert(v_in_bf16_big) * + ck::type_convert(v_wei_bf16_big); + } + else + { + throw std::runtime_error( + "Unsupported device: " + arg.device_name_ + + " for tf32 computation"); + } } else { diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 8b9b973b2d..c5afebf75d 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -6,6 +6,7 @@ #include #include +#include "ck/host_utility/device_prop.hpp" #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -45,7 +46,8 @@ struct ReferenceGemm : public device::BaseOperator c_m_n_{c_m_n}, a_element_op_{a_element_op}, b_element_op_{b_element_op}, - c_element_op_{c_element_op} + c_element_op_{c_element_op}, + device_name_{ck::get_device_name()} { } @@ -56,6 +58,7 @@ struct ReferenceGemm : public device::BaseOperator AElementwiseOperation a_element_op_; BElementwiseOperation b_element_op_; CElementwiseOperation c_element_op_; + ::std::string device_name_; // the device which this gemm is compared with }; // Invoker @@ -142,12 +145,37 @@ struct ReferenceGemm : public device::BaseOperator arg.b_element_op_(v_b, arg.b_k_n_(k, n)); } - if constexpr(is_same_v && - is_same_v) - { // only for tf32 now - v_acc += - ck::type_convert(ck::type_convert(v_a)) * - ck::type_convert(ck::type_convert(v_b)); + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && + is_same_v) + { + if(arg.device_name_ == "gfx942") + { + v_acc += + ck::type_convert(ck::type_convert(v_a)) * + ck::type_convert(ck::type_convert(v_b)); + } + else if(arg.device_name_ == "gfx950") + { + ck::bhalf_t v_a_bf16_big = ck::type_convert(v_a); + ck::bhalf_t v_a_bf16_small = ck::type_convert( + v_a - type_convert(v_a_bf16_big)); + ck::bhalf_t v_b_bf16_big = ck::type_convert(v_b); + ck::bhalf_t v_b_bf16_small = ck::type_convert( + v_b - type_convert(v_b_bf16_big)); + + v_acc += ck::type_convert(v_a_bf16_big) * + ck::type_convert(v_b_bf16_small) + + ck::type_convert(v_a_bf16_small) * + ck::type_convert(v_b_bf16_big) + + ck::type_convert(v_a_bf16_big) * + ck::type_convert(v_b_bf16_big); + } + else + { + throw std::runtime_error("Unsupported device: " + arg.device_name_); + } } else { diff --git a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp index cf30bc7dda..1e024818c6 100644 --- a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp @@ -82,9 +82,27 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) // multiply and accumulate if constexpr(is_same_v && is_same_v) - { // only for tf32 now - v_acc += ck::type_convert(ck::type_convert(v_a)) * - ck::type_convert(ck::type_convert(v_b)); + { +#if defined(__gfx942__) + v_acc += ck::type_convert(ck::type_convert(v_a)) * + ck::type_convert(ck::type_convert(v_b)); +#elif defined(__gfx950__) + ck::bhalf_t v_a_bf16_big = ck::type_convert(v_a); + ck::bhalf_t v_a_bf16_small = + ck::type_convert(v_a - type_convert(v_a_bf16_big)); + ck::bhalf_t v_b_bf16_big = ck::type_convert(v_b); + ck::bhalf_t v_b_bf16_small = + ck::type_convert(v_b - type_convert(v_b_bf16_big)); + + v_acc += ck::type_convert(v_a_bf16_big) * + ck::type_convert(v_b_bf16_small) + + ck::type_convert(v_a_bf16_small) * + ck::type_convert(v_b_bf16_big) + + ck::type_convert(v_a_bf16_big) * + ck::type_convert(v_b_bf16_big); +#else + v_acc += type_convert(v_a) * type_convert(v_b); +#endif } else { diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp index b498aba422..6f6fe0db40 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp @@ -34,6 +34,30 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding; static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave; static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave; +// instances for double rate mfma on gfx950 +template +using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_dr = std::tuple< + // clang-format off + //################################| ALayout| BLayout| DsLayout| ELayout|AData| BData| DsData| EData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + //################################| | | | | Type| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline| Pipeline| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler| Verision| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // Compute friendly + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 64, 32, 32, 32, 32, 4, 4, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 64, 32, 32, 32, 32, 4, 4, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 128, 32, 32, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 256, 128, 32, 32, 16, 16, 8, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 128, 128, 32, 32, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 256, 128, 128, 32, 32, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 256, 128, 32, 32, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 256, 128, 32, 32, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 64, 256, 32, 32, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 64, 256, 32, 32, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 64, 512, 32, 32, 32, 32, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>, + DeviceGemmMultiD_Xdl_CShuffle_V3< Row, Col, Tuple, Row, F8, F8, Tuple, BF16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 64, 64, 512, 32, 32, 32, 32, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8> + // clang-format on + >; + template using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part1 = std::tuple< // clang-format off diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instance_part1.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instance_part1.cpp index f0e0d3d689..c5e34bcc5c 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instance_part1.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instance_part1.cpp @@ -24,6 +24,13 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_inst add_device_operation_instances( instances, device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part1{}); + + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_dr{}); + } } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance_part1.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance_part1.cpp index 686dcb67e1..fca5f27f78 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance_part1.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_kpadding_instance_part1.cpp @@ -24,6 +24,14 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_kpadding_ins add_device_operation_instances( instances, device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part1{}); + + if(ck::get_device_name() == "gfx950") + { + add_device_operation_instances( + instances, + device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_dr< + GemmKPadding>{}); + } } } // namespace instance diff --git a/profiler/include/profiler/common.hpp b/profiler/include/profiler/common.hpp index 2f72e67c6b..21c0525a02 100644 --- a/profiler/include/profiler/common.hpp +++ b/profiler/include/profiler/common.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/data_type_enum.hpp b/profiler/include/profiler/data_type_enum.hpp index bbfb04df7d..cefe240bea 100644 --- a/profiler/include/profiler/data_type_enum.hpp +++ b/profiler/include/profiler/data_type_enum.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp index 537a4703d3..36eaebc4ae 100644 --- a/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp +++ b/profiler/include/profiler/profile_avg_pool2d_bwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp index c97e42228d..6528c89ea8 100644 --- a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp +++ b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp index 4b0b8e5bcb..c15b16e224 100644 --- a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp b/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp index 357ab8d70f..2399f793b3 100644 --- a/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_b_scale_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp index ca0d031dba..c9eddd1553 100644 --- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp index a8571d0779..f8113a6714 100644 --- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp index 0fdda68c4d..11a9da365d 100644 --- a/profiler/include/profiler/profile_batched_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp index cb91d8090d..249e80309b 100644 --- a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp index 183b0e183a..029793bb18 100644 --- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp index e953cc4b66..b09b27bd69 100644 --- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp index bf5a661407..2b0e8c3806 100644 --- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp +++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp index 078f6bff87..9eead80eb2 100644 --- a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp +++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp index c866b88e8a..5ae150f269 100644 --- a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp +++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_contraction_impl.hpp b/profiler/include/profiler/profile_contraction_impl.hpp index 361861a6d1..f720fa6a7f 100644 --- a/profiler/include/profiler/profile_contraction_impl.hpp +++ b/profiler/include/profiler/profile_contraction_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_contraction_utils.hpp b/profiler/include/profiler/profile_contraction_utils.hpp index adfd98a37e..cea16fded0 100644 --- a/profiler/include/profiler/profile_contraction_utils.hpp +++ b/profiler/include/profiler/profile_contraction_utils.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp index 8f7adebdd4..a0f9b9ac25 100644 --- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp index 436fbdbd75..3cda620831 100644 --- a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp +++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp index 808c1a1c90..2a7ee6fd66 100644 --- a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp +++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp index 200409fe61..ae92dc792c 100644 --- a/profiler/include/profiler/profile_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp index 171ae1662b..dc534a17e1 100644 --- a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp +++ b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp index ca08f48bcf..d587c3a6ff 100644 --- a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp +++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp b/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp index 3c511469f2..5396a52e21 100644 --- a/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp +++ b/profiler/include/profiler/profile_gemm_ab_scale_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp index 81b8d8ddbf..2b0d2401bd 100644 --- a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp +++ b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp index 9e4d30142b..c85c04197b 100644 --- a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp +++ b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_add_impl.hpp b/profiler/include/profiler/profile_gemm_add_impl.hpp index 502d2b2951..559cf7a5b2 100644 --- a/profiler/include/profiler/profile_gemm_add_impl.hpp +++ b/profiler/include/profiler/profile_gemm_add_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp index fcb546fe96..72ff9ca5f7 100644 --- a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp +++ b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp index 99076a20ec..c728aa9ddf 100644 --- a/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp +++ b/profiler/include/profiler/profile_gemm_add_relu_add_layernorm_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_add_relu_impl.hpp b/profiler/include/profiler/profile_gemm_add_relu_impl.hpp index e7f4338ef0..27334d66f1 100644 --- a/profiler/include/profiler/profile_gemm_add_relu_impl.hpp +++ b/profiler/include/profiler/profile_gemm_add_relu_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_add_silu_impl.hpp b/profiler/include/profiler/profile_gemm_add_silu_impl.hpp index e8a96208f6..b9de475a87 100644 --- a/profiler/include/profiler/profile_gemm_add_silu_impl.hpp +++ b/profiler/include/profiler/profile_gemm_add_silu_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_b_scale_impl.hpp b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp index 8ca1350523..d3db464559 100644 --- a/profiler/include/profiler/profile_gemm_b_scale_impl.hpp +++ b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp index b265101f3f..1930cf9eb6 100644 --- a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp +++ b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp index b540e938b5..7e593e9e42 100644 --- a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp +++ b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp b/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp index da0dc60760..49fef5a0fc 100644 --- a/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp +++ b/profiler/include/profiler/profile_gemm_blockscale_wp_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp index 0fe8abe242..5add7b6073 100644 --- a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp +++ b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp index 93eac048cd..458cba381e 100644 --- a/profiler/include/profiler/profile_gemm_impl.hpp +++ b/profiler/include/profiler/profile_gemm_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_multi_abd_impl.hpp b/profiler/include/profiler/profile_gemm_multi_abd_impl.hpp index 51922fde33..e94d5bb910 100644 --- a/profiler/include/profiler/profile_gemm_multi_abd_impl.hpp +++ b/profiler/include/profiler/profile_gemm_multi_abd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp index 2711d595d6..ab107cf997 100644 --- a/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp +++ b/profiler/include/profiler/profile_gemm_multiply_add_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp index dbfddeb8a4..76368d9252 100644 --- a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp +++ b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -97,7 +97,7 @@ bool profile_gemm_multiply_multiply_impl(int do_verification, case 1: a_m_k.GenerateTensorValue(GeneratorTensor_2{-1, 2}); b_k_n.GenerateTensorValue(GeneratorTensor_2{-1, 2}); - d0_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_2{-1, 1}); d1_m_n.GenerateTensorValue(GeneratorTensor_2{-1, 1}); break; default: diff --git a/profiler/include/profiler/profile_gemm_multiply_multiply_wp_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_multiply_wp_impl.hpp index 21613e49c6..570aee7f85 100644 --- a/profiler/include/profiler/profile_gemm_multiply_multiply_wp_impl.hpp +++ b/profiler/include/profiler/profile_gemm_multiply_multiply_wp_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_mx_impl.hpp b/profiler/include/profiler/profile_gemm_mx_impl.hpp index 1fbe60c6cf..0f84744fd6 100644 --- a/profiler/include/profiler/profile_gemm_mx_impl.hpp +++ b/profiler/include/profiler/profile_gemm_mx_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_quantization_impl.hpp b/profiler/include/profiler/profile_gemm_quantization_impl.hpp index 02f374164e..24516284a0 100644 --- a/profiler/include/profiler/profile_gemm_quantization_impl.hpp +++ b/profiler/include/profiler/profile_gemm_quantization_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp index c870a95cbe..163cbe204d 100644 --- a/profiler/include/profiler/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp index 744db27675..789a460ca7 100644 --- a/profiler/include/profiler/profile_gemm_splitk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_streamk_impl.hpp index f86e7ad447..01944dbbdd 100644 --- a/profiler/include/profiler/profile_gemm_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_streamk_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp index 99e24cd205..9125da53a9 100644 --- a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp index bee907dd76..7ece78ea7b 100644 --- a/profiler/include/profiler/profile_gemm_universal_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp b/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp index 5ec056efd1..8bf8957343 100644 --- a/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_preshuffle_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp index 554956ee88..f1057529c9 100644 --- a/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp index 035a1b77df..aa879eba9c 100644 --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index 2369b2eac8..b9e463dc1e 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 6654275fd0..bc7ecaa6ca 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp index 2f7f3ae4d8..22ff02676a 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp index 2dbadd8eb1..2a2f516727 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index d490cf4167..427d2b14df 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp index ae12070014..50b97c3bae 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp @@ -1,3 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #pragma once #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp" diff --git a/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp index f05b13b749..227b494266 100644 --- a/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_fastgelu_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp index 8fb20f0135..f551a16a1b 100644 --- a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp index 8314b9053f..03a2ed3186 100644 --- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp index 1b17f05760..f76460ffd8 100644 --- a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp index cf3c3a6bae..282d5f22fd 100644 --- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp index c1647815ad..b9a9d139ac 100644 --- a/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_groupnorm_bwd_data_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp b/profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp index 5e9d3df1b1..216e253737 100644 --- a/profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp +++ b/profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp index 60982d18d5..e984dfcb12 100644 --- a/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp +++ b/profiler/include/profiler/profile_groupnorm_fwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp index 7704085048..8efe709f96 100644 --- a/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_layernorm_bwd_data_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp index e36b20e1b5..b89702a51b 100644 --- a/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp +++ b/profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp index 51dcbb1275..1de8178d3d 100644 --- a/profiler/include/profiler/profile_layernorm_fwd_impl.hpp +++ b/profiler/include/profiler/profile_layernorm_fwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp index a8efee3ef0..3fad040004 100644 --- a/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp +++ b/profiler/include/profiler/profile_max_pool2d_bwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp index cf6050969f..35d0020418 100644 --- a/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp +++ b/profiler/include/profiler/profile_max_pool3d_bwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_permute_scale_impl.hpp b/profiler/include/profiler/profile_permute_scale_impl.hpp index 9ccbd67783..a2ffc48380 100644 --- a/profiler/include/profiler/profile_permute_scale_impl.hpp +++ b/profiler/include/profiler/profile_permute_scale_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp index 962be4448c..389edfbf48 100644 --- a/profiler/include/profiler/profile_pool2d_fwd_impl.hpp +++ b/profiler/include/profiler/profile_pool2d_fwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp index e1d0c1573d..851c3fbad5 100644 --- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp +++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp index 14a93af69d..191c57780c 100644 --- a/profiler/include/profiler/profile_reduce_impl.hpp +++ b/profiler/include/profiler/profile_reduce_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp index d7a790803a..bdaeb52acf 100644 --- a/profiler/include/profiler/profile_softmax_impl.hpp +++ b/profiler/include/profiler/profile_softmax_impl.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/profiler/include/profiler/profile_transpose_impl.hpp b/profiler/include/profiler/profile_transpose_impl.hpp index 0baf2eac99..81ebbcdf17 100644 --- a/profiler/include/profiler/profile_transpose_impl.hpp +++ b/profiler/include/profiler/profile_transpose_impl.hpp @@ -1,178 +1,178 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp" - -#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/literals.hpp" - -namespace ck { -namespace profiler { - -template -void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor) -{ - for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n) - for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c) - for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d) - for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h) - for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w) - { - auto a_val = A_ncdhw(n, c, d, h, w); - functor(B_ndhwc(n, d, h, w, c), a_val); - } -} - -template -bool profile_transpose_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - std::vector lengths) -{ - bool pass = true; - - index_t N = lengths[0]; - index_t C = lengths[1]; - index_t D = lengths[2]; - index_t H = lengths[3]; - index_t W = lengths[4]; - - std::vector ncdhw = {N, C, D, H, W}; - std::vector ndhwc = {N, D, H, W, C}; - Tensor a(ncdhw); - Tensor b(ndhwc); - Tensor host_b(ndhwc); - - // a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - - std::array ab_lengths{N, C, H, W, D}; - std::array a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W - std::array b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C - - std::cout << "A: " << a.mDesc << std::endl; - std::cout << "B: " << b.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: a.GenerateTensorValue(GeneratorTensor_2{-1, 2}); break; - default: a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - } - - using ElementOp = ck::tensor_operation::element_wise::PassThrough; - - DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); - DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); - - a_device_buf.ToDevice(a.mData.data()); - - std::array input = {a_device_buf.GetDeviceBuffer()}; - std::array output = {b_device_buf.GetDeviceBuffer()}; - using DeviceOp = ck::tensor_operation::device:: - DeviceElementwise, ck::Tuple, ElementOp, NumDim>; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << op_ptrs.size() << " instances" << std::endl; - - if(do_verification) - { - host_elementwise4D(host_b, a, ElementOp{}); - } - - std::string best_op_name; - float best_ave_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; - - for(auto& op_ptr : op_ptrs) - { - auto argument_ptr = op_ptr->MakeArgumentPointer( - ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{}); - - auto invoker_ptr = op_ptr->MakeInvokerPointer(); - - if(op_ptr->IsSupportedArgument(argument_ptr.get())) - { - - // re-init C to zero before profiling next kernel - b_device_buf.SetZero(); - - // run for verification - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); - - if(do_verification) - { - b_device_buf.FromDevice(b.mData.data()); - - pass &= ck::utils::check_err( - b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); - - if(do_log) - { - LogRangeAsType(std::cout << "a : ", a.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b: ", b.mData, ",") << std::endl; - } - } - - std::string op_name = op_ptr->GetTypeString(); - - // run for timing purposes - float ave_time = - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t flop = - std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]; - - std::size_t num_btype = - sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) + - sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " - << gb_per_sec << " GB/s, " << op_name << std::endl; - - if(tflops > best_tflops) - { - best_op_name = op_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - } - } - else - { - std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; - } - } - - std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W - << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec - << " GB/s, " << best_op_name << std::endl; - - return pass; -} - -} // namespace profiler -} // namespace ck +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp" + +#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" + +namespace ck { +namespace profiler { + +template +void host_elementwise4D(HostTensorB& B_ndhwc, const HostTensorA& A_ncdhw, Functor functor) +{ + for(std::size_t n = 0; n < A_ncdhw.mDesc.GetLengths()[0]; ++n) + for(std::size_t c = 0; c < A_ncdhw.mDesc.GetLengths()[1]; ++c) + for(std::size_t d = 0; d < A_ncdhw.mDesc.GetLengths()[2]; ++d) + for(std::size_t h = 0; h < A_ncdhw.mDesc.GetLengths()[3]; ++h) + for(std::size_t w = 0; w < A_ncdhw.mDesc.GetLengths()[4]; ++w) + { + auto a_val = A_ncdhw(n, c, d, h, w); + functor(B_ndhwc(n, d, h, w, c), a_val); + } +} + +template +bool profile_transpose_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector lengths) +{ + bool pass = true; + + index_t N = lengths[0]; + index_t C = lengths[1]; + index_t D = lengths[2]; + index_t H = lengths[3]; + index_t W = lengths[4]; + + std::vector ncdhw = {N, C, D, H, W}; + std::vector ndhwc = {N, D, H, W, C}; + Tensor a(ncdhw); + Tensor b(ndhwc); + Tensor host_b(ndhwc); + + // a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + std::array ab_lengths{N, C, H, W, D}; + std::array a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W + std::array b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, D, H, W, C + + std::cout << "A: " << a.mDesc << std::endl; + std::cout << "B: " << b.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: a.GenerateTensorValue(GeneratorTensor_2{-1, 2}); break; + default: a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + using ElementOp = ck::tensor_operation::element_wise::PassThrough; + + DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a.mData.data()); + + std::array input = {a_device_buf.GetDeviceBuffer()}; + std::array output = {b_device_buf.GetDeviceBuffer()}; + using DeviceOp = ck::tensor_operation::device:: + DeviceElementwise, ck::Tuple, ElementOp, NumDim>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + if(do_verification) + { + host_elementwise4D(host_b, a, ElementOp{}); + } + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + ab_lengths, {a_strides}, {b_strides}, input, output, ElementOp{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + + // re-init C to zero before profiling next kernel + b_device_buf.SetZero(); + + // run for verification + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + if(do_verification) + { + b_device_buf.FromDevice(b.mData.data()); + + pass &= ck::utils::check_err( + b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b.mData, ",") << std::endl; + } + } + + std::string op_name = op_ptr->GetTypeString(); + + // run for timing purposes + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = + std::size_t(2) * ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]; + + std::size_t num_btype = + sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) + + sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << " N = " << N << " C = " << C << " D = " << D << " H = " << H << " W = " << W + << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec + << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/profile_avg_pool2d_bwd.cpp b/profiler/src/profile_avg_pool2d_bwd.cpp index 61d0413d43..0b7936d57a 100644 --- a/profiler/src/profile_avg_pool2d_bwd.cpp +++ b/profiler/src/profile_avg_pool2d_bwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_avg_pool3d_bwd.cpp b/profiler/src/profile_avg_pool3d_bwd.cpp index 0ff50a5292..b279a45483 100644 --- a/profiler/src/profile_avg_pool3d_bwd.cpp +++ b/profiler/src/profile_avg_pool3d_bwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp index d9da68b050..c2db49caf1 100644 --- a/profiler/src/profile_batched_gemm.cpp +++ b/profiler/src/profile_batched_gemm.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp index 3d29c4b84a..55b8d1cd57 100644 --- a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp +++ b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batched_gemm_b_scale.cpp b/profiler/src/profile_batched_gemm_b_scale.cpp index 5ed673e127..ee17322e14 100644 --- a/profiler/src/profile_batched_gemm_b_scale.cpp +++ b/profiler/src/profile_batched_gemm_b_scale.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batched_gemm_gemm.cpp b/profiler/src/profile_batched_gemm_gemm.cpp index 9a99874d1c..f59812eaad 100644 --- a/profiler/src/profile_batched_gemm_gemm.cpp +++ b/profiler/src/profile_batched_gemm_gemm.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batched_gemm_multi_d.cpp b/profiler/src/profile_batched_gemm_multi_d.cpp index eb94c07900..f64e7ace1a 100644 --- a/profiler/src/profile_batched_gemm_multi_d.cpp +++ b/profiler/src/profile_batched_gemm_multi_d.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp index 9620d63caf..2354c7938c 100644 --- a/profiler/src/profile_batched_gemm_reduce.cpp +++ b/profiler/src/profile_batched_gemm_reduce.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batchnorm_bwd.cpp b/profiler/src/profile_batchnorm_bwd.cpp index 1738d53dbe..dabe97bfae 100644 --- a/profiler/src/profile_batchnorm_bwd.cpp +++ b/profiler/src/profile_batchnorm_bwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/profiler/src/profile_batchnorm_fwd.cpp index 507fb4b450..6a87346c24 100644 --- a/profiler/src/profile_batchnorm_fwd.cpp +++ b/profiler/src/profile_batchnorm_fwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_batchnorm_infer.cpp b/profiler/src/profile_batchnorm_infer.cpp index f1c19bc36e..548cba9a42 100644 --- a/profiler/src/profile_batchnorm_infer.cpp +++ b/profiler/src/profile_batchnorm_infer.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_contraction_bilinear.cpp b/profiler/src/profile_contraction_bilinear.cpp index a64555fc66..f38e647c71 100644 --- a/profiler/src/profile_contraction_bilinear.cpp +++ b/profiler/src/profile_contraction_bilinear.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_contraction_scale.cpp b/profiler/src/profile_contraction_scale.cpp index a168c09bcf..786db970e8 100644 --- a/profiler/src/profile_contraction_scale.cpp +++ b/profiler/src/profile_contraction_scale.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp index e08a39aeb0..f4685c1ed5 100644 --- a/profiler/src/profile_conv_bwd_data.cpp +++ b/profiler/src/profile_conv_bwd_data.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_conv_fwd.cpp b/profiler/src/profile_conv_fwd.cpp index 701999d8a9..59b9f11308 100644 --- a/profiler/src/profile_conv_fwd.cpp +++ b/profiler/src/profile_conv_fwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp index 31055ec1d1..717b0f5aa4 100644 --- a/profiler/src/profile_conv_fwd_bias_relu.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp index 8c2439a0c7..70a5f9e101 100644 --- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_conv_tensor_rearrange.cpp b/profiler/src/profile_conv_tensor_rearrange.cpp index 6420698a28..70239e8a80 100644 --- a/profiler/src/profile_conv_tensor_rearrange.cpp +++ b/profiler/src/profile_conv_tensor_rearrange.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp index c322c7054b..5acc558f4b 100644 --- a/profiler/src/profile_gemm.cpp +++ b/profiler/src/profile_gemm.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_ab_scale.cpp b/profiler/src/profile_gemm_ab_scale.cpp index c2889d5490..09dc3134c0 100644 --- a/profiler/src/profile_gemm_ab_scale.cpp +++ b/profiler/src/profile_gemm_ab_scale.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_add.cpp b/profiler/src/profile_gemm_add.cpp index 749966af1b..85b1770c69 100644 --- a/profiler/src/profile_gemm_add.cpp +++ b/profiler/src/profile_gemm_add.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_add_add_fastgelu.cpp b/profiler/src/profile_gemm_add_add_fastgelu.cpp index 8af3768a48..15fb999b9b 100644 --- a/profiler/src/profile_gemm_add_add_fastgelu.cpp +++ b/profiler/src/profile_gemm_add_add_fastgelu.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_add_fastgelu.cpp b/profiler/src/profile_gemm_add_fastgelu.cpp index f8335d8c05..ab15e2956e 100644 --- a/profiler/src/profile_gemm_add_fastgelu.cpp +++ b/profiler/src/profile_gemm_add_fastgelu.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_add_multiply.cpp b/profiler/src/profile_gemm_add_multiply.cpp index f8ec7abb66..00deaa0953 100644 --- a/profiler/src/profile_gemm_add_multiply.cpp +++ b/profiler/src/profile_gemm_add_multiply.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_add_relu.cpp b/profiler/src/profile_gemm_add_relu.cpp index 025fddc82b..efdd041053 100644 --- a/profiler/src/profile_gemm_add_relu.cpp +++ b/profiler/src/profile_gemm_add_relu.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_add_relu_add_layernorm.cpp b/profiler/src/profile_gemm_add_relu_add_layernorm.cpp index 558d255ce1..44e48acf2a 100644 --- a/profiler/src/profile_gemm_add_relu_add_layernorm.cpp +++ b/profiler/src/profile_gemm_add_relu_add_layernorm.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_add_silu.cpp b/profiler/src/profile_gemm_add_silu.cpp index a35c0a4092..aff9089275 100644 --- a/profiler/src/profile_gemm_add_silu.cpp +++ b/profiler/src/profile_gemm_add_silu.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_b_scale.cpp b/profiler/src/profile_gemm_b_scale.cpp index 7bcc96a434..61064bf21b 100644 --- a/profiler/src/profile_gemm_b_scale.cpp +++ b/profiler/src/profile_gemm_b_scale.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_bias_add_reduce.cpp b/profiler/src/profile_gemm_bias_add_reduce.cpp index 76daffbc67..1562a28f1b 100644 --- a/profiler/src/profile_gemm_bias_add_reduce.cpp +++ b/profiler/src/profile_gemm_bias_add_reduce.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_bilinear.cpp b/profiler/src/profile_gemm_bilinear.cpp index 4527a2fa00..cc05be7d4b 100644 --- a/profiler/src/profile_gemm_bilinear.cpp +++ b/profiler/src/profile_gemm_bilinear.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_blockscale_wp.cpp b/profiler/src/profile_gemm_blockscale_wp.cpp index d5f66c0b65..b0f9d20344 100644 --- a/profiler/src/profile_gemm_blockscale_wp.cpp +++ b/profiler/src/profile_gemm_blockscale_wp.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_fastgelu.cpp b/profiler/src/profile_gemm_fastgelu.cpp index 93573002ef..94905ffb8b 100644 --- a/profiler/src/profile_gemm_fastgelu.cpp +++ b/profiler/src/profile_gemm_fastgelu.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_multi_abd.cpp b/profiler/src/profile_gemm_multi_abd.cpp index 157bcbc977..a1d1555422 100644 --- a/profiler/src/profile_gemm_multi_abd.cpp +++ b/profiler/src/profile_gemm_multi_abd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_multiply_add.cpp b/profiler/src/profile_gemm_multiply_add.cpp index 88d3b5256a..46a2d8690a 100644 --- a/profiler/src/profile_gemm_multiply_add.cpp +++ b/profiler/src/profile_gemm_multiply_add.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_multiply_multiply.cpp b/profiler/src/profile_gemm_multiply_multiply.cpp index 87424c21a2..fa6b3832b0 100644 --- a/profiler/src/profile_gemm_multiply_multiply.cpp +++ b/profiler/src/profile_gemm_multiply_multiply.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_multiply_multiply_wp.cpp b/profiler/src/profile_gemm_multiply_multiply_wp.cpp index ff6cffb5f2..b65e0298ef 100644 --- a/profiler/src/profile_gemm_multiply_multiply_wp.cpp +++ b/profiler/src/profile_gemm_multiply_multiply_wp.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_mx.cpp b/profiler/src/profile_gemm_mx.cpp index 9fd6f29464..4734789f8c 100644 --- a/profiler/src/profile_gemm_mx.cpp +++ b/profiler/src/profile_gemm_mx.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_quantization.cpp b/profiler/src/profile_gemm_quantization.cpp index d28dd60dce..5de19dd1bb 100644 --- a/profiler/src/profile_gemm_quantization.cpp +++ b/profiler/src/profile_gemm_quantization.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp index 48f6f5eb49..ce79102424 100644 --- a/profiler/src/profile_gemm_reduce.cpp +++ b/profiler/src/profile_gemm_reduce.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_splitk.cpp b/profiler/src/profile_gemm_splitk.cpp index 2a0467bc81..0130e8a9db 100644 --- a/profiler/src/profile_gemm_splitk.cpp +++ b/profiler/src/profile_gemm_splitk.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_streamk.cpp b/profiler/src/profile_gemm_streamk.cpp index a0a49eb36d..1f8bfba5d7 100644 --- a/profiler/src/profile_gemm_streamk.cpp +++ b/profiler/src/profile_gemm_streamk.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp index d35cd27651..ded4d68891 100644 --- a/profiler/src/profile_gemm_universal.cpp +++ b/profiler/src/profile_gemm_universal.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_universal_batched.cpp b/profiler/src/profile_gemm_universal_batched.cpp index d57511fbfc..d1396fa074 100644 --- a/profiler/src/profile_gemm_universal_batched.cpp +++ b/profiler/src/profile_gemm_universal_batched.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_universal_preshuffle.cpp b/profiler/src/profile_gemm_universal_preshuffle.cpp index d8d8f29ac6..3e5f661063 100644 --- a/profiler/src/profile_gemm_universal_preshuffle.cpp +++ b/profiler/src/profile_gemm_universal_preshuffle.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_universal_reduce.cpp b/profiler/src/profile_gemm_universal_reduce.cpp index cd7423d5de..91187b7523 100644 --- a/profiler/src/profile_gemm_universal_reduce.cpp +++ b/profiler/src/profile_gemm_universal_reduce.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp index 3fd61db138..c9691964ec 100644 --- a/profiler/src/profile_gemm_universal_streamk.cpp +++ b/profiler/src/profile_gemm_universal_streamk.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_grouped_conv_bwd_data.cpp b/profiler/src/profile_grouped_conv_bwd_data.cpp index 95098e2301..62d6e860f9 100644 --- a/profiler/src/profile_grouped_conv_bwd_data.cpp +++ b/profiler/src/profile_grouped_conv_bwd_data.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp index 7d3f1ad6c0..a18aab41a5 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp index 13f5cd1cda..c94b77dd4f 100644 --- a/profiler/src/profile_grouped_conv_fwd.cpp +++ b/profiler/src/profile_grouped_conv_fwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -105,7 +105,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) using INT8 = int8_t; using F8 = ck::f8_t; using BF8 = ck::bf8_t; -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) using TF32 = ck::tf32_t; #endif @@ -228,7 +228,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -253,7 +253,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -280,7 +280,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -306,7 +306,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I1, NWGC{}, GKXC{}, NWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -331,7 +331,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -352,7 +352,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -373,7 +373,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -416,7 +416,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } @@ -439,7 +439,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[]) } else if(data_type == ConvDataType::F32_F32_F32_TF32) { -#if defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); #endif } diff --git a/profiler/src/profile_grouped_conv_fwd_bias_clamp.cpp b/profiler/src/profile_grouped_conv_fwd_bias_clamp.cpp index fb1eedf2a7..4eb12e6e19 100644 --- a/profiler/src/profile_grouped_conv_fwd_bias_clamp.cpp +++ b/profiler/src/profile_grouped_conv_fwd_bias_clamp.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp" diff --git a/profiler/src/profile_grouped_conv_fwd_clamp.cpp b/profiler/src/profile_grouped_conv_fwd_clamp.cpp index 1b100ff867..7df9fd6167 100644 --- a/profiler/src/profile_grouped_conv_fwd_clamp.cpp +++ b/profiler/src/profile_grouped_conv_fwd_clamp.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "profiler/profile_grouped_conv_fwd_impl.hpp" diff --git a/profiler/src/profile_grouped_conv_fwd_outelementop.cpp b/profiler/src/profile_grouped_conv_fwd_outelementop.cpp index 196a2cf3f2..00b4bd8f13 100644 --- a/profiler/src/profile_grouped_conv_fwd_outelementop.cpp +++ b/profiler/src/profile_grouped_conv_fwd_outelementop.cpp @@ -1,3 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp" diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp index 2adcd6483a..32a390c475 100644 --- a/profiler/src/profile_grouped_gemm.cpp +++ b/profiler/src/profile_grouped_gemm.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_grouped_gemm_fastgelu.cpp b/profiler/src/profile_grouped_gemm_fastgelu.cpp index 50ecf25cae..7bcf2d40d2 100644 --- a/profiler/src/profile_grouped_gemm_fastgelu.cpp +++ b/profiler/src/profile_grouped_gemm_fastgelu.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_grouped_gemm_fixed_nk.cpp b/profiler/src/profile_grouped_gemm_fixed_nk.cpp index 827866ce69..c93725c3e7 100644 --- a/profiler/src/profile_grouped_gemm_fixed_nk.cpp +++ b/profiler/src/profile_grouped_gemm_fixed_nk.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_grouped_gemm_multiply_tile_loop.cpp b/profiler/src/profile_grouped_gemm_multiply_tile_loop.cpp index 5cf0af5ecf..e747cfd5fe 100644 --- a/profiler/src/profile_grouped_gemm_multiply_tile_loop.cpp +++ b/profiler/src/profile_grouped_gemm_multiply_tile_loop.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_grouped_gemm_tile_loop.cpp b/profiler/src/profile_grouped_gemm_tile_loop.cpp index 76ff9e162e..ba4b9ba708 100644 --- a/profiler/src/profile_grouped_gemm_tile_loop.cpp +++ b/profiler/src/profile_grouped_gemm_tile_loop.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_groupnorm_bwd_data.cpp b/profiler/src/profile_groupnorm_bwd_data.cpp index f9fea1db55..2e05fc5b2d 100644 --- a/profiler/src/profile_groupnorm_bwd_data.cpp +++ b/profiler/src/profile_groupnorm_bwd_data.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_groupnorm_bwd_gamma_beta.cpp b/profiler/src/profile_groupnorm_bwd_gamma_beta.cpp index 7fcef3a4e2..31878eb3c2 100644 --- a/profiler/src/profile_groupnorm_bwd_gamma_beta.cpp +++ b/profiler/src/profile_groupnorm_bwd_gamma_beta.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_groupnorm_fwd.cpp b/profiler/src/profile_groupnorm_fwd.cpp index 9a595bf7a7..c81a9f94a9 100644 --- a/profiler/src/profile_groupnorm_fwd.cpp +++ b/profiler/src/profile_groupnorm_fwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_layernorm_bwd_data.cpp b/profiler/src/profile_layernorm_bwd_data.cpp index 1f364d79ba..fb2d8d6419 100644 --- a/profiler/src/profile_layernorm_bwd_data.cpp +++ b/profiler/src/profile_layernorm_bwd_data.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_layernorm_bwd_gamma_beta.cpp b/profiler/src/profile_layernorm_bwd_gamma_beta.cpp index 0f3436c663..aea4d435fe 100644 --- a/profiler/src/profile_layernorm_bwd_gamma_beta.cpp +++ b/profiler/src/profile_layernorm_bwd_gamma_beta.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_layernorm_fwd.cpp b/profiler/src/profile_layernorm_fwd.cpp index 7031b36531..014a9d0164 100644 --- a/profiler/src/profile_layernorm_fwd.cpp +++ b/profiler/src/profile_layernorm_fwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_max_pool2d_bwd.cpp b/profiler/src/profile_max_pool2d_bwd.cpp index 26e84c880e..0058256acf 100644 --- a/profiler/src/profile_max_pool2d_bwd.cpp +++ b/profiler/src/profile_max_pool2d_bwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_max_pool2d_fwd.cpp b/profiler/src/profile_max_pool2d_fwd.cpp index 9ed96b6c40..c74f6ab8b9 100644 --- a/profiler/src/profile_max_pool2d_fwd.cpp +++ b/profiler/src/profile_max_pool2d_fwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_max_pool3d_bwd.cpp b/profiler/src/profile_max_pool3d_bwd.cpp index 45a64df423..dd890ef280 100644 --- a/profiler/src/profile_max_pool3d_bwd.cpp +++ b/profiler/src/profile_max_pool3d_bwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_permute_scale.cpp b/profiler/src/profile_permute_scale.cpp index 8ebb2289ed..e6908989b6 100644 --- a/profiler/src/profile_permute_scale.cpp +++ b/profiler/src/profile_permute_scale.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_pool3d_fwd.cpp b/profiler/src/profile_pool3d_fwd.cpp index 4ea1fbcf49..21896c7e48 100644 --- a/profiler/src/profile_pool3d_fwd.cpp +++ b/profiler/src/profile_pool3d_fwd.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp index e4af5680a5..dbdc30cf2e 100644 --- a/profiler/src/profile_reduce.cpp +++ b/profiler/src/profile_reduce.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp index dfe8d95c90..096a2d4eb4 100644 --- a/profiler/src/profile_softmax.cpp +++ b/profiler/src/profile_softmax.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profile_transpose.cpp b/profiler/src/profile_transpose.cpp index 5aa9b34a1c..c618135608 100644 --- a/profiler/src/profile_transpose.cpp +++ b/profiler/src/profile_transpose.cpp @@ -1,81 +1,81 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "profiler/profile_transpose_impl.hpp" -#include "profiler_operation_registry.hpp" - -enum struct DataType -{ - F32_F32_F32_F32_F32, // 0 - F16_F16_F16_F16_F16, // 1 -}; - -#define OP_NAME "transpose" -#define OP_DESC "Transpose" - -static void print_helper_msg() -{ - printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); - printf("arg2: data type (0: fp32; 1: fp16)\n"); - printf("arg3: verification (0: no; 1: yes)\n"); - printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg5: print tensor value (0: no; 1: yes)\n"); - printf("arg6: time kernel (0=no, 1=yes)\n"); - printf("arg7 to arg11: N, C, D, H, W\n"); -} - -int profile_transpose(int argc, char* argv[]) -{ - if(argc != 12) - { - print_helper_msg(); - exit(1); - } - - const auto data_type = static_cast(std::stoi(argv[2])); - const bool do_verification = std::stoi(argv[3]); - const int init_method = std::stoi(argv[4]); - const bool do_log = std::stoi(argv[5]); - const bool time_kernel = std::stoi(argv[6]); - const std::vector lengths = {std::stoi(argv[7]), - std::stoi(argv[8]), - std::stoi(argv[9]), - std::stoi(argv[10]), - std::stoi(argv[11])}; - - using F32 = float; - using F16 = ck::half_t; - - auto profile = [&](auto a_type, auto b_type) { - using ADataType = decltype(a_type); - using BDataType = decltype(b_type); - constexpr ck::index_t NumDim = 5; - - bool pass = ck::profiler::profile_transpose_impl( - do_verification, init_method, do_log, time_kernel, lengths); - - return pass ? 0 : 1; - }; - - if(data_type == DataType::F32_F32_F32_F32_F32) - { - return profile(F32{}, F32{}); - } - else if(data_type == DataType::F16_F16_F16_F16_F16) - { - return profile(F16{}, F16{}); - } - else - { - std::cout << "this data_type & layout is not implemented" << std::endl; - - return 1; - } -} - -REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose); +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include + +#include "profiler/profile_transpose_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct DataType +{ + F32_F32_F32_F32_F32, // 0 + F16_F16_F16_F16_F16, // 1 +}; + +#define OP_NAME "transpose" +#define OP_DESC "Transpose" + +static void print_helper_msg() +{ + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: verification (0: no; 1: yes)\n"); + printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg5: print tensor value (0: no; 1: yes)\n"); + printf("arg6: time kernel (0=no, 1=yes)\n"); + printf("arg7 to arg11: N, C, D, H, W\n"); +} + +int profile_transpose(int argc, char* argv[]) +{ + if(argc != 12) + { + print_helper_msg(); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); + const std::vector lengths = {std::stoi(argv[7]), + std::stoi(argv[8]), + std::stoi(argv[9]), + std::stoi(argv[10]), + std::stoi(argv[11])}; + + using F32 = float; + using F16 = ck::half_t; + + auto profile = [&](auto a_type, auto b_type) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + constexpr ck::index_t NumDim = 5; + + bool pass = ck::profiler::profile_transpose_impl( + do_verification, init_method, do_log, time_kernel, lengths); + + return pass ? 0 : 1; + }; + + if(data_type == DataType::F32_F32_F32_F32_F32) + { + return profile(F32{}, F32{}); + } + else if(data_type == DataType::F16_F16_F16_F16_F16) + { + return profile(F16{}, F16{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose); diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 0f528c008f..1631b89716 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/profiler/src/profiler_operation_registry.hpp b/profiler/src/profiler_operation_registry.hpp index 7e6d22d4ce..de78a7f65d 100644 --- a/profiler/src/profiler_operation_registry.hpp +++ b/profiler/src/profiler_operation_registry.hpp @@ -1,5 +1,5 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #include #include diff --git a/python/ck4inductor/__init__.py b/python/ck4inductor/__init__.py index f4f71c2d60..0eee25ecaa 100644 --- a/python/ck4inductor/__init__.py +++ b/python/ck4inductor/__init__.py @@ -1,3 +1,7 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + + def __version__(): import subprocess diff --git a/python/ck4inductor/batched_universal_gemm/gen_instances.py b/python/ck4inductor/batched_universal_gemm/gen_instances.py index 8879fb93db..36e70d954e 100644 --- a/python/ck4inductor/batched_universal_gemm/gen_instances.py +++ b/python/ck4inductor/batched_universal_gemm/gen_instances.py @@ -1,5 +1,5 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. import logging import os diff --git a/python/ck4inductor/batched_universal_gemm/op.py b/python/ck4inductor/batched_universal_gemm/op.py index 96978ac8d2..3e35a28e01 100644 --- a/python/ck4inductor/batched_universal_gemm/op.py +++ b/python/ck4inductor/batched_universal_gemm/op.py @@ -1,5 +1,5 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. from dataclasses import asdict, dataclass from typing import Optional, Tuple diff --git a/python/ck4inductor/grouped_conv_fwd/gen_instances.py b/python/ck4inductor/grouped_conv_fwd/gen_instances.py index feca20a3b8..f062654a29 100644 --- a/python/ck4inductor/grouped_conv_fwd/gen_instances.py +++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py @@ -1,5 +1,5 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. import logging import os diff --git a/python/ck4inductor/grouped_conv_fwd/op.py b/python/ck4inductor/grouped_conv_fwd/op.py index a99ba83e2d..8301f0d07f 100644 --- a/python/ck4inductor/grouped_conv_fwd/op.py +++ b/python/ck4inductor/grouped_conv_fwd/op.py @@ -1,5 +1,5 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. from dataclasses import asdict, dataclass from typing import Optional, Tuple diff --git a/python/ck4inductor/universal_gemm/gen_instances.py b/python/ck4inductor/universal_gemm/gen_instances.py index 6f8dc8530b..6ad44a5880 100644 --- a/python/ck4inductor/universal_gemm/gen_instances.py +++ b/python/ck4inductor/universal_gemm/gen_instances.py @@ -1,5 +1,5 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. import logging import os diff --git a/python/ck4inductor/universal_gemm/op.py b/python/ck4inductor/universal_gemm/op.py index 946aaa7afb..8753f4b6fc 100644 --- a/python/ck4inductor/universal_gemm/op.py +++ b/python/ck4inductor/universal_gemm/op.py @@ -1,5 +1,5 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. from dataclasses import asdict, dataclass from typing import Optional, Tuple diff --git a/python/ck4inductor/util.py b/python/ck4inductor/util.py index 4d7e8bd87d..4f5f844103 100644 --- a/python/ck4inductor/util.py +++ b/python/ck4inductor/util.py @@ -1,5 +1,5 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. import functools import os diff --git a/python/test/test_gen_instances.py b/python/test/test_gen_instances.py index 4a85c702f9..56fe4f7342 100644 --- a/python/test/test_gen_instances.py +++ b/python/test/test_gen_instances.py @@ -1,7 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. import logging - import unittest from ck4inductor.universal_gemm.gen_instances import (