From 0605012453dbde4ef8288d408ba6306d5407a57f Mon Sep 17 00:00:00 2001 From: Emily Martins <65371150+ecamartins@users.noreply.github.com> Date: Sun, 20 Jul 2025 00:13:36 -0600 Subject: [PATCH] Tests for CK Tile Flatmm and MOE Smoothquant (#2458) * CK tile tests for flatmm using example * MOE smoothquant draft tests * fix create_arg default index to zero for MOE smoothquant * revert MOE smoothquant changes * code clean up * Add back MOE smoothquant changes * Add MOE smoothquant cases for different precisions and update cmake * clean up comments * Update flamm cmake * revert change made to moe_smoothquant smoke_test.sh EXE path * remove unecessary comment in MOE smoothquant cmakelist * comment out adding moe_smoothquant subdirectory for now due to bugs with GPU core dump issue on gfx942 and gfx90a * Clean up run_test_case function in MOE smootquant tests * update copyright and licensing on files * Remove flatmm test dir since tests should be done as weighted preshuffle gemm * Add flamm smoke test cases to weighted preshuffle gemm gtests * remove blank line from CMakeLists --------- Co-authored-by: root Co-authored-by: Thomas Ning [ROCm/composable_kernel commit: 20306db651858938e913533da7e4382d28912fa1] --- test/ck_tile/CMakeLists.txt | 3 + .../test_gemm_pipeline_kernel_types.hpp | 20 +- .../test_gemm_pipeline_ut_cases.inc | 119 ++++++- test/ck_tile/moe_smoothquant/CMakeLists.txt | 32 ++ .../moe_smoothquant_bf16_n1024_instance.cpp | 27 ++ .../moe_smoothquant_bf16_n1536_instance.cpp | 18 + .../moe_smoothquant_bf16_n2048_instance.cpp | 19 ++ .../moe_smoothquant_bf16_n256_instance.cpp | 16 + .../moe_smoothquant_bf16_n3072_instance.cpp | 18 + .../moe_smoothquant_bf16_n4096_instance.cpp | 18 + ...moe_smoothquant_bf16_n4096_tp_instance.cpp | 18 + .../moe_smoothquant_bf16_n512_instance.cpp | 18 + ...moe_smoothquant_bf16_n64_n128_instance.cpp | 16 + .../moe_smoothquant_bf16_n768_instance.cpp | 16 + .../moe_smoothquant_fp16_n1024_instance.cpp | 27 ++ .../moe_smoothquant_fp16_n1536_instance.cpp | 18 + .../moe_smoothquant_fp16_n2048_instance.cpp | 18 + .../moe_smoothquant_fp16_n256_instance.cpp | 16 + .../moe_smoothquant_fp16_n3072_instance.cpp | 18 + .../moe_smoothquant_fp16_n4096_instance.cpp | 18 + ...moe_smoothquant_fp16_n4096_tp_instance.cpp | 18 + .../moe_smoothquant_fp16_n512_instance.cpp | 18 + ...moe_smoothquant_fp16_n64_n128_instance.cpp | 16 + .../moe_smoothquant_fp16_n768_instance.cpp | 16 + .../instances/moe_smoothquant_fwd_api.cpp | 155 +++++++++ .../moe_smoothquant_instance_common.hpp | 65 ++++ .../moe_smoothquant/moe_smoothquant.hpp | 104 ++++++ .../moe_smoothquant/moe_smoothquant.inc | 317 ++++++++++++++++++ .../moe_smoothquant_bf16_fp8.cpp | 11 + .../moe_smoothquant_bf16_int8.cpp | 11 + .../moe_smoothquant_fp16_fp8.cpp | 11 + .../moe_smoothquant_fp16_int8.cpp | 11 + 32 files changed, 1217 insertions(+), 9 deletions(-) create mode 100644 test/ck_tile/moe_smoothquant/CMakeLists.txt create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp create mode 100644 test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp create mode 100644 test/ck_tile/moe_smoothquant/moe_smoothquant.hpp create mode 100644 test/ck_tile/moe_smoothquant/moe_smoothquant.inc create mode 100644 test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_fp8.cpp create mode 100644 test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_int8.cpp create mode 100644 test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_fp8.cpp create mode 100644 test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_int8.cpp diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index 5c0f3fb076..0b6fd35988 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -5,6 +5,9 @@ add_subdirectory(batched_gemm) add_subdirectory(grouped_gemm) add_subdirectory(gemm_multi_d) add_subdirectory(data_type) +# Not including these tests as there is a bug on gfx90a and gfx942 +# resulting in "GPU core dump" +#add_subdirectory(moe_smoothquant) add_subdirectory(slice_tile) add_subdirectory(batched_transpose) add_subdirectory(smoothquant) diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp index 152017dbad..f66f3cb0aa 100644 --- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp +++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_kernel_types.hpp @@ -1,5 +1,5 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT #include #include @@ -9,9 +9,10 @@ #include "ck_tile/host.hpp" #include "test_gemm_pipeline_util.hpp" -using F16 = ck_tile::half_t; -using F32 = float; -using F8 = ck_tile::fp8_t; +using F16 = ck_tile::half_t; +using F32 = float; +using F8 = ck_tile::fp8_t; +using BF16 = ck_tile::bf16_t; using Row = ck_tile::tensor_layout::gemm::RowMajor; using Col = ck_tile::tensor_layout::gemm::ColumnMajor; @@ -22,11 +23,16 @@ using Default = ck_tile::integral_constant; +// Adding alias for the F8 parameters to facilitate skipping tests. +// This alias can be removed once test failures are fixed. +using F8Types = std::tuple; + // clang-format off using KernelTypesWeightPreshuffle = ::testing::Types< std::tuple< Row, Col, Row, F16, F16, F32, F16, Default, WeightPreshuffle>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, Default, WeightPreshuffle> ->; + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, Default, WeightPreshuffle>, + F8Types + >; // clang-format on diff --git a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc index b3a82e5fbc..389e0d53ea 100755 --- a/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc +++ b/test/ck_tile/gemm_weight_preshuffle/test_gemm_pipeline_ut_cases.inc @@ -1,5 +1,5 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT #pragma once @@ -18,4 +18,119 @@ TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle) this->template Run(M, N, K); } +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x128x128) +{ + if constexpr(std::is_same_v) + { + GTEST_SKIP() << "Skipping this test due to failures with F8"; + } + constexpr int M = 128; + constexpr int N = 128; + constexpr int K = 128; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x128x4096) +{ + constexpr int M = 128; + constexpr int N = 128; + constexpr int K = 4096; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x2048x128) +{ + if constexpr(std::is_same_v) + { + GTEST_SKIP() << "Skipping this test due to failures with F8"; + } + + constexpr int M = 128; + constexpr int N = 2048; + constexpr int K = 128; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_128x2048x4096) +{ + constexpr int M = 128; + constexpr int N = 2048; + constexpr int K = 4096; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_1024x128x128) +{ + if constexpr(std::is_same_v) + { + GTEST_SKIP() << "Skipping this test due to failures with F8"; + } + + constexpr int M = 1024; + constexpr int N = 128; + constexpr int K = 128; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_1024x128x4096) +{ + constexpr int M = 1024; + constexpr int N = 128; + constexpr int K = 4096; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_1024x2048x128) +{ + if constexpr(std::is_same_v) + { + GTEST_SKIP() << "Skipping this test due to failures with F8"; + } + + constexpr int M = 1024; + constexpr int N = 2048; + constexpr int K = 128; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + +TYPED_TEST(TEST_SUITE_NAME, GemmPreshuffle_1024x2048x4096) +{ + constexpr int M = 1024; + constexpr int N = 2048; + constexpr int K = 4096; + constexpr bool PadM = false; + constexpr bool PadN = false; + constexpr bool PadK = false; + constexpr bool Preshuffle = true; + this->template Run(M, N, K); +} + #endif diff --git a/test/ck_tile/moe_smoothquant/CMakeLists.txt b/test/ck_tile/moe_smoothquant/CMakeLists.txt new file mode 100644 index 0000000000..70999fa06b --- /dev/null +++ b/test/ck_tile/moe_smoothquant/CMakeLists.txt @@ -0,0 +1,32 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + function (add_moe_smoothquant_test TARGET_NAME MAIN_SRC) + message(DEBUG "adding ${TARGET_NAME}") + add_test_executable(${TARGET_NAME} ${MAIN_SRC}) + target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + + foreach(source IN LISTS ARGN) + list(APPEND INSTANCE_SRCS ${source}) + endforeach() + + target_sources(${TARGET_NAME} PRIVATE ${INSTANCE_SRCS}) + + set(COMPILE_OPTIONS) + # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations + list(APPEND COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + # list(APPEND COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) + + target_compile_options(${TARGET_NAME} PRIVATE ${COMPILE_OPTIONS}) + endfunction(add_moe_smoothquant_test TARGET_NAME MAIN_SRC) + + file(GLOB INSTANCE_SRCS instances/*.cpp) + + add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_fp16_fp8 moe_smoothquant_fp16_fp8.cpp ${INSTANCE_SRCS}) + add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_fp16_int8 moe_smoothquant_fp16_int8.cpp ${INSTANCE_SRCS}) + + add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_bf16_fp8 moe_smoothquant_bf16_fp8.cpp ${INSTANCE_SRCS}) + add_moe_smoothquant_test(test_ck_tile_moe_smoothquant_bf16_int8 moe_smoothquant_bf16_int8.cpp ${INSTANCE_SRCS}) + +else() + message(DEBUG "Skipping ck_tile MOE smoothquant tests for current target") +endif() \ No newline at end of file diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp new file mode 100644 index 0000000000..93a1b9fed4 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp @@ -0,0 +1,27 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +#endif + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp new file mode 100644 index 0000000000..7e55a542d7 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1536_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp new file mode 100644 index 0000000000..74bd206e02 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n2048_instance.cpp @@ -0,0 +1,19 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp new file mode 100644 index 0000000000..169f4cdc72 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n256_instance.cpp @@ -0,0 +1,16 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp new file mode 100644 index 0000000000..bfb34e64a1 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n3072_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp new file mode 100644 index 0000000000..03bbc0e06f --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp new file mode 100644 index 0000000000..000845bc40 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp new file mode 100644 index 0000000000..798a02248c --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp new file mode 100644 index 0000000000..7864e3e3dd --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp @@ -0,0 +1,16 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp new file mode 100644 index 0000000000..c3d25c8859 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp @@ -0,0 +1,16 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp new file mode 100644 index 0000000000..eaaed6c5bb --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp @@ -0,0 +1,27 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +#if 0 +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +#endif + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp new file mode 100644 index 0000000000..556ac25809 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp new file mode 100644 index 0000000000..589faef0b5 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp new file mode 100644 index 0000000000..ca331b1793 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp @@ -0,0 +1,16 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp new file mode 100644 index 0000000000..dc80dadec5 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp new file mode 100644 index 0000000000..2947c3b698 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp new file mode 100644 index 0000000000..b194fd457b --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp new file mode 100644 index 0000000000..fee9a6a454 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp @@ -0,0 +1,18 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp new file mode 100644 index 0000000000..17986277f7 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp @@ -0,0 +1,16 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp new file mode 100644 index 0000000000..a7fb2d0d6c --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp @@ -0,0 +1,16 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant_instance_common.hpp" + +// clang-format off +// rm rn tm tn vn pd 2p +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); + +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +template float moe_smoothquant_>(const S&, A); +// clang-format on diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp new file mode 100644 index 0000000000..0b890ab3ac --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp @@ -0,0 +1,155 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include "moe_smoothquant.hpp" + +template +using trait_ = moe_smoothquant_traits_; + +template +float moe_smoothquant_dispatch(moe_smoothquant_traits /*t*/, + moe_smoothquant_args a, + const ck_tile::stream_config& s) +{ + float r = -1; + // clang-format off + // rm rn tm tn vn pd 2p + if(a.hidden_size <= 64) { + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 128) { + if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 256) { + if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 512) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 768) { + if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 1024) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 1536) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 2048) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 3072) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size <= 4096) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + else if(a.hidden_size > 4096) { + if (a.hidden_size % 8 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 4 == 0) + r = moe_smoothquant_>(s, a); + else if (a.hidden_size % 2 == 0) + r = moe_smoothquant_>(s, a); + else + r = moe_smoothquant_>(s, a); + } + return r; + // clang-format on +} + +float moe_smoothquant(moe_smoothquant_traits t, + moe_smoothquant_args a, + const ck_tile::stream_config& s) +{ + if(t.in_type.compare("fp16") == 0 && t.out_type == "int8") + { + return moe_smoothquant_dispatch(t, a, s); + } + else if(t.in_type.compare("fp16") == 0 && t.out_type == "fp8") + { + return moe_smoothquant_dispatch(t, a, s); + } + else if(t.in_type.compare("bf16") == 0 && t.out_type == "int8") + { + return moe_smoothquant_dispatch(t, a, s); + } + else if(t.in_type.compare("bf16") == 0 && t.out_type == "fp8") + { + return moe_smoothquant_dispatch(t, a, s); + } + else + throw std::runtime_error("Without supported instances!"); +} diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp new file mode 100644 index 0000000000..9d8c9caf00 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_instance_common.hpp @@ -0,0 +1,65 @@ + +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include "moe_smoothquant.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = moe_smoothquant_args; + +template +using trait_ = moe_smoothquant_traits_; + +template +float moe_smoothquant_(const S& s, A a) +{ + using InputType = typename Traits_::InputType; + using OutputType = typename Traits_::OutputType; + + using PipelineProblem = ck_tile::SmoothquantPipelineProblem< + typename MoeSmoothquantTypeConfig::XDataType, + typename MoeSmoothquantTypeConfig::SmoothScaleDataType, + typename MoeSmoothquantTypeConfig::ComputeDataType, + typename MoeSmoothquantTypeConfig::YScaleDataType, + typename MoeSmoothquantTypeConfig::QYDataType, + typename Traits_::Shape, + Traits_::kPadN, + Traits_::kTwoPass>; + + using OnePassPipeline = ck_tile::SmoothquantPipelineOnePass; + using TwoPassPipeline = ck_tile::SmoothquantPipelineTwoPass; + using Pipeline = std::conditional_t; + + using Kernel = ck_tile::MoeSmoothquant; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp b/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp new file mode 100644 index 0000000000..d137e64cb4 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/moe_smoothquant.hpp @@ -0,0 +1,104 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/smoothquant.hpp" +#include + +template +struct MoeSmoothquantTypeConfig +{ + using XDataType = InputType; + using SmoothScaleDataType = float; + using YScaleDataType = float; + using QYDataType = OutputType; + using ComputeDataType = float; +}; + +// runtime args +struct moe_smoothquant_args : public ck_tile::MoeSmoothquantHostArgs +{ +}; + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template +struct moe_smoothquant_traits_ +{ + using InputType = ck_tile::remove_cvref_t; + using OutputType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size(); + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size(); + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0); + return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_); + } + else + { + // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size()); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0); + return ThreadPerBlock_N_ / ck_tile::get_warp_size(); + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + static constexpr ck_tile::index_t Repeat_N = Repeat_N_; + + static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, Vector_N_>; + + using Shape = ck_tile::Generic2dBlockShape; + + static constexpr bool kPadN = kPadN_; + static constexpr bool kTwoPass = kTwoPass_; +}; + +template +float moe_smoothquant_(const ck_tile::stream_config& s, moe_smoothquant_args a); + +// This is the public API, will be generated by script +struct moe_smoothquant_traits +{ + std::string in_type; // input type + std::string out_type; // output type +}; + +float moe_smoothquant(moe_smoothquant_traits, moe_smoothquant_args, const ck_tile::stream_config&); diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant.inc b/test/ck_tile/moe_smoothquant/moe_smoothquant.inc new file mode 100644 index 0000000000..ff23c99e74 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/moe_smoothquant.inc @@ -0,0 +1,317 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck_tile/host.hpp" +#include "moe_smoothquant.hpp" +#include +#include +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-5; + double atol = 1e-5; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + // due to rounding, int8 quantization might have 1 abs error + double rtol = 1; + double atol = 1; + return ck_tile::make_tuple(rtol, atol); +} + +template +void topid_unique_gen( + std::vector& host_tensor, int tokens, int topk, int num_expert, int seed) +{ + size_t total_size = topk * tokens; + std::srand(seed); + std::set unique_set; + IndexType current_v; + for(size_t i = 0; i < total_size; i++) + { + if(i % topk == 0) + { + unique_set.clear(); + } + current_v = std::rand() % num_expert; + while(unique_set.find(current_v) != unique_set.end()) + { + current_v = std::rand() % num_expert; + } + unique_set.insert(current_v); + host_tensor[i] = current_v; + } +} + +auto create_args(int argc, char* argv[], int index = 0) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("t", "3328", "tokens dimension") + .insert("h", "4096", "hidden_size dimension") + .insert("e", "32", "experts") + .insert("k", "5", "topk") + .insert("stride", "-1", "stride per row, if -1 then equal to hidden_size") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec_i", "fp16", "input precision, fp16/bf16") + .insert("prec_o", "int8", "precision, int8/fp8") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv, index); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t tokens = arg_parser.get_int("t"); + ck_tile::index_t hidden_size = arg_parser.get_int("h"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = hidden_size; + ck_tile::index_t experts = arg_parser.get_int("e"); + ck_tile::index_t topk = arg_parser.get_int("k"); + std::string prec_i = arg_parser.get_str("prec_i"); + std::string prec_o = arg_parser.get_str("prec_o"); + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= hidden_size); + + using TypeConfig = MoeSmoothquantTypeConfig; + + using XDataType = typename TypeConfig::XDataType; + using SmoothScaleDataType = typename TypeConfig::SmoothScaleDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = typename TypeConfig::ComputeDataType; + + // host verify + ck_tile::HostTensor x_host({tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor smscale_host({experts * hidden_size}); + ck_tile::HostTensor topk_ids_host({tokens, topk}); + + ck_tile::HostTensor yscale_host_ref({topk * tokens}, {1}); + ck_tile::HostTensor yscale_host_dev({topk * tokens}, {1}); + + ck_tile::HostTensor qy_host_ref({topk * tokens, hidden_size}, {stride, 1}); + ck_tile::HostTensor qy_host_dev({topk * tokens, hidden_size}, {stride, 1}); + + topid_unique_gen(topk_ids_host.mData, tokens, topk, experts, 11937); + ck_tile::FillUniformDistribution{-.5f, .5f}(x_host); + ck_tile::FillUniformDistribution{1e-3, .5f}(smscale_host); + + ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem smscale_buf(smscale_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem topk_ids_buf(topk_ids_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem yscale_buf(yscale_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem qy_buf(qy_host_dev.get_element_space_size_in_bytes()); + + x_buf.ToDevice(x_host.data()); + smscale_buf.ToDevice(smscale_host.data()); + topk_ids_buf.ToDevice(topk_ids_host.data()); + + std::cout << "[" << prec_i << "-" << prec_o << "]" + << " tokens:" << tokens << ", hidden_size:" << hidden_size << ", stride:" << stride + << ", experts:" << experts << ", topk:" << topk << std::flush; + + moe_smoothquant_traits traits{prec_i, prec_o}; + + moe_smoothquant_args args{x_buf.GetDeviceBuffer(), + smscale_buf.GetDeviceBuffer(), + topk_ids_buf.GetDeviceBuffer(), + yscale_buf.GetDeviceBuffer(), + qy_buf.GetDeviceBuffer(), + tokens, + hidden_size, + experts, + topk, + stride, + stride}; + + float ave_time = moe_smoothquant( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + std::size_t num_byte = sizeof(XDataType) * tokens * hidden_size + + sizeof(SmoothScaleDataType) * topk * hidden_size + + sizeof(YScaleDataType) * topk * tokens + + sizeof(QYDataType) * topk * tokens * hidden_size; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + using YDataType = ComputeDataType; + ck_tile::HostTensor y_host({topk * tokens, hidden_size}, {stride, 1}); + // smooth outlier + { + auto f = [&](auto i_token) { + for(int i_topk = 0; i_topk < topk; i_topk++) + { + auto i_expert = topk_ids_host(i_token, i_topk); + + for(int i_h = 0; i_h < hidden_size; ++i_h) + { + auto v_smscale = ck_tile::type_convert( + smscale_host(i_expert * hidden_size + i_h)); + auto v_x = ck_tile::type_convert(x_host(i_token, i_h)); + // y_host(i_token * topk + i_topk, i_h) = v_x * v_smscale; + y_host(i_topk * tokens + i_token, i_h) = v_x * v_smscale; + } + } + }; + + ck_tile::make_ParallelTensorFunctor(f, tokens)(std::thread::hardware_concurrency()); + } + + // yscale + { + ck_tile::HostTensor y_rowwise_amax_host({topk * tokens}); + + using ReduceAmax = ck_tile::ReduceOp::AbsMax; + ck_tile::reference_reduce( + y_host, y_rowwise_amax_host, ReduceAmax{}); + + auto op = [](const auto& v0) { + return v0 / + ck_tile::type_convert(ck_tile::numeric::max()); + }; + ck_tile::reference_unary_elementwise( + y_rowwise_amax_host, yscale_host_ref, op); + + yscale_buf.FromDevice(yscale_host_dev.mData.data()); + + auto [rtol, atol] = get_elimit(); + pass &= ck_tile::check_err(yscale_host_dev, + yscale_host_ref, + std::string("yscale Error: Incorrect results!"), + rtol, + atol); + } + + // rowwise quantization + { + ck_tile::reference_rowwise_quantization2d( + y_host, yscale_host_ref, qy_host_ref); + + qy_buf.FromDevice(qy_host_dev.data()); + auto [rtol, atol] = get_elimit(); + + if(stride == hidden_size) + { + pass = ck_tile::check_err(qy_host_dev, + qy_host_ref, + std::string("qy Error: Incorrect results!"), + rtol, + atol); + } + else + { + for(int i_r = 0; i_r < topk * tokens; i_r++) + { + std::vector qy_host_dev_row(qy_host_dev.begin() + i_r * stride, + qy_host_dev.begin() + i_r * stride + + hidden_size); + std::vector qy_host_ref_row(qy_host_ref.begin() + i_r * stride, + qy_host_ref.begin() + i_r * stride + + hidden_size); + pass &= ck_tile::check_err(qy_host_dev_row, + qy_host_ref_row, + std::string("qy[") + std::to_string(i_r) + + std::string("] Error: Incorrect results!"), + rtol, + atol); + } + } + } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +std::vector> generate_test_cases(const std::string prec_in, + const std::string prec_out) +{ + return {{"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=99", "-h=13", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=17", "-h=16", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=1", "-h=100", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=4", "-h=128", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=80", "-h=127", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=22", "-h=255", "-stride=256"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=7", "-h=599", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=19", "-h=512", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=33", "-h=313", "-stride=1000"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=11", "-h=510", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=171", "-h=676", "-stride=818"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=12", "-h=768", "-stride=800"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=100", "-h=766", "-stride=812"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=31", "-h=1024", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=64", "-h=1000", "-stride=1004"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=8", "-h=1501", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=3", "-h=1826", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=5", "-h=2040", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=7", "-h=2734", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=1", "-h=3182", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=9", "-h=4096", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=3", "-h=8192", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=1", "-h=10547", "-stride=-1"}, + {"-prec_i=" + prec_in, "-prec_o=" + prec_out, "-t=3", "-h=17134", "-stride=-1"}}; +} + +template +bool run_test_case(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return false; + + return run(arg_parser); +} + +template +bool run_test_cases(std::vector>& test_cases) +{ + bool valid = true; + constexpr int num_args = 5; + char* argv[num_args]; + + for(std::size_t test_idx = 0; test_idx < test_cases.size(); ++test_idx) + { + + assert(num_args == test_cases[test_idx].size() && "invalid number of arguments"); + + for(int arg_idx = 0; arg_idx < num_args; ++arg_idx) + { + argv[arg_idx] = test_cases[test_idx][arg_idx].data(); + } + + valid = valid && run_test_case(num_args, argv); + + if(!valid) + break; + } + + return valid; +} diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_fp8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_fp8.cpp new file mode 100644 index 0000000000..3b5350da4b --- /dev/null +++ b/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_fp8.cpp @@ -0,0 +1,11 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant.inc" + +int main() +{ + std::vector> test_cases = generate_test_cases("bf16", "fp8"); + + return !run_test_cases(test_cases); +} diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_int8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_int8.cpp new file mode 100644 index 0000000000..4751273f1d --- /dev/null +++ b/test/ck_tile/moe_smoothquant/moe_smoothquant_bf16_int8.cpp @@ -0,0 +1,11 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant.inc" + +int main() +{ + std::vector> test_cases = generate_test_cases("bf16", "int8"); + + return !run_test_cases(test_cases); +} diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_fp8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_fp8.cpp new file mode 100644 index 0000000000..b9932dee65 --- /dev/null +++ b/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_fp8.cpp @@ -0,0 +1,11 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant.inc" + +int main() +{ + std::vector> test_cases = generate_test_cases("fp16", "fp8"); + + return !run_test_cases(test_cases); +} diff --git a/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_int8.cpp b/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_int8.cpp new file mode 100644 index 0000000000..91c53b77bc --- /dev/null +++ b/test/ck_tile/moe_smoothquant/moe_smoothquant_fp16_int8.cpp @@ -0,0 +1,11 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "moe_smoothquant.inc" + +int main() +{ + std::vector> test_cases = generate_test_cases("fp16", "int8"); + + return !run_test_cases(test_cases); +}