From dee2696501fe84f423341096d12e3a5d129533b6 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Tue, 2 Aug 2022 21:52:27 +0200 Subject: [PATCH] CGEMM examples bf16, fp32, int8 (#332) * Add int8 specialization for elementwise Add and Subtract. * CGEMM examples bf16, fp32, int8 * Add convert reference output to CDataType. * Skip BF16 data type during testing. * Lower K value to get rid of accumulation error. * Fix merge artifact. * Fix changed function name: GetElementSpaceSize() * Fix merge artifact. Co-authored-by: Adam Osewski [ROCm/composable_kernel commit: fb0dc35861056cbf08f68fd3208aa787e789230e] --- example/22_cgemm/CMakeLists.txt | 10 + example/22_cgemm/cgemm_xdl_bf16.cpp | 132 +++++++++++ example/22_cgemm/cgemm_xdl_common.hpp | 192 ++++++++++++++++ example/22_cgemm/cgemm_xdl_fp16.cpp | 211 +++--------------- example/22_cgemm/cgemm_xdl_fp32.cpp | 132 +++++++++++ example/22_cgemm/cgemm_xdl_int8.cpp | 132 +++++++++++ .../element/binary_element_wise_operation.hpp | 14 ++ include/ck/utility/dynamic_buffer.hpp | 1 + .../cpu/reference_cgemm.hpp | 7 +- 9 files changed, 649 insertions(+), 182 deletions(-) create mode 100644 example/22_cgemm/cgemm_xdl_bf16.cpp create mode 100644 example/22_cgemm/cgemm_xdl_common.hpp create mode 100644 example/22_cgemm/cgemm_xdl_fp32.cpp create mode 100644 example/22_cgemm/cgemm_xdl_int8.cpp diff --git a/example/22_cgemm/CMakeLists.txt b/example/22_cgemm/CMakeLists.txt index 048df3bba4..0bad707f24 100644 --- a/example/22_cgemm/CMakeLists.txt +++ b/example/22_cgemm/CMakeLists.txt @@ -1 +1,11 @@ +add_custom_target(example_cgemm_xdl) + +add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp) add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp) +add_example_executable(example_cgemm_xdl_fp32 cgemm_xdl_fp32.cpp) +add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp) + +add_dependencies(example_cgemm_xdl example_cgemm_xdl_bf16) +add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp16) +add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32) +add_dependencies(example_cgemm_xdl example_cgemm_xdl_int8) diff --git a/example/22_cgemm/cgemm_xdl_bf16.cpp b/example/22_cgemm/cgemm_xdl_bf16.cpp new file mode 100644 index 0000000000..5f73c684c7 --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_bf16.cpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "cgemm_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = BF16; +using BDataType = BF16; +using CDataType = BF16; +using AccDataType = F32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 8, // index_t ABlockTransferSrcScalarPerVector + 8, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 8, // index_t BBlockTransferSrcScalarPerVector + 8, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // CGEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 416; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(0); + } + + return run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/example/22_cgemm/cgemm_xdl_common.hpp b/example/22_cgemm/cgemm_xdl_common.hpp new file mode 100644 index 0000000000..d388a6e71b --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_common.hpp @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/stream_config.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; +using BF16 = ck::bhalf_t; +using INT8 = std::int8_t; +using INT32 = std::int32_t; + +template +int run_cgemm_xdl(ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + bool do_verification, + int init_method, + bool time_kernel) +{ + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1, stride})); + } + }; + + Tensor a_m_k_real(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor a_m_k_imag(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n_real(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b_k_n_imag(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_real_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_imag_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k_real: " << a_m_k_real.mDesc << std::endl; + std::cout << "a_m_k_imag: " << a_m_k_imag.mDesc << std::endl; + std::cout << "b_k_n_real: " << b_k_n_real.mDesc << std::endl; + std::cout << "b_k_n_imag: " << b_k_n_imag.mDesc << std::endl; + std::cout << "c_m_n_real: " << c_m_n_real_device_result.mDesc << std::endl; + std::cout << "c_m_n_imag: " << c_m_n_imag_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k_real.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + a_m_k_imag.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n_real.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n_imag.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + a_m_k_real.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + a_m_k_imag.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + b_k_n_real.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + b_k_n_imag.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + auto cgemm = DeviceCGemmInstance{}; + + DeviceMem a_m_k_real_device_buf(sizeof(ADataType) * a_m_k_real.mDesc.GetElementSpaceSize()); + DeviceMem a_m_k_imag_device_buf(sizeof(ADataType) * a_m_k_imag.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_real_device_buf(sizeof(BDataType) * b_k_n_real.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_imag_device_buf(sizeof(BDataType) * b_k_n_imag.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_real_device_buf(sizeof(CDataType) * + c_m_n_real_device_result.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_imag_device_buf(sizeof(CDataType) * + c_m_n_imag_device_result.mDesc.GetElementSpaceSize()); + DeviceMem workspace_device_buf(cgemm.GetWorkspaceSize(M, N, K, StrideA, StrideB, StrideC)); + + a_m_k_real_device_buf.ToDevice(a_m_k_real.mData.data()); + a_m_k_imag_device_buf.ToDevice(a_m_k_imag.mData.data()); + b_k_n_real_device_buf.ToDevice(b_k_n_real.mData.data()); + b_k_n_imag_device_buf.ToDevice(b_k_n_imag.mData.data()); + + auto a_element_op = AElementwiseOperation{}; + auto b_element_op = BElementwiseOperation{}; + auto c_element_op = CElementwiseOperation{}; + + // do GEMM + auto invoker = cgemm.MakeInvoker(); + auto argument = + cgemm.MakeArgument(static_cast(a_m_k_real_device_buf.GetDeviceBuffer()), + static_cast(a_m_k_imag_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_real_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_imag_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_real_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_imag_device_buf.GetDeviceBuffer()), + static_cast(workspace_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!cgemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_cgemm with the specified compilation parameters does " + "not support this CGEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(8) * M * N * K; + std::size_t num_btype = + std::size_t(2) * + (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << cgemm.GetTypeString() << std::endl; + + c_m_n_real_device_buf.FromDevice(c_m_n_real_device_result.mData.data()); + c_m_n_imag_device_buf.FromDevice(c_m_n_imag_device_result.mData.data()); + + if(do_verification) + { + Tensor c_m_n_real_host_result( + f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_imag_host_result( + f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + auto ref_cgemm = ReferenceCGemmInstance{}; + auto ref_invoker = ref_cgemm.MakeInvoker(); + + auto ref_argument = ref_cgemm.MakeArgument(a_m_k_real, + a_m_k_imag, + b_k_n_real, + b_k_n_imag, + c_m_n_real_host_result, + c_m_n_imag_host_result, + a_element_op, + b_element_op, + c_element_op); + + ref_invoker.Run(ref_argument); + + bool result = true; + result = ck::utils::check_err(c_m_n_real_device_result.mData, + c_m_n_real_host_result.mData, + "Verification error: incorrect results in real part!", + 1e-2f, + 1e-1f); + result = result && + ck::utils::check_err(c_m_n_imag_device_result.mData, + c_m_n_imag_host_result.mData, + "Verification error: incorrect results in imaginary part!", + 1e-2f, + 1e-1f); + return result ? 0 : 1; + } + return 0; +} diff --git a/example/22_cgemm/cgemm_xdl_fp16.cpp b/example/22_cgemm/cgemm_xdl_fp16.cpp index 8796dbfb08..7909bc1d65 100644 --- a/example/22_cgemm/cgemm_xdl_fp16.cpp +++ b/example/22_cgemm/cgemm_xdl_fp16.cpp @@ -2,43 +2,30 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. #include -#include -#include -#include -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" +#include "cgemm_xdl_common.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -template -using S = ck::Sequence; - -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using ADataType = F16; -using BDataType = F16; -using CDataType = F16; -using AccDataType = F32; +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + // clang-format off using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle ; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock // clang-format on -using ReferenceCGemmInstance = ck::tensor_operation::host:: - ReferenceCGemm; - int main(int argc, char* argv[]) { bool do_verification = true; @@ -124,155 +108,24 @@ int main(int argc, char* argv[]) } else { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); - printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; exit(0); } - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); - } - }; - - Tensor a_m_k_real(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); - Tensor a_m_k_imag(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); - Tensor b_k_n_real(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); - Tensor b_k_n_imag(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); - Tensor c_m_n_real_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor c_m_n_imag_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - - std::cout << "a_m_k_real: " << a_m_k_real.mDesc << std::endl; - std::cout << "a_m_k_imag: " << a_m_k_imag.mDesc << std::endl; - std::cout << "b_k_n_real: " << b_k_n_real.mDesc << std::endl; - std::cout << "b_k_n_imag: " << b_k_n_imag.mDesc << std::endl; - std::cout << "c_m_n_real: " << c_m_n_real_device_result.mDesc << std::endl; - std::cout << "c_m_n_imag: " << c_m_n_imag_device_result.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - a_m_k_real.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - a_m_k_imag.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b_k_n_real.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b_k_n_imag.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - break; - default: - a_m_k_real.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - a_m_k_imag.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - b_k_n_real.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - b_k_n_imag.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - - auto cgemm = DeviceCGemmInstance{}; - - DeviceMem a_m_k_real_device_buf(sizeof(ADataType) * a_m_k_real.mDesc.GetElementSpaceSize()); - DeviceMem a_m_k_imag_device_buf(sizeof(ADataType) * a_m_k_imag.mDesc.GetElementSpaceSize()); - DeviceMem b_k_n_real_device_buf(sizeof(BDataType) * b_k_n_real.mDesc.GetElementSpaceSize()); - DeviceMem b_k_n_imag_device_buf(sizeof(BDataType) * b_k_n_imag.mDesc.GetElementSpaceSize()); - DeviceMem c_m_n_real_device_buf(sizeof(CDataType) * - c_m_n_real_device_result.mDesc.GetElementSpaceSize()); - DeviceMem c_m_n_imag_device_buf(sizeof(CDataType) * - c_m_n_imag_device_result.mDesc.GetElementSpaceSize()); - DeviceMem workspace_device_buf(cgemm.GetWorkspaceSize(M, N, K, StrideA, StrideB, StrideC)); - - a_m_k_real_device_buf.ToDevice(a_m_k_real.mData.data()); - a_m_k_imag_device_buf.ToDevice(a_m_k_imag.mData.data()); - b_k_n_real_device_buf.ToDevice(b_k_n_real.mData.data()); - b_k_n_imag_device_buf.ToDevice(b_k_n_imag.mData.data()); - - auto a_element_op = PassThrough{}; - auto b_element_op = PassThrough{}; - auto c_element_op = PassThrough{}; - - // do GEMM - auto invoker = cgemm.MakeInvoker(); - auto argument = - cgemm.MakeArgument(static_cast(a_m_k_real_device_buf.GetDeviceBuffer()), - static_cast(a_m_k_imag_device_buf.GetDeviceBuffer()), - static_cast(b_k_n_real_device_buf.GetDeviceBuffer()), - static_cast(b_k_n_imag_device_buf.GetDeviceBuffer()), - static_cast(c_m_n_real_device_buf.GetDeviceBuffer()), - static_cast(c_m_n_imag_device_buf.GetDeviceBuffer()), - static_cast(workspace_device_buf.GetDeviceBuffer()), - M, - N, - K, - StrideA, - StrideB, - StrideC, - a_element_op, - b_element_op, - c_element_op); - - if(!cgemm.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "wrong! device_cgemm with the specified compilation parameters does " - "not support this CGEMM problem"); - } - - float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - - std::size_t flop = std::size_t(8) * M * N * K; - std::size_t num_btype = - std::size_t(2) * - (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " - << cgemm.GetTypeString() << std::endl; - - c_m_n_real_device_buf.FromDevice(c_m_n_real_device_result.mData.data()); - c_m_n_imag_device_buf.FromDevice(c_m_n_imag_device_result.mData.data()); - - if(do_verification) - { - Tensor c_m_n_real_host_result( - f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor c_m_n_imag_host_result( - f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - - auto ref_cgemm = ReferenceCGemmInstance{}; - auto ref_invoker = ref_cgemm.MakeInvoker(); - - auto ref_argument = ref_cgemm.MakeArgument(a_m_k_real, - a_m_k_imag, - b_k_n_real, - b_k_n_imag, - c_m_n_real_host_result, - c_m_n_imag_host_result, - a_element_op, - b_element_op, - c_element_op); - - ref_invoker.Run(ref_argument); - - ck::utils::check_err(c_m_n_real_device_result.mData, - c_m_n_real_host_result.mData, - "Verification error: incorrect results in real part!", - 1e-2f, - 1e-1f); - ck::utils::check_err(c_m_n_imag_device_result.mData, - c_m_n_imag_host_result.mData, - "Verification error: incorrect results in imaginary part!", - 1e-2f, - 1e-1f); - } - - return 0; + return run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); } diff --git a/example/22_cgemm/cgemm_xdl_fp32.cpp b/example/22_cgemm/cgemm_xdl_fp32.cpp new file mode 100644 index 0000000000..53b6afbc89 --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_fp32.cpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "cgemm_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = F32; +using BDataType = F32; +using CDataType = F32; +using AccDataType = F32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 4, // index_t ABlockTransferSrcScalarPerVector + 4, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 4, // index_t BBlockTransferSrcScalarPerVector + 4, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 16, 1, 16>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // CGEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(0); + } + + return run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/example/22_cgemm/cgemm_xdl_int8.cpp b/example/22_cgemm/cgemm_xdl_int8.cpp new file mode 100644 index 0000000000..be91877387 --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_int8.cpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "cgemm_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = INT8; +using BDataType = INT8; +using CDataType = INT8; +using AccDataType = INT32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 16, // index_t ABlockTransferSrcScalarPerVector + 16, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 8, // index_t BBlockTransferSrcScalarPerVector + 8, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // CGEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(0); + } + + return run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp index 0466702aba..f8aea82471 100644 --- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -51,6 +51,13 @@ struct Add const float y_tmp = x1_tmp + x2_tmp; y = ck::type_convert(y_tmp); } + + template <> + __host__ __device__ constexpr void + operator()(int8_t& y, const int8_t& x0, const int8_t& x1) const + { + y = x0 + x1; + }; }; struct Subtract @@ -88,6 +95,13 @@ struct Subtract const float y_tmp = x1_tmp - x2_tmp; y = ck::type_convert(y_tmp); } + + template <> + __host__ __device__ constexpr void + operator()(int8_t& y, const int8_t& x0, const int8_t& x1) const + { + y = x0 - x1; + }; }; struct Bilinear diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index ad88655879..c6f0d299ef 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -4,6 +4,7 @@ #pragma once #include "ck/ck.hpp" +#include "ck/utility/data_type.hpp" #include "enable_if.hpp" #include "c_style_pointer_cast.hpp" #include "amd_buffer_addressing.hpp" diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp index ce0e337498..b0149d88fd 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp @@ -6,8 +6,9 @@ #include #include -#include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { namespace tensor_operation { @@ -91,7 +92,7 @@ struct ReferenceCGemm : public device::BaseOperator v_c_real += v_a_real * v_b_real - v_a_imag * v_b_imag; } - arg.c_m_n_real_(m, n) = v_c_real; + arg.c_m_n_real_(m, n) = ck::type_convert(v_c_real); }; auto f_mk_kn_mn_imag = [&](auto m, auto n) { @@ -107,7 +108,7 @@ struct ReferenceCGemm : public device::BaseOperator v_c_imag += v_a_real * v_b_imag + v_a_imag * v_b_real; } - arg.c_m_n_imag_(m, n) = v_c_imag; + arg.c_m_n_imag_(m, n) = ck::type_convert(v_c_imag); }; make_ParallelTensorFunctor(f_mk_kn_mn_real,