From 7efc5b518dd7c8bcd4778015d9cfb36be49c8d2e Mon Sep 17 00:00:00 2001 From: zjing14 Date: Thu, 5 Oct 2023 11:41:01 -0500 Subject: [PATCH] remove example 60 (#963) Co-authored-by: Jing Zhang [ROCm/composable_kernel commit: 570ff3ddbe52d6e1d5e89284d8f3456c0ba34c23] --- example/60_gemm_multiABD/CMakeLists.txt | 10 - .../gemm_multiABD_xdl_fp16.cpp | 361 ------------------ 2 files changed, 371 deletions(-) delete mode 100644 example/60_gemm_multiABD/CMakeLists.txt delete mode 100644 example/60_gemm_multiABD/gemm_multiABD_xdl_fp16.cpp diff --git a/example/60_gemm_multiABD/CMakeLists.txt b/example/60_gemm_multiABD/CMakeLists.txt deleted file mode 100644 index 9e2f70649f..0000000000 --- a/example/60_gemm_multiABD/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) -list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942) -set(target 0) -foreach(gpu IN LISTS GPU_TARGETS) - if(gpu IN_LIST gpu_list2 AND target EQUAL 0) - add_example_executable(example_gemm_multiABD_xdl_fp16 gemm_multiABD_xdl_fp16.cpp) - set(target 1) - endif() -endforeach() -endif() diff --git a/example/60_gemm_multiABD/gemm_multiABD_xdl_fp16.cpp b/example/60_gemm_multiABD/gemm_multiABD_xdl_fp16.cpp deleted file mode 100644 index 23d41d7cc4..0000000000 --- a/example/60_gemm_multiABD/gemm_multiABD_xdl_fp16.cpp +++ /dev/null @@ -1,361 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/literals.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/library/utility/check_err.hpp" - -template -using S = ck::Sequence; - -using F16 = ck::half_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using ADataType = F16; -using BDataType = F16; -using AccDataType = F32; -using CShuffleDataType = F32; -using DDataType = F16; -using EDataType = F16; - -using ALayout = Row; -using BLayout = Col; -using DLayout = Row; -using ELayout = Row; - -struct AddScale -{ - static constexpr auto I0 = ck::Number<0>{}; - static constexpr auto I1 = ck::Number<1>{}; - static constexpr auto I2 = ck::Number<2>{}; - static constexpr auto I3 = ck::Number<3>{}; - - __host__ __device__ constexpr void - operator()(ck::half4_t& a, const ck::half4_t& a0, const ck::half4_t& a1) const - { - const auto a0_v_t = ck::vector_type{a0}; - const auto a1_v_t = ck::vector_type{a1}; - - auto r_v_t = ck::vector_type{}; - - r_v_t.AsType()(I0) = - scale * (a0_v_t.AsType()[I0] + a1_v_t.AsType()[I0]); - r_v_t.AsType()(I1) = - scale * (a0_v_t.AsType()[I1] + a1_v_t.AsType()[I1]); - r_v_t.AsType()(I2) = - scale * (a0_v_t.AsType()[I2] + a1_v_t.AsType()[I2]); - r_v_t.AsType()(I3) = - scale * (a0_v_t.AsType()[I3] + a1_v_t.AsType()[I3]); - - a = r_v_t.AsType()[I0]; - } - - __host__ __device__ constexpr void - operator()(ck::half_t& a, const ck::half_t& a0, const ck::half_t& a1) const - { - a = scale * (a0 + a1); - } - - static constexpr ck::index_t vec_len = 4; - - float scale = 1.0; -}; - -struct AlphaBetaAdd -{ - AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; - - template - __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const; - - template <> - __host__ __device__ constexpr void operator()( - ck::half_t& e, const float& c, const ck::half_t& d) const - { - e = ck::type_convert(alpha_ * c + beta_ * ck::type_convert(d)); - }; - - float alpha_; - float beta_; -}; - -using AElementOp = AddScale; -using BElementOp = PassThrough; -using CDEElementOp = AlphaBetaAdd; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl_CShuffle< - ck::Tuple, - ck::Tuple, - ck::Tuple, - ELayout, - ck::Tuple, - ck::Tuple, - AccDataType, - CShuffleDataType, - ck::Tuple, - EDataType, - AElementOp, - BElementOp, - CDEElementOp, - GemmSpec, - 1, - 256, - 256, - 128, - 32, - 8, - 8, - 32, - 32, - 4, - 2, - S<4, 64, 1>, - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - 1, - S<4, 64, 1>, - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - 1, - 1, - 1, - S<1, 32, 1, 8>, - 8>; - -int main(int argc, char* argv[]) -{ - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; - - // GEMM shape - ck::index_t M = 3840; - ck::index_t N = 4096; - ck::index_t K = 4096; - - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideD = 4096; - ck::index_t StrideE = 4096; - - float alpha = 1.0f; - float beta = 1.0f; - - if(argc == 1) - { - // use default case - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - } - else if(argc == 6) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - - alpha = std::stof(argv[4]); - beta = std::stof(argv[5]); - } - else if(argc == 13) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - - M = std::stoi(argv[4]); - N = std::stoi(argv[5]); - K = std::stoi(argv[6]); - - StrideA = std::stoi(argv[7]); - StrideB = std::stoi(argv[8]); - StrideD = std::stoi(argv[9]); - StrideE = std::stoi(argv[10]); - - alpha = std::stof(argv[11]); - beta = std::stof(argv[12]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: time kernel (0=no, 1=yes)\n"); - printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, " - "beta\n"); - exit(0); - } - - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - using namespace ck::literals; - - if(std::is_same::value) - { - return HostTensorDescriptor({row, col}, {stride, 1_uz}); - } - else - { - return HostTensorDescriptor({row, col}, {1_uz, stride}); - } - }; - - Tensor a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); - Tensor a1_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); - Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); - Tensor d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{})); - Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); - Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); - - std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl; - std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl; - std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; - std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; - std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - a0_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - a1_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - d_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; - default: - a0_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - a1_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - d_m_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - - DeviceMem a0_device_buf(sizeof(ADataType) * a0_m_k.mDesc.GetElementSpaceSize()); - DeviceMem a1_device_buf(sizeof(ADataType) * a1_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); - DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize()); - DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); - - a0_device_buf.ToDevice(a0_m_k.mData.data()); - a1_device_buf.ToDevice(a1_m_k.mData.data()); - b_device_buf.ToDevice(b_k_n.mData.data()); - d_device_buf.ToDevice(d_m_n.mData.data()); - e_device_buf.ToDevice(e_m_n_device_result.mData.data()); - - auto a_element_op = AElementOp{0.2}; - auto b_element_op = BElementOp{}; - auto cde_element_op = CDEElementOp{alpha, beta}; - - // do GEMM - auto device_op = DeviceOpInstance{}; - auto invoker = device_op.MakeInvoker(); - auto argument = - device_op.MakeArgument(std::array{a0_device_buf.GetDeviceBuffer(), - a1_device_buf.GetDeviceBuffer()}, - std::array{b_device_buf.GetDeviceBuffer()}, - std::array{d_device_buf.GetDeviceBuffer()}, - e_device_buf.GetDeviceBuffer(), - M, - N, - K, - std::array{StrideA, StrideA}, - std::array{StrideB}, - std::array{StrideD}, - StrideE, - a_element_op, - b_element_op, - cde_element_op); - - if(!device_op.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "wrong! device_gemm with the specified compilation parameters does " - "not support this GEMM problem"); - } - - float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - - std::size_t flop = std::size_t(2) * M * N * K; - std::size_t num_btype = - sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - e_device_buf.FromDevice(e_m_n_device_result.mData.data()); - - if(do_verification) - { - Tensor c_m_n({M, N}); - - Tensor a_m_k({M, K}); - - for(int m = 0; m < M; ++m) - { - for(int k = 0; k < K; ++k) - { - a_element_op(a_m_k(m, k), a0_m_k(m, k), a1_m_k(m, k)); - } - } - - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; - auto ref_gemm = ReferenceGemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = - ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, b_element_op, PassThrough{}); - - ref_invoker.Run(ref_argument); - - for(int m = 0; m < M; ++m) - { - for(int n = 0; n < N; ++n) - { - cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n)); - } - } - - e_device_buf.FromDevice(e_m_n_device_result.mData.data()); - - return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; - } - - return 0; -}