From 17ba91c926c4501d17751ec2d295d6388affc517 Mon Sep 17 00:00:00 2001 From: qin letao Date: Fri, 25 Apr 2025 07:18:36 +0000 Subject: [PATCH] rm example for fp16 bpreshuffle --- .../65_gemm_multiply_multiply/CMakeLists.txt | 1 - ...multiply_multiply_xdl_fp16_bpreshuffle.cpp | 345 ------------------ 2 files changed, 346 deletions(-) delete mode 100644 example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 1c3946d01b..a646bf4474 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -2,7 +2,6 @@ add_example_executable(example_gemm_multiply_multiply_xdl_fp8 gemm_multiply_mult add_example_executable(example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_multiply_multiply_xdl_fp8_ab_scale.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle_padding gemm_multiply_multiply_xdl_fp8_bpreshuffle_padding.cpp) -add_example_executable(example_gemm_multiply_multiply_xdl_fp16_bpreshuffle gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp) add_example_executable(example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp) # add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp) diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp deleted file mode 100644 index 82060a4da3..0000000000 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp +++ /dev/null @@ -1,345 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" - -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/literals.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" -#include "ck/library/utility/check_err.hpp" - -#include "ck/utility/blkgemmpipe_scheduler.hpp" - -template -using S = ck::Sequence; - -using F16 = ck::half_t; -using BF16 = ck::bhalf_t; -using FP8 = ck::f8_t; -using F32 = float; - -using Row = ck::tensor_layout::gemm::RowMajor; -using Col = ck::tensor_layout::gemm::ColumnMajor; - -using A0DataType = F16; -using B0DataType = F16; -using AccDataType = F32; -using CShuffleDataType = F32; -using D0DataType = F32; -using D1DataType = F32; -using DsDataType = ck::Tuple; -using EDataType = F16; - -using A0Layout = Row; -using B0Layout = Col; -using D0Layout = Row; -using D1Layout = Col; -using DsLayout = ck::Tuple; -using ELayout = Row; - -template -void preShuffleBuffer(const DataType* src, DataType* dst, int N, int K, int NXdl) -{ - int KPack = 16 / sizeof(DataType); - int NLane = NXdl; - int KLane = 64 / NLane; - - int K0 = K / (KLane * KPack); - // K -> K0 KLane KPack - // N -> N0 NLane - // N, K -> N0 K0 KLane NLane KPack - int tempk; - for(int n = 0; n < N; ++n) - { - for(int k = 0; k < K; ++k) - { - int n0 = n / NLane; - int n1 = n % NLane; - - int k0 = k / (KLane * KPack); - tempk = k % (KLane * KPack); - int k1 = tempk / KPack; - int k2 = tempk % KPack; - - int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane + - k1 * KPack * NLane + n1 * KPack + k2; - - dst[outputIndex] = src[n * K + k]; - } - } -} -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using AElementOp = PassThrough; -using BElementOp = PassThrough; -using CDEElementOp = ck::tensor_operation::element_wise::MultiplyMultiply; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; - -using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle - // clang-format off - < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, - AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, - 128, 128, 64, - 8, 8, - 32, 32, - 2, 2, - S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, - S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, - 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, F16>; -// clang-format on - -int main(int argc, char* argv[]) -{ - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; - - // GEMM shape - ck::index_t M = 3840; - ck::index_t N = 4096; - ck::index_t K = 4096; - - ck::index_t StrideA = K; - ck::index_t StrideB = K; - ck::index_t StrideD = 0; - ck::index_t StrideE = N; - - ck::index_t KBatch = 1; - - ck::index_t Warmup = 50; - ck::index_t Repeat = 50; - - if(argc == 1) - { - // use default case - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - } - else if(argc == 12) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - - M = std::stoi(argv[4]); - N = std::stoi(argv[5]); - K = std::stoi(argv[6]); - - StrideA = std::stoi(argv[7]); - StrideB = std::stoi(argv[8]); - StrideD = std::stoi(argv[9]); - StrideE = std::stoi(argv[10]); - - KBatch = std::stoi(argv[11]); - } - else if(argc == 14) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - - M = std::stoi(argv[4]); - N = std::stoi(argv[5]); - K = std::stoi(argv[6]); - - StrideA = std::stoi(argv[7]); - StrideB = std::stoi(argv[8]); - StrideD = std::stoi(argv[9]); - StrideE = std::stoi(argv[10]); - - KBatch = std::stoi(argv[11]); - - Warmup = std::stoi(argv[12]); - Repeat = std::stoi(argv[13]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: time kernel (0=no, 1=yes)\n"); - printf( - "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n"); - printf("arg10 to 11: Warmup, Repeat\n"); - exit(0); - } - - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - using namespace ck::literals; - - if(std::is_same::value) - { - return HostTensorDescriptor({row, col}, {stride, 1_uz}); - } - else - { - return HostTensorDescriptor({row, col}, {1_uz, stride}); - } - }; - - Tensor a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{})); - Tensor b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); - Tensor b0_preshuffled( - f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); // use laout only for size - Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{})); - Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{})); - Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); - Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); - - std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl; - std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl; - std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl; - std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl; - std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - a0_m_k.GenerateTensorValue(GeneratorTensor_3{-2, 2}); - b0_k_n.GenerateTensorValue(GeneratorTensor_3{-2, 2}); - d0_m_n.GenerateTensorValue(GeneratorTensor_3{-2, 2}); - d1_m_n.GenerateTensorValue(GeneratorTensor_3{-2, 2}); - break; - case 2: - a0_m_k.GenerateTensorValue(GeneratorTensor_1{}); - b0_k_n.GenerateTensorValue(GeneratorTensor_1{}); - d0_m_n.GenerateTensorValue(GeneratorTensor_1{}); - d1_m_n.GenerateTensorValue(GeneratorTensor_1{}); - break; - default: - a0_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - b0_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - d0_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - d1_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - } - DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize()); - DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize()); - DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize()); - DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize()); - DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); - - a0_device_buf.ToDevice(a0_m_k.mData.data()); - d0_device_buf.ToDevice(d0_m_n.mData.data()); - d1_device_buf.ToDevice(d1_m_n.mData.data()); - e_device_buf.ToDevice(e_m_n_device_result.mData.data()); - - auto a_element_op = AElementOp{}; - auto b_element_op = BElementOp{}; - auto cde_element_op = CDEElementOp{}; - - constexpr ck::index_t NumDTensor = DsDataType::Size(); - - constexpr auto I0 = ck::Number<0>{}; - - // do GEMM - auto device_op = DeviceOpInstance{}; - - int NPerXdl = device_op.GetPreShuffleParameters(); - - preShuffleBuffer(b0_k_n.mData.data(), b0_preshuffled.mData.data(), N, K, NPerXdl); - - b0_device_buf.ToDevice(b0_preshuffled.mData.data()); - - auto invoker = device_op.MakeInvoker(); - auto argument = - device_op.MakeArgument(a0_device_buf.GetDeviceBuffer(), - b0_device_buf.GetDeviceBuffer(), - std::array{d0_device_buf.GetDeviceBuffer(), - d1_device_buf.GetDeviceBuffer()}, - e_device_buf.GetDeviceBuffer(), - M, - N, - K, - StrideA, - StrideB, - std::array{I0, I0}, - StrideE, - KBatch, - a_element_op, - b_element_op, - cde_element_op); - - if(!device_op.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "wrong! device_gemm with the specified compilation parameters does " - "not support this GEMM problem"); - } - - size_t total_size = - (M * K * sizeof(A0DataType) + N * K * sizeof(B0DataType) + M * sizeof(D0DataType) + - N * sizeof(D1DataType) + M * N * sizeof(EDataType)); - int rotate_buf_num = - ck::math::min(size_t(Repeat), ck::math::integer_divide_ceil(512 * 1024 * 1024, total_size)); - - float ave_time = invoker.Run( - argument, StreamConfig{nullptr, time_kernel, 0, Warmup, Repeat, true, rotate_buf_num}); - - std::size_t flop = std::size_t(2) * M * N * K; - std::size_t num_btype = - sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N; - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - if(do_verification) - { - invoker.Run(argument, StreamConfig{nullptr, false}); - - e_device_buf.FromDevice(e_m_n_device_result.mData.data()); - - Tensor c_m_n({M, N}); - - using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; - auto ref_gemm = ReferenceGemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = ref_gemm.MakeArgument( - a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{}); - - ref_invoker.Run(ref_argument); - - for(int m = 0; m < M; ++m) - { - for(int n = 0; n < N; ++n) - { - cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n)); - } - } - - e_device_buf.FromDevice(e_m_n_device_result.mData.data()); - - return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; - } - - return 0; -}