From 8d2fae714b45a13b47430321092ff1cf6373bbd4 Mon Sep 17 00:00:00 2001 From: coderfeli Date: Tue, 4 Mar 2025 02:33:22 +0000 Subject: [PATCH] rm unrelated files --- .../65_gemm_multiply_multiply/CMakeLists.txt | 16 +- ...emm_multiply_multiply_xdl_fp8_ab_scale.cpp | 72 +- .../65_gemm_multiply_multiply/moe_gemm1.cpp | 34 +- .../ck_tile/13_moe_sorting/moe_sorting.cpp | 5 +- ..._pipeline_xdlops_b_preshuffle_selector.hpp | 2 - .../blockwise_gemm_pipeline_xdlops_base.hpp | 5 +- ...kwise_gemm_pipeline_xdlops_v1_ab_scale.hpp | 615 +++--------------- ...kwise_gemm_pipeline_xdlops_v2_ab_scale.hpp | 93 +-- ...kwise_gemm_pipeline_xdlops_v3_ab_scale.hpp | 153 +---- .../gpu/device/device_gemm_multiple_d.hpp | 2 + .../gpu/device/device_gemm_v2.hpp | 36 +- ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp | 2 +- ...mm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp | 195 ++++-- .../element/unary_element_wise_operation.hpp | 81 --- .../threadwise_tensor_slice_transfer.hpp | 73 +-- .../threadwise_tensor_slice_transfer_v7r3.hpp | 12 - 16 files changed, 309 insertions(+), 1087 deletions(-) diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index 1108d8dd65..a9e886d6db 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -1,19 +1,7 @@ -list(APPEND TILE_EXAPMLE_BLOCKSCALE_COMPILE_OPTIONS -mllvm -greedy-reverse-local-assignment=1) -list(APPEND TILE_EXAPMLE_BLOCKSCALE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) - -add_example_executable(example_moe_gemm1 moe_gemm1.cpp) -add_example_executable(example_moe_gemm2 moe_gemm2.cpp) -target_compile_options(example_moe_gemm1 PRIVATE ${TILE_EXAPMLE_BLOCKSCALE_COMPILE_OPTIONS}) -target_compile_options(example_moe_gemm2 PRIVATE ${TILE_EXAPMLE_BLOCKSCALE_COMPILE_OPTIONS}) - - add_example_executable(example_gemm_multiply_multiply_xdl_fp8 gemm_multiply_multiply_xdl_fp8.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_fp8_ab_scale gemm_multiply_multiply_xdl_fp8_ab_scale.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp) add_example_executable(example_gemm_add_add_xdl_fp16 gemm_add_add_xdl_fp16.cpp) add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_multiply_xdl_int8.cpp) -add_example_executable(example_moe_pk_i4_gemm1 moe_pk_i4_gemm1.cpp) -set(EXAMPLE_COMPILE_OPTIONS) -list(APPEND EXAMPLE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker -g -fverbose-asm) -target_compile_options(example_moe_pk_i4_gemm1 PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) -add_example_executable(example_moe_pk_i4_gemm2 moe_pk_i4_gemm2.cpp) +add_example_executable(example_moe_gemm1 moe_gemm1.cpp) +add_example_executable(example_moe_gemm2 moe_gemm2.cpp) \ No newline at end of file diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp index b54ba5ddfb..9b7849a654 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp @@ -55,7 +55,7 @@ using CDEElementOp = PassThrough; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; -static constexpr ck::index_t Scale_Block_M = 1; +static constexpr ck::index_t Scale_Block_M = 128; static constexpr ck::index_t Scale_Block_N = 128; static constexpr ck::index_t Scale_Block_K = 128; @@ -65,14 +65,14 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_ A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, Scale_Block_M, Scale_Block_N, Scale_Block_K, - 16, 128, - 256, 16, 16, + 128, 128, + 128, 16, 16, 16, 16, - 1, 2, - S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, - S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, - 1, 2, S<1, 16, 1, 16>, S<8>, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>; + 4, 4, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, + 1, 2, S<1, 32, 1, 8>, S<8, 8, 1>, + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>; // clang-format on int main(int argc, char* argv[]) @@ -80,12 +80,11 @@ int main(int argc, char* argv[]) bool do_verification = true; int init_method = 1; bool time_kernel = false; - bool flush_cache = true; // GEMM shape - ck::index_t M = 128; - ck::index_t N = 1024; - ck::index_t K = 1024; + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; ck::index_t StrideA = K; ck::index_t StrideB = K; @@ -101,7 +100,7 @@ int main(int argc, char* argv[]) init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); } - else if(argc == 8) + else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); @@ -111,19 +110,16 @@ int main(int argc, char* argv[]) N = std::stoi(argv[5]); K = std::stoi(argv[6]); - flush_cache = std::stoi(argv[7]); - - StrideA = K; - StrideB = K; - StrideE = N; + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg3: time kernel (0=no, 1=yes)\n"); - printf("arg4 to 6: M, N, K\n"); - printf("arg7: flush both I$ and L2$ (0=no, 1=yes)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"); exit(0); } @@ -186,15 +182,9 @@ int main(int argc, char* argv[]) b1_k_n.GenerateTensorValue(GeneratorTensor_1{}); break; case 4: - a0_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b0_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + a0_m_k.GenerateTensorValue(GeneratorTensor_1{}); + b0_k_n.GenerateTensorValue(GeneratorTensor_1{}); a1_m_k.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); - b1_k_n.GenerateTensorValue(GeneratorTensor_1{}); - break; - case 5: - a0_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b0_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - a1_m_k.GenerateTensorValue(GeneratorTensor_1{}); b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); break; default: @@ -204,16 +194,6 @@ int main(int argc, char* argv[]) b1_k_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); } #endif -#if 0 - for(int im =0; im< (M + Scale_Block_M - 1) / Scale_Block_M; im++){ - float row_sum = .0; - for(int ik =0; ik< (K + Scale_Block_K - 1) / Scale_Block_K; ik++){ - printf("%lf ",a1_m_k(im, ik)); - row_sum += a1_m_k(im, ik); - } - printf("sum: %lf\n", row_sum * 128); - } -#endif DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize()); DeviceMem a1_device_buf(sizeof(A1DataType) * a1_m_k.mDesc.GetElementSpaceSize()); @@ -259,24 +239,12 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50}); + std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N; - float ave_time = .0; - - if(flush_cache) - { - int rotating_buf = (512 * 1024 * 1024 + num_btype - 1) / num_btype; - - ave_time = invoker.Run(argument, - StreamConfig{nullptr, time_kernel, 0, 50, 100, true, rotating_buf}); - } - else - { - ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 100}); - } - float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time; diff --git a/example/65_gemm_multiply_multiply/moe_gemm1.cpp b/example/65_gemm_multiply_multiply/moe_gemm1.cpp index c8897e1a75..df9b58267a 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1.cpp @@ -148,14 +148,6 @@ static constexpr ck::index_t D1Vec = 1; // using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3 using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off -///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -///######| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| -///######| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| -///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| -///###### RCR - // kernel 1: 256->32x128x128 - // < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 32, 128, 128, 16, 16, 32, 32, 1, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>; - // < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 32, 128, 256, 16, 16, 32, 32, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, EDataType>; < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, //threadnum, mblock, nblock, kblock @@ -167,18 +159,13 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // mn_xdlperwave MXDLPerWave, NXDLPerWave, // a,b: loadtranfer cluster, cluster order, srcorder,VECDIM, srcpervec, dstpervec, lds_extra - // S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, - // S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0, - S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0, + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0, // CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| // MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| // PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| 4, 1, S<1, 32, 1, 8>, S, ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, Nswizzle, true, A0DataType>; - // kernel 2: 128->32x128x128 - // < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 128, 32, 128, 128, 16, 16, 32, 32, 1, 2, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 16, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>; -// DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle< Row, Col, Tuple, Row, F8, F8, Tuple, F16, F32, F32, PassThrough, PassThrough, MultiplyMultiply, GemmSpec, 256, 128, 128, 128, 16, 16, 32, 32, 4, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>, // clang-format on @@ -188,10 +175,6 @@ int main(int argc, char* argv[]) int init_method = 1; bool time_kernel = true; -// tokens = 1 -// topk = 1 -// experts = 8 -// per expert: // GEMM shape ck::index_t N = 14336 * 2; ck::index_t K = 4096; @@ -322,9 +305,6 @@ int main(int argc, char* argv[]) d0_t_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); d1_e_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); } - // d0_t_n.GenerateTensorValue(GeneratorTensor_1{}); - // d1_e_n.GenerateTensorValue(GeneratorTensor_1{}); - // b0_e_n_k.GenerateTensorValue(GeneratorTensor_1{}); DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) * sorted_token_ids.mDesc.GetElementSpaceSize()); DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize()); DeviceMem max_token_id_dev(sizeof(ck::index_t) * max_token_id.mDesc.GetElementSpaceSize()); @@ -333,9 +313,9 @@ int main(int argc, char* argv[]) DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize()); DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize()); DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize()); - a0_t_k.savetxt("a.txt"); - d0_t_n.savetxt("d0_t_n.txt", "int"); - d1_e_n.savetxt("d1_e_n.txt", "int"); + // a0_t_k.savetxt("a.txt"); + // d0_t_n.savetxt("d0_t_n.txt", "int"); + // d1_e_n.savetxt("d1_e_n.txt", "int"); sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data()); expert_ids_dev.ToDevice(expert_ids.mData.data()); max_token_id_dev.ToDevice(max_token_id.mData.data()); @@ -429,7 +409,6 @@ int main(int argc, char* argv[]) const int fuse_t = sorted_token_ids.mData[m]; const int t = fuse_t & 0xffffff; const int topk_id = (fuse_t & 0xff000000) >> 24; - // printf("m %d fuset %d %d %d\n",m, fuse_t, t, topk_id); if (t >= tokens) { @@ -439,13 +418,12 @@ int main(int argc, char* argv[]) for(int n = 0; n < N; ++n) { cde_element_op(e_t_n_host_result(t, topk_id, n), c_t_k_n(t, topk_id, n), d0_t_n(t, n), d1_e_n(e, n)); - // printf("m %d n %d topk %d token %d %f %f\n",m, n,topk_id, t, e_t_n_host_result(t, topk_id, n), c_t_k_n(t, topk_id, n)); } } e_device_buf.FromDevice(e_t_n_device_result.mData.data()); - e_t_n_device_result.savetxt("out.txt"); - e_t_n_host_result.savetxt("ref.txt"); + // e_t_n_device_result.savetxt("out.txt"); + // e_t_n_host_result.savetxt("ref.txt"); return ck::utils::check_err( e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2) ? 0 diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp index 69d211236f..f00d948f25 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp @@ -125,7 +125,7 @@ bool test_moe_sorting(ck_tile::ArgParser args) ck_tile::HostTensor sorted_ids_host({max_output_ids}, {1}); ck_tile::HostTensor sorted_weights_host({max_output_ids}, {1}); ck_tile::HostTensor sorted_expert_ids_host({max_output_ids / unit_size}, {1}); - ck_tile::HostTensor sorted_id_cnt_host({1 + max_output_ids / unit_size}, {1}); + ck_tile::HostTensor sorted_id_cnt_host({1}, {1}); ck_tile::HostTensor moe_buf_host({moe_buf_size}); ck_tile::FillUniformDistribution{-.5f, .5f}(weights_host); @@ -268,8 +268,7 @@ bool test_moe_sorting(ck_tile::ArgParser args) { moe_buf_dev.FromDevice(moe_buf_host.data()); } - sorted_expert_ids_host.savetxt("sorted_expert_ids_host.txt","int"); - sorted_id_cnt_host.savetxt("sorted_id_cnt_host.txt","int"); + bool rtn = true; if(validate) { diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp index 0e647a10cb..0fbe7d63a9 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp @@ -4,10 +4,8 @@ #pragma once #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp" -#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp" -#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v3.hpp" namespace ck { diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp index f3e2a67ed7..c6a9d60e34 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp @@ -46,9 +46,8 @@ struct BlockwiseGemmXdlops_pipeline_base static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0); static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0); static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2); - // static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2); - static constexpr index_t B_K1 = BBlockTransferSrcScalarPerVector; - + static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2); + static constexpr auto xdlops_gemm = XdlopsGemm{}; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp index 8375e81fa0..821bbb0051 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_ab_scale.hpp @@ -7,10 +7,10 @@ namespace ck { -// Compute optimized pipeline -// GlobalPrefetchStages: 2 +// Naive pipeline with lowest resource request per WGP +// GlobalPrefetchStages: 1 // LocalPreFillStages: 1 -// LocalPreFetchStages: 1 +// LocalPreFetchStages: 0 // LocalSharedMemoryBuffer: 1 template + KPack> { using Base = BlockwiseGemmXdlops_pipeline_base; - using Base::A_K1; - using Base::B_K1; + KPack>; using Base::I0; - using Base::I1; using Base::KRepeat; using Base::xdlops_gemm; - using typename Base::HotLoopInstList; using Base::CalculateCThreadOriginDataIndex; using Base::CalculateCThreadOriginDataIndex8D; @@ -137,43 +131,19 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale PrefetchStages; @@ -181,116 +151,11 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale - // sizeof(ComputeDataType) / sizeof(BDataType) - // ? sizeof(ComputeDataType) / sizeof(ADataType) - // : sizeof(ComputeDataType) / sizeof(BDataType); - constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma); - constexpr auto num_mfma_per_issue = - num_mfma_stage1 / (num_buffer_load_inst_a + num_buffer_load_inst_b); - constexpr auto num_dswrite_per_issue_a = num_ds_write_inst_a / num_buffer_load_inst_a; - constexpr auto num_dswrite_per_issue_b = num_ds_write_inst_b / num_buffer_load_inst_b; - - static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) { - ignore = i; - static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) { - ignore = idswrite; - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - }); - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - __builtin_amdgcn_sched_group_barrier( - 0x008, num_mfma_per_issue - num_dswrite_per_issue_a, 0); // MFMA - }); - static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) { - ignore = i; - static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) { - ignore = idswrite; - __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - }); - __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read - __builtin_amdgcn_sched_group_barrier( - 0x008, num_mfma_per_issue - num_dswrite_per_issue_b, 0); // MFMA - }); - - // stage 2 - static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) { - if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >= - ds_read_a_mfma_rate) - { - __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier(0x100, - num_ds_read_inst_a - (num_dsread_a_mfma - 1) * - ds_read_a_mfma_rate, - 0); // DS read - } - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - }); - - static_for<0, num_dsread_b_mfma, 1>{}([&](auto i) { - if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >= - ds_read_b_mfma_rate) - { - __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0); // DS read - } - else - { - __builtin_amdgcn_sched_group_barrier(0x100, - num_ds_read_inst_b - (num_dsread_b_mfma - 1) * - ds_read_b_mfma_rate, - 0); // DS read - } - __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA - }); + ignore = num_loop; + return TailNumber::Full; } template ( a_thread_desc_.GetElementSpaceSize()); auto b_thread_buf = make_static_buffer( @@ -359,8 +223,6 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale( b_scale_thread_desc.GetElementSpaceSize()); - auto c_scale_thread_buf = make_static_buffer( - c_scale_thread_desc.GetElementSpaceSize()); // Global prefetch 1 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); @@ -369,26 +231,11 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<0>{})); - }); - - if constexpr(NumKBlockPerScale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<1>{})); - } + a_scale_thread_copy.Run(a_scale_grid_desc, + a_scale_grid_buf, + a_scale_thread_desc, + make_tuple(I0, I0), + a_scale_thread_buf); b_scale_thread_copy.Run(b_scale_grid_desc, b_scale_grid_buf, @@ -396,101 +243,17 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale{}); - constexpr auto num_scale_m_block = CScaleThreadDesc{}.GetLength(Number<1>{}); - constexpr auto num_scale_n_block = CScaleThreadDesc{}.GetLength(Number<2>{}); - - static_for<0, num_scale_m_block, 1>{}([&](auto m0) { - static_for<0, num_scale_n_block, 1>{}([&](auto n0) { - static_for<0, num_scale_k_block, 1>{}([&](auto k0) { - constexpr index_t c_offset = - CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0)); - constexpr index_t a_offset = - AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0)); - constexpr index_t b_offset = - BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0)); - - c_scale_thread_buf(Number{}) = - a_scale_thread_buf[Number{}] * - b_scale_thread_buf[Number{}]; - }); - }); - }); - // Local prefill 1 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - // Global prefetch 2 - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); - b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); - - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - - static_for<0, MRepeat, 1>{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<0>{})); - }); - - if constexpr(NumKBlockPerScale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<1>{})); - } - - b_scale_thread_copy.Run(b_scale_grid_desc, - b_scale_grid_buf, - b_scale_thread_desc, - make_tuple(I0, I0), - b_scale_thread_buf); - - b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step); - // Initialize C c_thread_buf.Clear(); - StaticBufferTupleOfVector - c_thread_buf_per_scale; - - // Local prefetch 1 - block_sync_lds(); - static_for<0, KRepeat, 1>{}([&](auto k0) { - static_for<0, MRepeat, 1>{}([&](auto m0) { - a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, - make_tuple(m0, I0, I0, Number{}), - a_block_buf, - a_thread_desc_, - make_tuple(m0, I0, k0, I0), - a_thread_buf); - }); - static_for<0, NRepeat, 1>{}([&](auto n0) { - b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, - make_tuple(n0, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(n0, I0, k0, I0), - b_thread_buf); - }); - }); - - __builtin_amdgcn_sched_barrier(0); + auto c_thread_buf_per_scale = remove_cvref_t(); // main body if constexpr(HasMainLoop) @@ -498,85 +261,13 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale{}([&](auto m0) { - static_for<0, NRepeat, 1>{}([&](auto n0) { - static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) { - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()(Number{}) = 0; - }); - static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, KPack, 1>{}([&](auto ik) { - a_thread_vec.template AsType()(ik) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(ik) = - b_thread_buf[Number{}]; - }); - - using mfma_input_type = - typename vector_type::type; - - xdlops_gemm.template Run<>( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})); - }); - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); - constexpr index_t cscale_offset = - CScaleThreadDesc{}.CalculateOffset( - make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat)); - - c_thread_buf(Number{}) += - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()[Number{}] * - type_convert( - c_scale_thread_buf[Number{}]); - }); - }); - }); - }); - - static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, num_scale_n_block, 1>{}([&](auto n0) { - static_for<0, num_scale_k_block, 1>{}([&](auto k0) { - constexpr index_t c_offset = - CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0)); - constexpr index_t a_offset = - AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0)); - constexpr index_t b_offset = - BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0)); - - c_scale_thread_buf(Number{}) = - a_scale_thread_buf[Number{}] * - b_scale_thread_buf[Number{}]; - }); - }); - }); - block_sync_lds(); static_for<0, KRepeat, 1>{}([&](auto k) { static_for<0, MRepeat, 1>{}([&](auto m0) { @@ -598,70 +289,19 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{})); - }); - - if constexpr(NumKBlockPerScale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{})); - } - - b_scale_thread_copy.Run(b_scale_grid_desc, - b_scale_grid_buf, - b_scale_thread_desc, - make_tuple(I0, I0), - b_scale_thread_buf); - - b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step); - HotLoopScheduler(); - __builtin_amdgcn_sched_barrier(0); - i += 1; - } while(i < (num_loop - 2)); - } - - // tail - if constexpr(TailNum == TailNumber::Full) - { - block_sync_lds(); - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); - b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - - static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, NRepeat, 1>{}([&](auto n0) { - static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) { - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()(Number{}) = 0; - }); - static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + c_thread_buf_per_scale.Clear(); + static_for<0, KRepeat, 1>{}([&](auto k0) { vector_type a_thread_vec; vector_type b_thread_vec; static_for<0, KPack, 1>{}([&](auto ik) { a_thread_vec.template AsType()(ik) = a_thread_buf[Number{}]; + make_tuple(m0, I0, k0, ik))>{}]; b_thread_vec.template AsType()(ik) = b_thread_buf[Number{}]; + make_tuple(n0, I0, k0, ik))>{}]; }); using mfma_input_type = @@ -671,41 +311,46 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale( a_thread_vec.template AsType(), b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})); + c_thread_buf_per_scale.GetVectorTypeReference(I0)); }); static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); - constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset( - make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat)); - c_thread_buf(Number{}) += - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()[Number{}] * - type_convert( - c_scale_thread_buf[Number{}]); + c_thread_buf_per_scale[Number{}] * + type_convert(a_scale_thread_buf[I0]) * + type_convert(b_scale_thread_buf[I0]); }); }); }); - }); - static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, num_scale_n_block, 1>{}([&](auto n0) { - static_for<0, num_scale_k_block, 1>{}([&](auto k0) { - constexpr index_t c_offset = - CScaleThreadDesc{}.CalculateOffset(make_tuple(k0, m0, n0)); - constexpr index_t a_offset = - AScaleThreadDesc{}.CalculateOffset(make_tuple(m0, k0)); - constexpr index_t b_offset = - BScaleThreadDesc{}.CalculateOffset(make_tuple(n0, k0)); + a_scale_thread_copy.Run(a_scale_grid_desc, + a_scale_grid_buf, + a_scale_thread_desc, + make_tuple(I0, I0), + a_scale_thread_buf); - c_scale_thread_buf(Number{}) = - a_scale_thread_buf[Number{}] * - b_scale_thread_buf[Number{}]; - }); - }); - }); + b_scale_thread_copy.Run(b_scale_grid_desc, + b_scale_grid_buf, + b_scale_thread_desc, + make_tuple(I0, I0), + b_scale_thread_buf); + a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, a_scale_thread_copy_step); + b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step); + + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + i += 1; + + } while(i < (num_loop - 1)); + } + + // tail + if constexpr(TailNum == TailNumber::Full) + { block_sync_lds(); static_for<0, KRepeat, 1>{}([&](auto k) { static_for<0, MRepeat, 1>{}([&](auto m0) { @@ -726,143 +371,49 @@ struct BlockwiseGemmXdlops_pipeline_v1_ab_scale{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { - static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) { - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()(Number{}) = 0; + c_thread_buf_per_scale.Clear(); + static_for<0, KRepeat, 1>{}([&](auto k0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto ik) { + a_thread_vec.template AsType()(ik) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(ik) = + b_thread_buf[Number{}]; }); - static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) { - vector_type a_thread_vec; - vector_type b_thread_vec; - static_for<0, KPack, 1>{}([&](auto ik) { - a_thread_vec.template AsType()(ik) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(ik) = - b_thread_buf[Number{}]; - }); + using mfma_input_type = + typename vector_type::type; - using mfma_input_type = - typename vector_type::type; - - xdlops_gemm.template Run<>( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})); - }); - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); - constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset( - make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat)); - - c_thread_buf(Number{}) += - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()[Number{}] * - type_convert( - c_scale_thread_buf[Number{}]); - }); + xdlops_gemm.template Run<>( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf_per_scale.GetVectorTypeReference(I0)); + }); + static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); + c_thread_buf(Number{}) += + c_thread_buf_per_scale[Number{}] * + type_convert(a_scale_thread_buf[I0]) * + type_convert(b_scale_thread_buf[I0]); }); }); }); - __builtin_amdgcn_sched_barrier(0); - } - else if constexpr(TailNum == TailNumber::Odd) - { - static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, NRepeat, 1>{}([&](auto n0) { - static_for<0, num_scale_k_block, 1>{}([&](auto kscale0) { - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()(Number{}) = 0; - }); - static_for<0, KRepeat / num_scale_k_block, 1>{}([&](auto k0) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, KPack, 1>{}([&](auto ik) { - a_thread_vec.template AsType()(ik) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(ik) = - b_thread_buf[Number{}]; - }); - - using mfma_input_type = - typename vector_type::type; - - xdlops_gemm.template Run<>( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})); - }); - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); - constexpr index_t cscale_offset = CScaleThreadDesc{}.CalculateOffset( - make_tuple(kscale0, m0, n0 * num_scale_n_block / NRepeat)); - - c_thread_buf(Number{}) += - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()[Number{}] * - type_convert( - c_scale_thread_buf[Number{}]); - }); - }); - }); - }); - __builtin_amdgcn_sched_barrier(0); } } protected: + using Base::a_thread_copy_; using Base::a_thread_desc_; + using Base::b_thread_copy_; using Base::b_thread_desc_; using Base::c_thread_desc_; - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3>, - 3, - A_K1, - A_K1>; - - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3>, - 3, - B_K1, - B_K1>; - - AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; - BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; }; } // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp index c8ad9c5b02..40fa776484 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v2_ab_scale.hpp @@ -96,8 +96,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale + KPack> { using Base = BlockwiseGemmXdlops_pipeline_base; + KPack>; using Base::I0; using Base::KRepeat; using Base::xdlops_gemm; @@ -272,26 +270,11 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<0>{})); - }); - - if(num_loop_per_scale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<1>{})); - } + a_scale_thread_copy.Run(a_scale_grid_desc, + a_scale_grid_buf, + a_scale_thread_desc, + make_tuple(I0, I0), + a_scale_thread_buf); b_scale_thread_copy.Run(b_scale_grid_desc, b_scale_grid_buf, @@ -299,6 +282,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale{}) += c_thread_buf_per_scale[Number{}] * - type_convert(a_scale_thread_buf[m0]) * + type_convert(a_scale_thread_buf[I0]) * type_convert(b_scale_thread_buf[I0]); }); }); }); - static_for<0, MRepeat, 1>{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{})); - }); - - if(num_loop_per_scale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{})); - } + a_scale_thread_copy.Run(a_scale_grid_desc, + a_scale_grid_buf, + a_scale_thread_desc, + make_tuple(I0, I0), + a_scale_thread_buf); b_scale_thread_copy.Run(b_scale_grid_desc, b_scale_grid_buf, @@ -409,6 +378,8 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale{}) += c_thread_buf_per_scale[Number{}] * - type_convert(a_scale_thread_buf[m0]) * + type_convert(a_scale_thread_buf[I0]) * type_convert(b_scale_thread_buf[I0]); }); }); }); - static_for<0, MRepeat, 1>{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{})); - }); - - if(num_loop_per_scale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{})); - } + a_scale_thread_copy.Run(a_scale_grid_desc, + a_scale_grid_buf, + a_scale_thread_desc, + make_tuple(I0, I0), + a_scale_thread_buf); b_scale_thread_copy.Run(b_scale_grid_desc, b_scale_grid_buf, @@ -515,6 +471,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale{}) += c_thread_buf_per_scale[Number{}] * - type_convert(a_scale_thread_buf[m0]) * + type_convert(a_scale_thread_buf[I0]) * type_convert(b_scale_thread_buf[I0]); }); }); @@ -629,7 +586,7 @@ struct BlockwiseGemmXdlops_pipeline_v2_ab_scale{}) += c_thread_buf_per_scale[Number{}] * - type_convert(a_scale_thread_buf[m0]) * + type_convert(a_scale_thread_buf[I0]) * type_convert(b_scale_thread_buf[I0]); }); }); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp index fc0075b196..de542866a6 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3_ab_scale.hpp @@ -96,8 +96,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale + KPack> { using Base = BlockwiseGemmXdlops_pipeline_base; + KPack>; using Base::I0; using Base::KRepeat; using Base::xdlops_gemm; @@ -179,11 +177,11 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale{}) == 1, - "Pipeline v3 only support scaleblocksliceK=1"); - static_assert(CScaleThreadDesc{}.GetLength(Number<2>{}) == 1, - "Pipeline v3 only support scaleblocksliceN=1"); // assume kperblock = scaleblockk + ignore = num_loop_per_scale; auto a_thread_buf = make_static_buffer( a_thread_desc_.GetElementSpaceSize()); auto b_thread_buf = make_static_buffer( @@ -337,8 +330,6 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale( b_scale_thread_desc.GetElementSpaceSize()); - auto c_scale_thread_buf = make_static_buffer( - c_scale_thread_desc.GetElementSpaceSize()); // Global prefetch 1 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); @@ -347,26 +338,11 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<0>{})); - }); - - if constexpr(NumKBlockPerScale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<1>{})); - } + a_scale_thread_copy.Run(a_scale_grid_desc, + a_scale_grid_buf, + a_scale_thread_desc, + make_tuple(I0, I0), + a_scale_thread_buf); b_scale_thread_copy.Run(b_scale_grid_desc, b_scale_grid_buf, @@ -374,12 +350,8 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale{}([&](auto m0) { - c_scale_thread_buf(m0) = a_scale_thread_buf[m0] * b_scale_thread_buf[I0]; - }); - // Local prefill 1 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); @@ -391,44 +363,10 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<0>{})); - }); - - if constexpr(NumKBlockPerScale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc, - a_scale_thread_copy_step.At(Number<1>{})); - } - - b_scale_thread_copy.Run(b_scale_grid_desc, - b_scale_grid_buf, - b_scale_thread_desc, - make_tuple(I0, I0), - b_scale_thread_buf); - - b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc, b_scale_thread_copy_step); - // Initialize C c_thread_buf.Clear(); - StaticBufferTupleOfVector - c_thread_buf_per_scale; + auto c_thread_buf_per_scale = remove_cvref_t(); // Local prefetch 1 block_sync_lds(); @@ -471,10 +409,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()(Number{}) = 0; - }); + c_thread_buf_per_scale.Clear(); static_for<0, KRepeat, 1>{}([&](auto k0) { vector_type a_thread_vec; vector_type b_thread_vec; @@ -495,23 +430,19 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale( a_thread_vec.template AsType(), b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})); + c_thread_buf_per_scale.GetVectorTypeReference(I0)); }); static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); c_thread_buf(Number{}) += - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()[Number{}] * - type_convert(c_scale_thread_buf[m0]); + c_thread_buf_per_scale[Number{}] * + type_convert(a_scale_thread_buf[I0]) * + type_convert(b_scale_thread_buf[I0]); }); }); }); - static_for<0, MRepeat, 1>{}([&](auto m0) { - c_scale_thread_buf(m0) = a_scale_thread_buf[m0] * b_scale_thread_buf[I0]; - }); - block_sync_lds(); static_for<0, KRepeat, 1>{}([&](auto k) { static_for<0, MRepeat, 1>{}([&](auto m0) { @@ -531,27 +462,11 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale{}([&](auto m0) { - a_scale_thread_copy.Run(a_scale_grid_desc, - a_scale_grid_buf, - a_scale_thread_desc, - make_tuple(m0, I0), - a_scale_thread_buf); - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<0>{})); - }); - - if constexpr(NumKBlockPerScale == 1) - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<2>{})); - } - else - { - a_scale_thread_copy.MoveSrcSliceWindow( - a_scale_grid_desc, a_scale_thread_copy_step.At(Number<1>{})); - } + a_scale_thread_copy.Run(a_scale_grid_desc, + a_scale_grid_buf, + a_scale_thread_desc, + make_tuple(I0, I0), + a_scale_thread_buf); b_scale_thread_copy.Run(b_scale_grid_desc, b_scale_grid_buf, @@ -559,6 +474,7 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale{}([&](auto m0) { static_for<0, NRepeat, 1>{}([&](auto n0) { - static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()(Number{}) = 0; - }); + c_thread_buf_per_scale.Clear(); static_for<0, KRepeat, 1>{}([&](auto k0) { vector_type a_thread_vec; vector_type b_thread_vec; @@ -594,15 +507,15 @@ struct BlockwiseGemmXdlops_pipeline_v3_ab_scale( a_thread_vec.template AsType(), b_thread_vec.template AsType(), - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{})); + c_thread_buf_per_scale.GetVectorTypeReference(I0)); }); static_for<0, xdlops_gemm.GetRegSizePerXdlops(), 1>{}([&](auto t) { constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, t)); c_thread_buf(Number{}) += - c_thread_buf_per_scale.GetVectorTypeReference(Number<0>{}) - .template AsType()[Number{}] * - type_convert(c_scale_thread_buf[m0]); + c_thread_buf_per_scale[Number{}] * + type_convert(a_scale_thread_buf[I0]) * + type_convert(b_scale_thread_buf[I0]); }); }); }); diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp index 3c79b92ec8..ef0b5286ac 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp @@ -125,6 +125,7 @@ struct DeviceGemmMultipleDSplitKBPreShuffle : public BaseOperator { static constexpr index_t NumDTensor = DsDataType::Size(); +#ifndef CK_CODE_GEN_RTC virtual std::unique_ptr MakeArgumentPointer(const void* p_a, const void* p_b, @@ -145,6 +146,7 @@ struct DeviceGemmMultipleDSplitKBPreShuffle : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; virtual int GetPreShuffleParameters() = 0; +#endif }; } // namespace device diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp index b251fb97b9..78d8aa997e 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_v2.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -114,40 +114,6 @@ struct DeviceGemmV2BScale : public BaseOperator virtual ck::index_t GetKPerBlock() = 0; }; -template -struct DeviceGemmV2BPreshuffle : public BaseOperator -{ - virtual std::unique_ptr - MakeArgumentPointer(const void* p_a, - const void* p_b, - void* p_c, - ck::index_t M, - ck::index_t N, - ck::index_t K, - ck::index_t StrideA, - ck::index_t StrideB, - ck::index_t StrideC, - ck::index_t KSplit, - AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) = 0; - - virtual std::unique_ptr MakeInvokerPointer() = 0; - - virtual bool GetPermuteA() = 0; - virtual bool GetPermuteB() = 0; - virtual ck::index_t GetKPerBlock() = 0; - virtual int GetPreShuffleParameters() = 0; -}; - } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp index b4ab96d397..e846b0630b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp @@ -614,7 +614,6 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle return true; } -#ifndef __HIPCC_RTC__ static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_) { @@ -705,6 +704,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle return true; } +#ifndef __HIPCC_RTC__ static bool IsSupportedArgument(const Argument& arg) { if(!ck::is_xdl_supported()) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp index d5fec7201a..480402b7e1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp @@ -15,7 +15,6 @@ #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" -#include "ck/host_utility/flush_cache.hpp" namespace ck { namespace tensor_operation { @@ -178,57 +177,14 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split); const auto Run = [&](const auto& kernel) { - if(stream_config.flush_cache) - { - Argument arg_ = arg; + if(arg.KBatch > 1) + hipGetErrorString(hipMemsetAsync(arg.p_c_grid, + 0, + arg.M * arg.N * sizeof(CDataType), + stream_config.stream_id_)); - const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1( - arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0); - const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1( - arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0); - - auto size_a_buffer = - a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType); - auto size_b_buffer = - b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType); - - ck::utility::RotatingMemWrapper rotating_mem( - arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck::utility::flush_icache(); - // rotating mem - rotating_mem.Next(); - // clear c mem - if(arg_.KBatch > 1) - hipGetErrorString(hipMemsetAsync(arg_.p_c_grid, - 0, - arg_.M * arg_.N * sizeof(CDataType), - stream_config.stream_id_)); - }; - - ave_time = ck::utility::launch_and_time_kernel_with_preprocess( - stream_config, - run_flush_cache, - kernel, - dim3(gdx, gdy, gdz), - dim3(BlockSize), - 0, - arg_); - } - else - { - if(arg.KBatch > 1) - hipGetErrorString(hipMemsetAsync(arg.p_c_grid, - 0, - arg.M * arg.N * sizeof(CDataType), - stream_config.stream_id_)); - - ave_time = launch_and_time_kernel( - stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg); - } + ave_time = launch_and_time_kernel( + stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg); }; constexpr index_t minimum_occupancy = @@ -239,7 +195,7 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 if(has_main_k_block_loop) { - // Tail number always full + // Tail number always 1 if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 || BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) { @@ -252,13 +208,127 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 Run(kernel); } } + // Tail number could be One to Seven + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2) + { + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Full) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Three) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Four) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Five) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + + if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == + TailNumber::Seven) + { + const auto kernel = + kernel_gemm_xdl_cshuffle_v3; + Run(kernel); + } + } + } + } } else { // Tail number always 1 if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Full) { const auto kernel = kernel_gemm_xdl_cshuffle_v3; Run(kernel); } - else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) - { - const auto kernel = - kernel_gemm_xdl_cshuffle_v3; - Run(kernel); - } } } return ave_time; @@ -303,11 +363,10 @@ struct DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 return false; } - // if(ScaleBlockM % MPerBlock != 0 || ScaleBlockN % NPerBlock != 0 || ScaleBlockK != - // KPerBlock) - // { - // return false; - // } + if(ScaleBlockM % MPerBlock != 0 || ScaleBlockN % NPerBlock != 0 || ScaleBlockK != KPerBlock) + { + return false; + } if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding || GemmSpec == GemmSpecialization::NKPadding || diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 037a026eca..f1d0f9844d 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -79,32 +79,6 @@ __device__ inline half4_t i4_to_half4_scale(int q, const ck::half2_t& scale) return res.template AsType()[Number<0>{}]; } -__device__ inline f8x4_t i4_to_f8x4(int q) -{ - const int LO = 0x000f000f; - const int HI = 0x00f000f0; - - int lo = amd_assembly_and_b32(q, LO); - int hi = amd_assembly_and_b32(q, HI); - - float f32_0 = amd_assemble_cvt_f32_i4(lo); - float f32_1 = amd_assemble_cvt_f32_i4(lo >> 16); - float f32_2 = amd_assemble_cvt_f32_i4(hi); - float f32_3 = amd_assemble_cvt_f32_i4(hi >> 16); - - // vector_type res; - // res.template AsType()(Number<0>{}) = amd_assemble_cvt_f8_f32(f32_1st, f32_2nd, f32_3rd, f32_4th); - return amd_assembly_cvt_f8_to_f32(f32_0, f32_1, f32_2, f32_3); -} - -__device__ inline f8x8_t i4_to_fp8x8(int q) -{ - // f8x8_t res; - // amd_assembly_i4_to_fp8x8(res, q); - // return res; - return amd_assembly_i4_to_fp8x8(q); -} - __device__ inline bhalf4_t i4_to_bhalf4(int q) { uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12); @@ -168,61 +142,6 @@ struct PassThroughPack8 #endif } - __host__ __device__ constexpr void operator()(ck::f8x8_t& y, const ck::pk_i4x4_t& x) const - { -#if CK_USE_PK4_LAYOUT_SHUFFLE - y = i4_to_fp8x8(bit_cast(x)); - - // vector_type result; - - // result.template AsType()(Number<0>{}) = i4_to_f8x4(bit_cast(x)); - // result.template AsType()(Number<1>{}) = i4_to_f8x4(bit_cast(x) >> 8); - - // y = result.template AsType()[Number<0>{}]; -#else - // Added pk_i4_t to f8x2_fnuz_t conversion - vector_type dst; - vector_type dst_tmp; - vector_type src{x}; - - // pk_i4_t to float2_t conversion - dst_tmp.template AsType()(Number<0>{}) = - type_convert(src.template AsType()[Number<0>{}]); - - dst_tmp.template AsType()(Number<1>{}) = - type_convert(src.template AsType()[Number<1>{}]); - - dst_tmp.template AsType()(Number<2>{}) = - type_convert(src.template AsType()[Number<2>{}]); - - dst_tmp.template AsType()(Number<3>{}) = - type_convert(src.template AsType()[Number<3>{}]); - - // float to f8_t conversion - dst.template AsType()(Number<0>{}) = - type_convert(dst_tmp.template AsType()[Number<0>{}]); - dst.template AsType()(Number<1>{}) = - type_convert(dst_tmp.template AsType()[Number<1>{}]); - - dst.template AsType()(Number<2>{}) = - type_convert(dst_tmp.template AsType()[Number<2>{}]); - dst.template AsType()(Number<3>{}) = - type_convert(dst_tmp.template AsType()[Number<3>{}]); - - dst.template AsType()(Number<4>{}) = - type_convert(dst_tmp.template AsType()[Number<4>{}]); - dst.template AsType()(Number<5>{}) = - type_convert(dst_tmp.template AsType()[Number<5>{}]); - - dst.template AsType()(Number<6>{}) = - type_convert(dst_tmp.template AsType()[Number<6>{}]); - dst.template AsType()(Number<7>{}) = - type_convert(dst_tmp.template AsType()[Number<7>{}]); - - y = dst.template AsType()[Number<0>{}]; -#endif - } - __host__ __device__ constexpr void operator()(ck::bhalf8_t& y, const ck::pk_i4x4_t& x) const { #if CK_USE_PK4_LAYOUT_SHUFFLE diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 3b227af9c1..23cb15bb4c 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -224,13 +224,6 @@ struct ThreadwiseTensorSliceTransfer_v2 using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - static constexpr index_t PackedSize = []() { - if constexpr(is_same_v, pk_i4_t>) - return 2; - else - return 1; - }(); - __device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc, const Index& src_slice_origin_idx) : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx)) @@ -239,11 +232,6 @@ struct ThreadwiseTensorSliceTransfer_v2 "wrong! SrcDesc need to known at compile-time"); static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, "wrong! Not divisible"); - - if constexpr(is_same_v, pk_i4_t>) - { - static_assert(SrcScalarPerVector % PackedSize == 0, "pk data N cannot be 1"); - } } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -288,10 +276,10 @@ struct ThreadwiseTensorSliceTransfer_v2 constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); static_for<0, num_access, 1>{}([&](auto idx_1d) { - typename vector_type_maker::type src_vector; + typename vector_type_maker::type src_vector; using src_vector_t = - typename vector_type_maker::type::type; + typename vector_type_maker::type::type; constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d); const bool is_src_valid = @@ -299,10 +287,10 @@ struct ThreadwiseTensorSliceTransfer_v2 // copy data from src_buf into src_vector src_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_coord_.GetOffset() / PackedSize, is_src_valid); + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); // copy data from src_vector into dst_buf - static_for<0, SrcScalarPerVector / PackedSize, 1>{}([&](auto i) { + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { constexpr index_t dst_offset = dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx + i * src_scalar_step_in_vector); @@ -1477,13 +1465,6 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic using Index = MultiIndex; - static constexpr index_t PackedSize = []() { - if constexpr(is_same_v, pk_i4_t>) - return 2; - else - return 1; - }(); - __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic( const ElementwiseOperation& element_op) : element_op_{element_op} @@ -1504,7 +1485,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic const SrcBuffer& src_buf, const DstDesc&, const DstSliceOriginIdx&, - DstBuffer& dst_buf) const + DstBuffer& dst_buf) { static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! Desc need to known at compile-time"); @@ -1538,49 +1519,6 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); - if constexpr(is_same, pk_i4_t>::value) - { - static_for<0, num_access, 1>{}([&](auto idx_1d) { - typename vector_type_maker::type src_tmp_vector; - - constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d); - - // copy data from src_buf into dst_vector - static_for<0, DstScalarPerVector / PackedSize, 1>{}([&](auto i) { - constexpr index_t src_offset = src_desc.CalculateOffset( - src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); - - src_tmp_vector.template AsType()(i) = src_buf[Number{}]; - }); - - // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to - // DstData) - vector_type_maker_t dst_tmp_vector; - - constexpr index_t pack_size = 8; - - static_assert(DstScalarPerVector % pack_size == 0, ""); - - using src_v_t = typename vector_type_maker_t::type; - using dst_v_t = typename vector_type_maker_t::type; - - static_for<0, DstScalarPerVector / pack_size, 1>{}([&](auto i) { - ck::tensor_operation::element_wise::PassThroughPack8{}( - dst_tmp_vector.template AsType()(i), - src_tmp_vector.template AsType()[i]); - }); - - // copy data from dst_tmp_vector into dst_buf - static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - constexpr index_t dst_offset = dst_desc.CalculateOffset( - dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); - - dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; - }); - }); - } - else - { static_for<0, num_access, 1>{}([&](auto idx_1d) { constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d); @@ -1602,7 +1540,6 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic }); }); } - } ElementwiseOperation element_op_; }; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp index 30e820b45c..ea074144b6 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp @@ -47,9 +47,6 @@ template {}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; static constexpr auto SrcScalarPerVector = SrcScalarPerVectors{}[I0]; @@ -123,7 +120,6 @@ struct ThreadwiseTensorSliceTransfer_v7r3 { static_for<0, nDst, 1>{}([&](auto i) { dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]); - // printf("tid %d origin %d %d %d %d off %d\n", threadIdx.x, dst_slice_origin_idxs[i][I0], dst_slice_origin_idxs[i][I1], dst_slice_origin_idxs[i][I2], dst_slice_origin_idxs[i][I3], dst_coords_(i).GetOffset()); }); } @@ -423,14 +419,6 @@ struct ThreadwiseTensorSliceTransfer_v7r3 dst_coords_[i].GetOffset(), is_dst_valid, dst_vectors[i].template AsType()[I0]); - // if(1) { - // static_for<0, DstScalarPerVector, 1>{}([&](auto idx) { - // using DstData = remove_cvref_t>; - // using print_vec_t = typename vector_type::type; - // printf("tid %d off %d valid %d %f\n",threadIdx.x, dst_coords_[i].GetOffset(), is_dst_valid, - // type_convert(dst_vectors[i].template AsType()[idx])); - // }); - // } }); // move coordinate