BF16 GEMM Stream-K (#1541)

* initial

* Cmake file

* successfull compilation but validation failed

* Cmake

* update

* gpu validation

* gemm universal

* gemm universal sk update

* sk bf16 universal instance

* gemm_universal_streamk.hpp

* only build for gfx94

* Cmakelist

* profiler update, bf16 sk only works at gfx42

* clang

* clang

* clang all

* no need flags

* cmake script

* delete comment

* gemm universal sk fix

* clang

* profiler fix

* clang

* update

* update

* delete comment

* code formatting

* cmake

* fix instance

* clang

* argument supported

* argument supported and clang

* update

* fix

* removing unnecessary comments

* clang formatting

* Update library/src/tensor_operation_instance/gpu/CMakeLists.txt

Co-authored-by: afagaj <john.afaganis@gmail.com>

* CopyRight Comment 2025

* clang reformatting

* copy right 2025

---------

Co-authored-by: Emin Ozturk <ozturk.27@osu.edu>
Co-authored-by: root <root@ctr-ubbsmc16.amd.com>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t004-008.hpcfund>
Co-authored-by: root <root@splinter-126-wr-d3.amd.com>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t006-001.hpcfund>
Co-authored-by: Muhammed Emin Ozturk <meozturk@login1.hpcfund>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t004-004.hpcfund>
Co-authored-by: Emin Ozturk <emin.ozturk@utah.edu>
Co-authored-by: Muhammed Emin Ozturk <meozturk@t008-001.hpcfund>
Co-authored-by: afagaj <john.afaganis@gmail.com>
This commit is contained in:
Muhammed Emin Ozturk
2025-01-02 10:30:04 -08:00
committed by GitHub
parent 1d8e4ec2ce
commit 9e95d54cd2
51 changed files with 2101 additions and 10 deletions

3
example/01_gemm/CMakeLists.txt Normal file → Executable file
View File

@@ -35,6 +35,9 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3)
add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3)
add_example_executable(example_gemm_xdl_bf16_streamk_v3 gemm_xdl_bf16_streamk_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_streamk_v3)
add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)

0
example/01_gemm/gemm_xdl_bf16.cpp Normal file → Executable file
View File

View File

@@ -0,0 +1,59 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
using ADataType = ck::bhalf_t;
using BDataType = ck::bhalf_t;
using CDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = ck::bhalf_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
256,
128, 128,
64, 8, 8,
16, 16,
4, 4,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
1, 2, S<1, 32, 1, 8>, 8,
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
#include "run_gemm_example_streamk_v2.inc"
int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }

1
example/01_gemm/gemm_xdl_streamk.cpp Normal file → Executable file
View File

@@ -15,7 +15,6 @@ using F16 = ck::half_t;
using ALayout = Row;
using BLayout = Row;
// using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;

0
example/01_gemm/run_gemm_example_streamk_v2.inc Executable file → Normal file
View File