From db557ada8b1c9c25e1fb7a9f074a97151dddebd3 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Sat, 14 Sep 2024 06:08:40 -0700 Subject: [PATCH] Ck tile GPU verification sample develop & Add the CK TILE GEMM to the CI/CD test (#1505) * Finished the feature of gpu verification * Add the ck_tile_gemm test in the CI CD * add the include of tensor_layou in reference_gemm * Comment Addressed * split ck_tile fhma and gemm tests into separate stages * restructure the reference gemm * restructure a new reference_gemm api that could read the device mem --------- Co-authored-by: carlushuang Co-authored-by: illsilin [ROCm/composable_kernel commit: 844f5a17123a80d7d640d86486336a8ea6c8c769] --- Jenkinsfile | 66 ++++++- example/ck_tile/03_gemm/gemm_basic.cpp | 168 +++++++++++++----- example/ck_tile/03_gemm/gemm_basic.hpp | 1 + .../ck_tile/03_gemm/script/run_full_test.sh | 25 +++ example/ck_tile/03_gemm/script/smoke_test.sh | 35 ++++ .../ck_tile/host/reference/reference_gemm.hpp | 118 ++++++++++++ .../ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 5 +- .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 10 +- 8 files changed, 368 insertions(+), 60 deletions(-) create mode 100755 example/ck_tile/03_gemm/script/run_full_test.sh create mode 100755 example/ck_tile/03_gemm/script/smoke_test.sh diff --git a/Jenkinsfile b/Jenkinsfile index 40d9e86874..c6f8d9ba3f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -703,7 +703,7 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=;RUN_CK_TILE_GEMM_TESTS=true 0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true @@ -775,9 +775,13 @@ pipeline { defaultValue: false, description: "Run the grouped conv large cases tests (default: OFF)") booleanParam( - name: "RUN_CK_TILE_TESTS", + name: "RUN_CK_TILE_FMHA_TESTS", defaultValue: false, - description: "Run the ck_tile tests (default: OFF)") + description: "Run the ck_tile FMHA tests (default: OFF)") + booleanParam( + name: "RUN_CK_TILE_GEMM_TESTS", + defaultValue: false, + description: "Run the ck_tile GEMM tests (default: OFF)") booleanParam( name: "BUILD_INSTANCES_ONLY", defaultValue: false, @@ -894,15 +898,15 @@ pipeline { } } } - stage("Run CK_TILE Tests") + stage("Run CK_TILE_FMHA Tests") { parallel { - stage("Run CK_TILE Tests on gfx90a") + stage("Run CK_TILE_FMHA Tests on gfx90a") { when { beforeAgent true - expression { params.RUN_CK_TILE_TESTS.toBoolean() } + expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() } } agent{ label rocmnode("gfx90a") } environment{ @@ -917,11 +921,11 @@ pipeline { cleanWs() } } - stage("Run CK_TILE Tests on gfx942") + stage("Run CK_TILE_FMHA Tests on gfx942") { when { beforeAgent true - expression { params.RUN_CK_TILE_TESTS.toBoolean() } + expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() } } agent{ label rocmnode("gfx942") } environment{ @@ -937,6 +941,52 @@ pipeline { } } } + } + stage("Run CK_TILE_GEMM Tests") + { + parallel + { + + stage("Run CK_TILE_GEMM Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ + make -j64 tile_example_gemm_basic && \ + cd ../ && + example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + + } + stage("Run CK_TILE_GEMM Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ + make -j64 tile_example_gemm_basic && \ + cd ../ && + example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } } stage("Build CK and run Tests") { diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index 734ba0fe65..d0b61612a0 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -3,7 +3,7 @@ // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. #include "gemm_basic.hpp" -#include "ck_tile/host.hpp" +#include #include #include @@ -21,7 +21,7 @@ auto create_args(int argc, char* argv[]) .insert("stride_a", "0", "Tensor A stride") .insert("stride_b", "0", "Tensor B stride") .insert("stride_c", "0", "Tensor C stride") - .insert("v", "1", "cpu validation or not") + .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") .insert("e", "1e-5", "Absolute error tolerance") .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8") .insert("warmup", "10", "number of iterations before benchmark the kernel") @@ -32,41 +32,22 @@ auto create_args(int argc, char* argv[]) return std::make_tuple(result, arg_parser); } -template +template float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) { - // ToDo: This will be modified by the codegen code later. - constexpr ck_tile::index_t M_Tile = 128; - constexpr ck_tile::index_t N_Tile = 128; - constexpr ck_tile::index_t K_Tile = 32; - - constexpr ck_tile::index_t M_Warp = 2; - constexpr ck_tile::index_t N_Warp = 2; - constexpr ck_tile::index_t K_Warp = 1; - - constexpr ck_tile::index_t M_Warp_Tile = 32; - constexpr ck_tile::index_t N_Warp_Tile = 32; - constexpr ck_tile::index_t K_Warp_Tile = 8; - // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. constexpr bool kPadA = true; constexpr bool kPadB = true; - constexpr bool kPadC = false; constexpr int kBlockPerCu = 1; - // =============================================== - - using GemmShape = - ck_tile::TileGemmShape, - ck_tile::sequence, - ck_tile::sequence>; using TilePartitioner = ck_tile::GemmTilePartitioner; - using PipelineProblem = ck_tile:: - BlockGemmPipelineProblem; - // The GemmPipeline should also come from the Codegen. - using GemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1; - using GemmEpilogue = ck_tile::Default2DEpilogue< + using GemmEpilogue = ck_tile::Default2DEpilogue< ck_tile::Default2DEpilogueProblem>; // ToDo: Will add the codegen part to test different pipeline policies in GEMM. // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy. @@ -93,7 +74,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s) return ave_time; } -template +template float invoke_gemm(ck_tile::DeviceMem& a_buf, ck_tile::DeviceMem& b_buf, ck_tile::DeviceMem& c_buf, @@ -155,7 +142,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf, else { args.stride_B = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { return N; } @@ -184,8 +171,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf, }(); } - float ave_time = - gemm_calc(args, ck_tile::stream_config{nullptr, true}); + float ave_time = gemm_calc( + args, ck_tile::stream_config{nullptr, true}); std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N; float gb_per_sec = num_byte / 1.E6 / ave_time; @@ -212,7 +199,7 @@ int main(int argc, char* argv[]) // The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N). using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor; - using matrix_b_layout = ck_tile::tensor_layout::gemm::RowMajor; + using matrix_b_layout = ck_tile::tensor_layout::gemm::ColumnMajor; using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor; // host verify @@ -221,7 +208,7 @@ int main(int argc, char* argv[]) ? std::vector{M, K} : std::vector{K, M}; std::vector b_dimensions = - (std::is_same_v) + (std::is_same_v) ? std::vector{N, K} : std::vector{K, N}; std::vector c_dimensions = @@ -245,12 +232,52 @@ int main(int argc, char* argv[]) a_buf.ToDevice(a_host.data()); b_buf.ToDevice(b_host.data()); - invoke_gemm( - a_buf, b_buf, c_buf, arg_parser); + // The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part. + constexpr bool kPadA = true; + constexpr bool kPadB = true; + constexpr bool kPadC = false; - bool pass = true; + // This part comes from the Codegen + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; - if(arg_parser.get_bool("v")) + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 8; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using CodegenPipelineProblem = ck_tile::BlockGemmPipelineProblem; + + using CodegenGemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1; + + invoke_gemm(a_buf, b_buf, c_buf, arg_parser); + + c_buf.FromDevice(c_host_dev.data()); + + bool pass_cpu = true; + + if(arg_parser.get_int("v") == 1) { // ToDo: Will Add the Element Op (bias) verification in the future. ck_tile::reference_gemm(a_host, b_host, c_host_ref); - c_buf.FromDevice(c_host_dev.data()); + pass_cpu = ck_tile::check_err(c_host_dev, c_host_ref); - pass = ck_tile::check_err(c_host_dev, c_host_ref); + std::cout << "The CPU veification result is:" << (pass_cpu ? "correct" : "fail") + << std::flush; + } - std::cout << "The veification result is:" << (pass ? "correct" : "fail") << std::flush; + bool pass_gpu = true; + + if(arg_parser.get_int("v") == 2) + { + ck_tile::index_t stride_a = arg_parser.get_int("stride_a"); + ck_tile::index_t stride_b = arg_parser.get_int("stride_b"); + ck_tile::index_t stride_c = arg_parser.get_int("stride_c"); + + if(stride_a == 0) + { + if constexpr(std::is_same_v) + { + stride_a = M; + } + else + { + stride_a = K; + } + } + + if(stride_b == 0) + { + if constexpr(std::is_same_v) + { + stride_b = N; + } + else + { + stride_b = K; + } + } + + if(stride_c == 0) + { + if constexpr(std::is_same_v) + { + stride_c = M; + } + else + { + stride_c = N; + } + } + + ck_tile::HostTensor c_host_gpu_ref(c_dimensions); + ck_tile::DeviceMem c_gpu_buf(c_host_gpu_ref.get_element_space_size_in_bytes()); + + ck_tile::reference_gemm_gpu( + a_buf, b_buf, c_gpu_buf, M, N, K, stride_a, stride_b, stride_c); + + c_buf.FromDevice(c_host_gpu_ref.data()); + + pass_gpu = ck_tile::check_err(c_host_dev, c_host_gpu_ref); + + std::cout << "The GPU veification result is:" << (pass_gpu ? "correct" : "fail") + << std::flush; } std::cout << std::endl << std::flush; - return !pass; + return !pass_gpu; } diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp index 28afb194c9..ce2e0f706d 100644 --- a/example/ck_tile/03_gemm/gemm_basic.hpp +++ b/example/ck_tile/03_gemm/gemm_basic.hpp @@ -8,6 +8,7 @@ #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" +#include "ck_tile/host.hpp" #include template diff --git a/example/ck_tile/03_gemm/script/run_full_test.sh b/example/ck_tile/03_gemm/script/run_full_test.sh new file mode 100755 index 0000000000..2e2e7fdf90 --- /dev/null +++ b/example/ck_tile/03_gemm/script/run_full_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the tile_example_gemm executables in ../build/bin/ +# +# run the script as "./run_full_test.sh +# input arguments: +# environment tag : a string describing the specifics of your test environment +# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# host name : $hostname +# gpu architecture: e.g., gfx90a, or gfx942, etc. + +# get the command line arguments: +export env_type=$1 +echo 'Environment type: ' $env_type +export branch=$2 +echo 'Branch name: ' $branch +export host_name=$3 +echo 'Host name: ' $host_name +export GPU_arch=$4 +echo 'GPU_arch: ' $GPU_arch + +# run verification tests +example/ck_tile/03_gemm/script/smoke_test.sh + +# We do not have a performance benchmark for gemm yet. Will add it in the future. \ No newline at end of file diff --git a/example/ck_tile/03_gemm/script/smoke_test.sh b/example/ck_tile/03_gemm/script/smoke_test.sh new file mode 100755 index 0000000000..4d9a64bf40 --- /dev/null +++ b/example/ck_tile/03_gemm/script/smoke_test.sh @@ -0,0 +1,35 @@ +#!/bin/bash +EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)" +KNAME=1 + +export CK_WARMUP=0 +export CK_REPEAT=1 + +COMMON_ARGS='-v=2 -warmup=0 -repeat=1' + +run_fp16_tests() { + for batch in 1 2; do + for m in 128 1024; do + for n in 128 2048; do + for k in 32 64; do + + $EXE -b=$batch -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -e=1e-5 -prec=fp16 $COMMON_ARGS + if [ $? -eq 0 ]; then + echo "Success: Test with batch=$batch, m=$m, n=$n, k=$k executed successfully." + else + echo "Error: Test with batch=$batch, m=$m, n=$n, k=$k failed to execute properly." + # Optionally, exit or break if you need to halt further execution + # exit 1 + fi + + done + done + done + done +} + +set -x + +run_fp16_tests + +set +x \ No newline at end of file diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp index df2d719971..a0ddd02d9e 100644 --- a/include/ck_tile/host/reference/reference_gemm.hpp +++ b/include/ck_tile/host/reference/reference_gemm.hpp @@ -5,6 +5,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host/host_tensor.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" #include namespace ck_tile { @@ -56,4 +57,121 @@ CK_TILE_HOST void reference_gemm(const HostTensor& a_m_k, make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency()); } + +template +__global__ void naive_gemm_kernel(ADataType* A, + BDataType* B, + CDataType* C, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t strideA, + ck_tile::index_t strideB, + ck_tile::index_t strideC) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int row = idx / N; // Compute row index + int col = idx % N; // Compute column index + + if(row < M && col < N) + { + AccDataType acc = 0.0; + + for(int k = 0; k < K; ++k) + { + acc += static_cast(A[row * strideA + k]) * + static_cast(B[col * strideB + k]); + } + + C[row * strideC + col] = acc; // Store as AccDataType + } +} + +template +void reference_gemm_gpu(DeviceMem& a_device, + DeviceMem& b_device, + DeviceMem& c_device, + index_t M, + index_t N, + index_t K, + index_t stride_a, + index_t stride_b, + index_t stride_c) +{ + + ADataType* d_A; + BDataType* d_B; + CDataType* d_C; + + hipError_t errA = hipMalloc(&d_A, M * K * sizeof(ADataType)); + hipError_t errB = hipMalloc(&d_B, N * K * sizeof(BDataType)); + hipError_t errC = hipMalloc(&d_C, M * N * sizeof(CDataType)); + if(errA != hipSuccess) + { + std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA) + << std::endl; + return; // Early exit on error + } + + if(errB != hipSuccess) + { + std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB) + << std::endl; + return; // Early exit on error + } + + if(errC != hipSuccess) + { + std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC) + << std::endl; + return; // Early exit on error + } + + errA = hipMemcpy( + d_A, a_device.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice); + if(errA != hipSuccess) + { + std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl; + } + + errB = hipMemcpy( + d_B, b_device.GetDeviceBuffer(), N * K * sizeof(BDataType), hipMemcpyHostToDevice); + if(errB != hipSuccess) + { + std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl; + } + + int totalElements = M * N; + int numThreadsPerBlock = 256; // Common choice for threads per block + int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock; + + naive_gemm_kernel + <<>>(d_A, d_B, d_C, M, N, K, stride_a, stride_b, stride_c); + errC = hipMemcpy( + c_device.GetDeviceBuffer(), d_C, M * N * sizeof(CDataType), hipMemcpyDeviceToHost); + if(errC != hipSuccess) + { + std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl; + } + + errA = hipFree(d_A); + if(errA != hipSuccess) + { + std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl; + } + + errB = hipFree(d_B); + if(errB != hipSuccess) + { + std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl; + } + + errC = hipFree(d_C); + if(errC != hipSuccess) + { + std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl; + } + + return; +} } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index 01d8f23288..338adfd3cf 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -76,8 +76,7 @@ struct GemmKernel CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const { - const index_t i_m = TilePartitioner::iM; - const index_t i_n = TilePartitioner::iN; + const auto [i_m, i_n] = TilePartitioner{}(); // options const ADataType* a_start = static_cast(kargs.a_ptr); const BDataType* b_start = static_cast(kargs.b_ptr); @@ -104,7 +103,7 @@ struct GemmKernel }(); auto b_tensor_view = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { return make_naive_tensor_view( b_start, diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index 038d09ea35..a49ffc2911 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -15,9 +15,6 @@ struct GemmTilePartitioner static constexpr ck_tile::index_t kN = BlockGemmShape::kN; static constexpr ck_tile::index_t kK = BlockGemmShape::kK; - const index_t iM = __builtin_amdgcn_readfirstlane(i_tile_m * kM); - const index_t iN = __builtin_amdgcn_readfirstlane(i_tile_n * kN); - CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t batch_size) { @@ -29,10 +26,9 @@ struct GemmTilePartitioner CK_TILE_DEVICE auto operator()() { - const index_t i_GridDimX = blockIdx.x; - const index_t i_GridDimY = blockIdx.y; - const index_t i_GridDimZ = blockIdx.z; - return ck_tile::make_tuple(i_GridDimX, i_GridDimY, i_GridDimZ); + const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kM); + const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kN); + return ck_tile::make_tuple(iM, iN); } }; } // namespace ck_tile