mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Ck tile GPU verification sample develop & Add the CK TILE GEMM to the CI/CD test (#1505)
* Finished the feature of gpu verification
* Add the ck_tile_gemm test in the CI CD
* add the include of tensor_layou in reference_gemm
* Comment Addressed
* split ck_tile fhma and gemm tests into separate stages
* restructure the reference gemm
* restructure a new reference_gemm api that could read the device mem
---------
Co-authored-by: carlushuang <carlus.huang@amd.com>
Co-authored-by: illsilin <Illia.Silin@amd.com>
[ROCm/composable_kernel commit: 844f5a1712]
This commit is contained in:
66
Jenkinsfile
vendored
66
Jenkinsfile
vendored
@@ -703,7 +703,7 @@ def process_results(Map conf=[:]){
|
||||
}
|
||||
|
||||
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=;RUN_CK_TILE_GEMM_TESTS=true
|
||||
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
|
||||
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
|
||||
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
|
||||
@@ -775,9 +775,13 @@ pipeline {
|
||||
defaultValue: false,
|
||||
description: "Run the grouped conv large cases tests (default: OFF)")
|
||||
booleanParam(
|
||||
name: "RUN_CK_TILE_TESTS",
|
||||
name: "RUN_CK_TILE_FMHA_TESTS",
|
||||
defaultValue: false,
|
||||
description: "Run the ck_tile tests (default: OFF)")
|
||||
description: "Run the ck_tile FMHA tests (default: OFF)")
|
||||
booleanParam(
|
||||
name: "RUN_CK_TILE_GEMM_TESTS",
|
||||
defaultValue: false,
|
||||
description: "Run the ck_tile GEMM tests (default: OFF)")
|
||||
booleanParam(
|
||||
name: "BUILD_INSTANCES_ONLY",
|
||||
defaultValue: false,
|
||||
@@ -894,15 +898,15 @@ pipeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Run CK_TILE Tests")
|
||||
stage("Run CK_TILE_FMHA Tests")
|
||||
{
|
||||
parallel
|
||||
{
|
||||
stage("Run CK_TILE Tests on gfx90a")
|
||||
stage("Run CK_TILE_FMHA Tests on gfx90a")
|
||||
{
|
||||
when {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_TESTS.toBoolean() }
|
||||
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx90a") }
|
||||
environment{
|
||||
@@ -917,11 +921,11 @@ pipeline {
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Run CK_TILE Tests on gfx942")
|
||||
stage("Run CK_TILE_FMHA Tests on gfx942")
|
||||
{
|
||||
when {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_TESTS.toBoolean() }
|
||||
expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942") }
|
||||
environment{
|
||||
@@ -937,6 +941,52 @@ pipeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Run CK_TILE_GEMM Tests")
|
||||
{
|
||||
parallel
|
||||
{
|
||||
|
||||
stage("Run CK_TILE_GEMM Tests on gfx90a")
|
||||
{
|
||||
when {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx90a") }
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
|
||||
make -j64 tile_example_gemm_basic && \
|
||||
cd ../ &&
|
||||
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
|
||||
cleanWs()
|
||||
}
|
||||
|
||||
}
|
||||
stage("Run CK_TILE_GEMM Tests on gfx942")
|
||||
{
|
||||
when {
|
||||
beforeAgent true
|
||||
expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx942") }
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
|
||||
make -j64 tile_example_gemm_basic && \
|
||||
cd ../ &&
|
||||
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Build CK and run Tests")
|
||||
{
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gemm_basic.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
@@ -21,7 +21,7 @@ auto create_args(int argc, char* argv[])
|
||||
.insert("stride_a", "0", "Tensor A stride")
|
||||
.insert("stride_b", "0", "Tensor B stride")
|
||||
.insert("stride_c", "0", "Tensor C stride")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
|
||||
.insert("e", "1e-5", "Absolute error tolerance")
|
||||
.insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
|
||||
.insert("warmup", "10", "number of iterations before benchmark the kernel")
|
||||
@@ -32,41 +32,22 @@ auto create_args(int argc, char* argv[])
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
template <typename LayoutA, typename LayoutB, typename LayoutC>
|
||||
template <typename LayoutA,
|
||||
typename LayoutB,
|
||||
typename LayoutC,
|
||||
typename PipelineProblem,
|
||||
typename GemmPipeline,
|
||||
typename GemmShape>
|
||||
float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
|
||||
{
|
||||
// ToDo: This will be modified by the codegen code later.
|
||||
constexpr ck_tile::index_t M_Tile = 128;
|
||||
constexpr ck_tile::index_t N_Tile = 128;
|
||||
constexpr ck_tile::index_t K_Tile = 32;
|
||||
|
||||
constexpr ck_tile::index_t M_Warp = 2;
|
||||
constexpr ck_tile::index_t N_Warp = 2;
|
||||
constexpr ck_tile::index_t K_Warp = 1;
|
||||
|
||||
constexpr ck_tile::index_t M_Warp_Tile = 32;
|
||||
constexpr ck_tile::index_t N_Warp_Tile = 32;
|
||||
constexpr ck_tile::index_t K_Warp_Tile = 8;
|
||||
|
||||
// The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
|
||||
constexpr bool kPadA = true;
|
||||
constexpr bool kPadB = true;
|
||||
constexpr bool kPadC = false;
|
||||
|
||||
constexpr int kBlockPerCu = 1;
|
||||
|
||||
// ===============================================
|
||||
|
||||
using GemmShape =
|
||||
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
|
||||
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
|
||||
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
|
||||
using TilePartitioner = ck_tile::GemmTilePartitioner<GemmShape>;
|
||||
using PipelineProblem = ck_tile::
|
||||
BlockGemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, kPadA, kPadB, kPadC>;
|
||||
// The GemmPipeline should also come from the Codegen.
|
||||
using GemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1<PipelineProblem>;
|
||||
using GemmEpilogue = ck_tile::Default2DEpilogue<
|
||||
using GemmEpilogue = ck_tile::Default2DEpilogue<
|
||||
ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadA, kPadB>>;
|
||||
// ToDo: Will add the codegen part to test different pipeline policies in GEMM.
|
||||
// Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
|
||||
@@ -93,7 +74,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
template <typename DataType, typename LayoutA, typename LayoutB, typename LayoutC>
|
||||
template <typename DataType,
|
||||
typename LayoutA,
|
||||
typename LayoutB,
|
||||
typename LayoutC,
|
||||
typename PipelineProblem,
|
||||
typename GemmPipeline,
|
||||
typename GemmShape>
|
||||
float invoke_gemm(ck_tile::DeviceMem& a_buf,
|
||||
ck_tile::DeviceMem& b_buf,
|
||||
ck_tile::DeviceMem& c_buf,
|
||||
@@ -155,7 +142,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
|
||||
else
|
||||
{
|
||||
args.stride_B = [&]() {
|
||||
if constexpr(std::is_same_v<LayoutB, ck_tile::tensor_layout::gemm::ColumnMajor>)
|
||||
if constexpr(std::is_same_v<LayoutB, ck_tile::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return N;
|
||||
}
|
||||
@@ -184,8 +171,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
|
||||
}();
|
||||
}
|
||||
|
||||
float ave_time =
|
||||
gemm_calc<LayoutA, LayoutB, LayoutC>(args, ck_tile::stream_config{nullptr, true});
|
||||
float ave_time = gemm_calc<LayoutA, LayoutB, LayoutC, PipelineProblem, GemmPipeline, GemmShape>(
|
||||
args, ck_tile::stream_config{nullptr, true});
|
||||
std::size_t num_byte =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
@@ -212,7 +199,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
// The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N).
|
||||
using matrix_a_layout = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using matrix_b_layout = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using matrix_b_layout = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
using matrix_c_layout = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
|
||||
// host verify
|
||||
@@ -221,7 +208,7 @@ int main(int argc, char* argv[])
|
||||
? std::vector<int>{M, K}
|
||||
: std::vector<int>{K, M};
|
||||
std::vector<int> b_dimensions =
|
||||
(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>)
|
||||
(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
|
||||
? std::vector<int>{N, K}
|
||||
: std::vector<int>{K, N};
|
||||
std::vector<int> c_dimensions =
|
||||
@@ -245,12 +232,52 @@ int main(int argc, char* argv[])
|
||||
a_buf.ToDevice(a_host.data());
|
||||
b_buf.ToDevice(b_host.data());
|
||||
|
||||
invoke_gemm<ck_tile::half_t, matrix_a_layout, matrix_b_layout, matrix_c_layout>(
|
||||
a_buf, b_buf, c_buf, arg_parser);
|
||||
// The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
|
||||
constexpr bool kPadA = true;
|
||||
constexpr bool kPadB = true;
|
||||
constexpr bool kPadC = false;
|
||||
|
||||
bool pass = true;
|
||||
// This part comes from the Codegen
|
||||
constexpr ck_tile::index_t M_Tile = 128;
|
||||
constexpr ck_tile::index_t N_Tile = 128;
|
||||
constexpr ck_tile::index_t K_Tile = 32;
|
||||
|
||||
if(arg_parser.get_bool("v"))
|
||||
constexpr ck_tile::index_t M_Warp = 2;
|
||||
constexpr ck_tile::index_t N_Warp = 2;
|
||||
constexpr ck_tile::index_t K_Warp = 1;
|
||||
|
||||
constexpr ck_tile::index_t M_Warp_Tile = 32;
|
||||
constexpr ck_tile::index_t N_Warp_Tile = 32;
|
||||
constexpr ck_tile::index_t K_Warp_Tile = 8;
|
||||
|
||||
using CodegenGemmShape =
|
||||
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
|
||||
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
|
||||
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
|
||||
|
||||
using CodegenPipelineProblem = ck_tile::BlockGemmPipelineProblem<ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CodegenGemmShape,
|
||||
kPadA,
|
||||
kPadB,
|
||||
kPadC>;
|
||||
|
||||
using CodegenGemmPipeline = ck_tile::BlockGemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
|
||||
|
||||
invoke_gemm<ck_tile::half_t,
|
||||
matrix_a_layout,
|
||||
matrix_b_layout,
|
||||
matrix_c_layout,
|
||||
CodegenPipelineProblem,
|
||||
CodegenGemmPipeline,
|
||||
CodegenGemmShape>(a_buf, b_buf, c_buf, arg_parser);
|
||||
|
||||
c_buf.FromDevice(c_host_dev.data());
|
||||
|
||||
bool pass_cpu = true;
|
||||
|
||||
if(arg_parser.get_int("v") == 1)
|
||||
{
|
||||
// ToDo: Will Add the Element Op (bias) verification in the future.
|
||||
ck_tile::reference_gemm<ADataType,
|
||||
@@ -261,14 +288,71 @@ int main(int argc, char* argv[])
|
||||
matrix_b_layout,
|
||||
matrix_c_layout>(a_host, b_host, c_host_ref);
|
||||
|
||||
c_buf.FromDevice(c_host_dev.data());
|
||||
pass_cpu = ck_tile::check_err(c_host_dev, c_host_ref);
|
||||
|
||||
pass = ck_tile::check_err(c_host_dev, c_host_ref);
|
||||
std::cout << "The CPU veification result is:" << (pass_cpu ? "correct" : "fail")
|
||||
<< std::flush;
|
||||
}
|
||||
|
||||
std::cout << "The veification result is:" << (pass ? "correct" : "fail") << std::flush;
|
||||
bool pass_gpu = true;
|
||||
|
||||
if(arg_parser.get_int("v") == 2)
|
||||
{
|
||||
ck_tile::index_t stride_a = arg_parser.get_int("stride_a");
|
||||
ck_tile::index_t stride_b = arg_parser.get_int("stride_b");
|
||||
ck_tile::index_t stride_c = arg_parser.get_int("stride_c");
|
||||
|
||||
if(stride_a == 0)
|
||||
{
|
||||
if constexpr(std::is_same_v<matrix_a_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
|
||||
{
|
||||
stride_a = M;
|
||||
}
|
||||
else
|
||||
{
|
||||
stride_a = K;
|
||||
}
|
||||
}
|
||||
|
||||
if(stride_b == 0)
|
||||
{
|
||||
if constexpr(std::is_same_v<matrix_b_layout, ck_tile::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
stride_b = N;
|
||||
}
|
||||
else
|
||||
{
|
||||
stride_b = K;
|
||||
}
|
||||
}
|
||||
|
||||
if(stride_c == 0)
|
||||
{
|
||||
if constexpr(std::is_same_v<matrix_c_layout, ck_tile::tensor_layout::gemm::ColumnMajor>)
|
||||
{
|
||||
stride_c = M;
|
||||
}
|
||||
else
|
||||
{
|
||||
stride_c = N;
|
||||
}
|
||||
}
|
||||
|
||||
ck_tile::HostTensor<CDataType> c_host_gpu_ref(c_dimensions);
|
||||
ck_tile::DeviceMem c_gpu_buf(c_host_gpu_ref.get_element_space_size_in_bytes());
|
||||
|
||||
ck_tile::reference_gemm_gpu<ADataType, BDataType, AccDataType, CDataType>(
|
||||
a_buf, b_buf, c_gpu_buf, M, N, K, stride_a, stride_b, stride_c);
|
||||
|
||||
c_buf.FromDevice(c_host_gpu_ref.data());
|
||||
|
||||
pass_gpu = ck_tile::check_err(c_host_dev, c_host_gpu_ref);
|
||||
|
||||
std::cout << "The GPU veification result is:" << (pass_gpu ? "correct" : "fail")
|
||||
<< std::flush;
|
||||
}
|
||||
|
||||
std::cout << std::endl << std::flush;
|
||||
|
||||
return !pass;
|
||||
return !pass_gpu;
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include <string>
|
||||
|
||||
template <typename DataType>
|
||||
|
||||
25
example/ck_tile/03_gemm/script/run_full_test.sh
Executable file
25
example/ck_tile/03_gemm/script/run_full_test.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd first need to build the tile_example_gemm executables in ../build/bin/
|
||||
#
|
||||
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
|
||||
# input arguments:
|
||||
# environment tag : a string describing the specifics of your test environment
|
||||
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
|
||||
# host name : $hostname
|
||||
# gpu architecture: e.g., gfx90a, or gfx942, etc.
|
||||
|
||||
# get the command line arguments:
|
||||
export env_type=$1
|
||||
echo 'Environment type: ' $env_type
|
||||
export branch=$2
|
||||
echo 'Branch name: ' $branch
|
||||
export host_name=$3
|
||||
echo 'Host name: ' $host_name
|
||||
export GPU_arch=$4
|
||||
echo 'GPU_arch: ' $GPU_arch
|
||||
|
||||
# run verification tests
|
||||
example/ck_tile/03_gemm/script/smoke_test.sh
|
||||
|
||||
# We do not have a performance benchmark for gemm yet. Will add it in the future.
|
||||
35
example/ck_tile/03_gemm/script/smoke_test.sh
Executable file
35
example/ck_tile/03_gemm/script/smoke_test.sh
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/bin/bash
|
||||
EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
|
||||
KNAME=1
|
||||
|
||||
export CK_WARMUP=0
|
||||
export CK_REPEAT=1
|
||||
|
||||
COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
|
||||
|
||||
run_fp16_tests() {
|
||||
for batch in 1 2; do
|
||||
for m in 128 1024; do
|
||||
for n in 128 2048; do
|
||||
for k in 32 64; do
|
||||
|
||||
$EXE -b=$batch -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -e=1e-5 -prec=fp16 $COMMON_ARGS
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "Success: Test with batch=$batch, m=$m, n=$n, k=$k executed successfully."
|
||||
else
|
||||
echo "Error: Test with batch=$batch, m=$m, n=$n, k=$k failed to execute properly."
|
||||
# Optionally, exit or break if you need to halt further execution
|
||||
# exit 1
|
||||
fi
|
||||
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
set -x
|
||||
|
||||
run_fp16_tests
|
||||
|
||||
set +x
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
@@ -56,4 +57,121 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
|
||||
|
||||
make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
|
||||
__global__ void naive_gemm_kernel(ADataType* A,
|
||||
BDataType* B,
|
||||
CDataType* C,
|
||||
ck_tile::index_t M,
|
||||
ck_tile::index_t N,
|
||||
ck_tile::index_t K,
|
||||
ck_tile::index_t strideA,
|
||||
ck_tile::index_t strideB,
|
||||
ck_tile::index_t strideC)
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int row = idx / N; // Compute row index
|
||||
int col = idx % N; // Compute column index
|
||||
|
||||
if(row < M && col < N)
|
||||
{
|
||||
AccDataType acc = 0.0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
acc += static_cast<AccDataType>(A[row * strideA + k]) *
|
||||
static_cast<AccDataType>(B[col * strideB + k]);
|
||||
}
|
||||
|
||||
C[row * strideC + col] = acc; // Store as AccDataType
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
|
||||
void reference_gemm_gpu(DeviceMem& a_device,
|
||||
DeviceMem& b_device,
|
||||
DeviceMem& c_device,
|
||||
index_t M,
|
||||
index_t N,
|
||||
index_t K,
|
||||
index_t stride_a,
|
||||
index_t stride_b,
|
||||
index_t stride_c)
|
||||
{
|
||||
|
||||
ADataType* d_A;
|
||||
BDataType* d_B;
|
||||
CDataType* d_C;
|
||||
|
||||
hipError_t errA = hipMalloc(&d_A, M * K * sizeof(ADataType));
|
||||
hipError_t errB = hipMalloc(&d_B, N * K * sizeof(BDataType));
|
||||
hipError_t errC = hipMalloc(&d_C, M * N * sizeof(CDataType));
|
||||
if(errA != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA)
|
||||
<< std::endl;
|
||||
return; // Early exit on error
|
||||
}
|
||||
|
||||
if(errB != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB)
|
||||
<< std::endl;
|
||||
return; // Early exit on error
|
||||
}
|
||||
|
||||
if(errC != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC)
|
||||
<< std::endl;
|
||||
return; // Early exit on error
|
||||
}
|
||||
|
||||
errA = hipMemcpy(
|
||||
d_A, a_device.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice);
|
||||
if(errA != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl;
|
||||
}
|
||||
|
||||
errB = hipMemcpy(
|
||||
d_B, b_device.GetDeviceBuffer(), N * K * sizeof(BDataType), hipMemcpyHostToDevice);
|
||||
if(errB != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl;
|
||||
}
|
||||
|
||||
int totalElements = M * N;
|
||||
int numThreadsPerBlock = 256; // Common choice for threads per block
|
||||
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
|
||||
|
||||
naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType>
|
||||
<<<numBlocks, numThreadsPerBlock>>>(d_A, d_B, d_C, M, N, K, stride_a, stride_b, stride_c);
|
||||
errC = hipMemcpy(
|
||||
c_device.GetDeviceBuffer(), d_C, M * N * sizeof(CDataType), hipMemcpyDeviceToHost);
|
||||
if(errC != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error copying C to device: " << hipGetErrorString(errC) << std::endl;
|
||||
}
|
||||
|
||||
errA = hipFree(d_A);
|
||||
if(errA != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error free the A memory: " << hipGetErrorString(errA) << std::endl;
|
||||
}
|
||||
|
||||
errB = hipFree(d_B);
|
||||
if(errB != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error free the B memory: " << hipGetErrorString(errB) << std::endl;
|
||||
}
|
||||
|
||||
errC = hipFree(d_C);
|
||||
if(errC != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error free the C memory: " << hipGetErrorString(errC) << std::endl;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -76,8 +76,7 @@ struct GemmKernel
|
||||
|
||||
CK_TILE_DEVICE void operator()(GemmCommonKargs kargs) const
|
||||
{
|
||||
const index_t i_m = TilePartitioner::iM;
|
||||
const index_t i_n = TilePartitioner::iN;
|
||||
const auto [i_m, i_n] = TilePartitioner{}();
|
||||
// options
|
||||
const ADataType* a_start = static_cast<const ADataType*>(kargs.a_ptr);
|
||||
const BDataType* b_start = static_cast<const BDataType*>(kargs.b_ptr);
|
||||
@@ -104,7 +103,7 @@ struct GemmKernel
|
||||
}();
|
||||
|
||||
auto b_tensor_view = [&]() {
|
||||
if constexpr(std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
|
||||
if constexpr(std::is_same_v<LayoutB, tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return make_naive_tensor_view<address_space_enum::global>(
|
||||
b_start,
|
||||
|
||||
@@ -15,9 +15,6 @@ struct GemmTilePartitioner
|
||||
static constexpr ck_tile::index_t kN = BlockGemmShape::kN;
|
||||
static constexpr ck_tile::index_t kK = BlockGemmShape::kK;
|
||||
|
||||
const index_t iM = __builtin_amdgcn_readfirstlane(i_tile_m * kM);
|
||||
const index_t iN = __builtin_amdgcn_readfirstlane(i_tile_n * kN);
|
||||
|
||||
CK_TILE_HOST static constexpr auto
|
||||
GridSize(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t batch_size)
|
||||
{
|
||||
@@ -29,10 +26,9 @@ struct GemmTilePartitioner
|
||||
|
||||
CK_TILE_DEVICE auto operator()()
|
||||
{
|
||||
const index_t i_GridDimX = blockIdx.x;
|
||||
const index_t i_GridDimY = blockIdx.y;
|
||||
const index_t i_GridDimZ = blockIdx.z;
|
||||
return ck_tile::make_tuple(i_GridDimX, i_GridDimY, i_GridDimZ);
|
||||
const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kM);
|
||||
const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kN);
|
||||
return ck_tile::make_tuple(iM, iN);
|
||||
}
|
||||
};
|
||||
} // namespace ck_tile
|
||||
|
||||
Reference in New Issue
Block a user