mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-21 21:39:15 +00:00
* Optimize GEMM on MI200/300: 1. Add new blockwise gemm pipeline 2. Add irregular splitk instances * clang format + typo fix * Fix a bug * initial commit * Add more instances to irregular splitk * blkgemm pipeline v1~4 prototype * Sanity Checked. Known issue: 1. Poor performance of splitk 2. Register spill on blkgemmpipeline v3 * Sanity and Performance fix: 1. fix a bug related to sanity in grouped b2c mapping 2. fix a bug related to sanity and performance in splitk offset * Sanity and API update: 1. Remove prefetch stage 2. Fix valid check bug 3. Add first gemm_universal instance into ckProfiler * Add NN instances for gemm universal * 1. Add NT instances for gemm_universal 2. Fix a bug about Kpadding in gemm_universal * Fix a bug regarding padding Odd K number * remove kernel print * Fix KPadding bug... * Update safety check * another try to fix kpadding.. * Sanity checked * new instances.. * clang format+typo fix * remove clang format script's change * Add non-hotloop compile option * 1. Add fp16xfp8 example 2. pull packed convert f8 from pr1150 * Some miscs.. opt and fix * Add pipeline description docs * Split universal gemm instance library to cut profiler compiling time * uncomment cmakefile * Fix a bug caused by blockwise_gemm_pipe_v2 * reduce default splitk to 1 * Add 224x256x64 tile size * update, including: 1. Experiment pipeline 5~7 2. Optimization for pipeline 4 3. Organized instance library * temp save * temp save * Permuted lds layout, sanity and function checked * clang format * Move OOB check from RunRead to RunWrite, for better software pipeline.
TODO: agpr spill when NN layout * clangformat * A/B splitpipe scheduler for v3 * Fix two bugs * bug fix * fix a bug in oob check * Example for mixed fp16_fp8 gemm * Clean experimental code blocks * Add mixed precision gemm into profiler * tempsave * optimize m/n major lds layout * Add RRR GEMM mixed precision instances * Optimize f8 matrix transpose * Add test_gemm_universal * A/B split schedule for blkpip v5 * Take ds_read2 into iglp scheduling scheme * format * fixed cmake * Add llvm-option into CI cmake flag --------- Co-authored-by: Jing Zhang <jizhan@amd.com>
92 lines
3.4 KiB
C++
92 lines
3.4 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <string>
|
|
#include <sstream>
|
|
#include <tuple>
|
|
#include <vector>
|
|
#include <gtest/gtest.h>
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
|
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
|
#include "include/ck/utility/data_type.hpp"
|
|
#include "profiler/profile_gemm_universal_impl.hpp"
|
|
|
|
namespace ck {
|
|
namespace test {
|
|
|
|
template <typename Tuple>
|
|
class TestGemmUniversal : public testing::Test
|
|
{
|
|
using Row = ck::tensor_layout::gemm::RowMajor;
|
|
using F32 = float;
|
|
|
|
protected:
|
|
using ALayout = std::tuple_element_t<0, Tuple>;
|
|
using BLayout = std::tuple_element_t<1, Tuple>;
|
|
using CLayout = Row;
|
|
using ADataType = std::tuple_element_t<2, Tuple>;
|
|
using BDataType = std::tuple_element_t<3, Tuple>;
|
|
using CDataType = std::tuple_element_t<4, Tuple>;
|
|
|
|
public:
|
|
static constexpr bool verify_ = true;
|
|
static constexpr int init_method_ = 1; // decimal value initialization
|
|
static constexpr bool log_ = false;
|
|
static constexpr bool bench_ = false; // measure kernel performance
|
|
std::vector<int> k_batches_;
|
|
|
|
void SetUp() override { k_batches_ = {1, 2, 3, 5, 8}; }
|
|
|
|
void Run(const int M,
|
|
const int N,
|
|
const int K,
|
|
const int StrideA,
|
|
const int StrideB,
|
|
const int StrideC)
|
|
{
|
|
for(auto kb : k_batches_)
|
|
{
|
|
RunSingle(M, N, K, StrideA, StrideB, StrideC, kb);
|
|
}
|
|
}
|
|
|
|
void RunSingle(const int M,
|
|
const int N,
|
|
const int K,
|
|
const int StrideA,
|
|
const int StrideB,
|
|
const int StrideC,
|
|
int kbatch = 1,
|
|
int n_warmup = 1,
|
|
int n_iter = 10)
|
|
{
|
|
bool pass = ck::profiler::profile_gemm_universal_impl<ADataType,
|
|
BDataType,
|
|
F32,
|
|
CDataType,
|
|
ALayout,
|
|
BLayout,
|
|
CLayout>(verify_,
|
|
init_method_,
|
|
log_,
|
|
bench_,
|
|
M,
|
|
N,
|
|
K,
|
|
StrideA,
|
|
StrideB,
|
|
StrideC,
|
|
kbatch,
|
|
n_warmup,
|
|
n_iter);
|
|
EXPECT_TRUE(pass);
|
|
}
|
|
};
|
|
|
|
} // namespace test
|
|
} // namespace ck
|