[CK Tile] contraction multi d - kernel & example (#2901)

* Initial commit. create batched_contraction_kernel file

* initial problem definition

* implement initial example to launch kernel

* add universal gemm to contraction. initial phase

* complete implementation for special case all Dims are 1 and no Ds

* clean code

* initial changes to support multi dimensional G

* more progress in implementing multiple G

* tmp commit

* manage dynamic NumDimG in kernel

* improving example for multi M,N,K,G handling. start generalizing kernel. it is a temporary commit

* implement the example for general Multi dimension G M N K and test different reference calculation algorithms

* 2 functions for reference using multi dimensional and flat indexing

* clean the code for multi-dimensional G, M, N, K contraction and add some logs

* Add Make descriptor function in kernel for merging Ms, Ns, Ks for A, B, E

* some cleaning on kernel

* clean the code for  calculating the offsets from flatten batch number

* Start adding MultiD support to kernel and example

* more changes to manage multi D in kernel and example

* manage passing multi d to kernel and testing.

* complete multi D support in kernel. modify example code to support it

* Correct algorithm to calc the correct offset values for D tensor batches and some code cleaning

* Minor fix

* Generalize example code for variable NumD tensors and apply cleanup based on review feedback

* Refactored code and addressed review feedback

* refactoring, cleaning, add documents, in kernel side and example codes

* Optimize batch offset calculation in kernel

* Inline CalculateBatchOffset in batched contraction kernel, update CHANGELOG.md

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
This commit is contained in:
msaffari-amd
2025-10-13 12:30:28 +02:00
committed by GitHub
parent 95bdc7410c
commit e9f0cc83a8
11 changed files with 1802 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
# Example executable for the CK Tile batched contraction kernel. EXCLUDE_FROM_ALL keeps it
# out of the default "all" target, so it is only built when requested explicitly.
add_executable(tile_example_batched_contraction EXCLUDE_FROM_ALL batched_contraction.cpp)
# Start from an empty list of extra compile options for this example.
set(EXAMPLE_CONTRACTION_COMPILE_OPTIONS)
# When CK_USE_OCP_FP8 is set, pass -DCK_TILE_USE_OCP_FP8 to this target.
if(CK_USE_OCP_FP8)
list(APPEND EXAMPLE_CONTRACTION_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
endif()
# Apply the collected options to this target only (PRIVATE).
target_compile_options(tile_example_batched_contraction PRIVATE ${EXAMPLE_CONTRACTION_COMPILE_OPTIONS})

View File

@@ -0,0 +1,245 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <hip/hip_runtime.h>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include "ck_tile/core.hpp"
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/batched_contraction.hpp"
#include "contraction_utils.hpp"
/**
 * @brief Instantiate and launch the batched contraction kernel for one compile-time
 *        (NumDimG, NumDimM, NumDimN, NumDimK) combination.
 *
 * The tile configuration is fixed by the constants at the top of the body. The function
 * multiplies the K extents stored in args.A_dims, asks the base GEMM pipeline for the
 * hot-loop flag and tail number that match the resulting loop count, and launches the
 * kernel through ck_tile::launch_kernel.
 *
 * @param args Host-side contraction arguments. K extents are read from args.A_dims at
 *             indices [NumDimG + NumDimM, NumDimG + NumDimM + NumDimK).
 * @param s    Stream configuration; s.log_level_ > 0 enables the launch log below.
 * @return The value returned by ck_tile::launch_kernel.
 *         NOTE(review): presumably the average kernel time in ms (the caller prints it as ms) - confirm.
 * @throws std::runtime_error if Kernel::IsSupportedArguments rejects the kernel arguments.
 */
template <typename ADataType,
typename BDataType,
typename DsDataType,
typename AccDataType,
typename EDataType,
typename ALayout,
typename BLayout,
typename DsLayout,
typename ELayout,
ck_tile::index_t NumDimG,
ck_tile::index_t NumDimM,
ck_tile::index_t NumDimN,
ck_tile::index_t NumDimK,
typename CDEElementWise = ck_tile::element_wise::PassThrough>
float batched_contraction_impl(const ck_tile::BatchedContractionHostArgs<DsDataType::size()>& args,
const ck_tile::stream_config& s)
{
// Block tile sizes fed into TileGemmShape.
constexpr ck_tile::index_t M_Tile = 256;
constexpr ck_tile::index_t N_Tile = 256;
constexpr ck_tile::index_t K_Tile = 64;
// Warp counts per block along M, N, K.
constexpr ck_tile::index_t M_Warp = 2;
constexpr ck_tile::index_t N_Warp = 2;
constexpr ck_tile::index_t K_Warp = 1;
// Per-warp tile sizes.
constexpr ck_tile::index_t M_Warp_Tile = 32;
constexpr ck_tile::index_t N_Warp_Tile = 32;
constexpr ck_tile::index_t K_Warp_Tile = 16;
constexpr bool DoubleSmemBuffer = false;
// Padding is disabled in every dimension.
// NOTE(review): presumably this requires extents compatible with the tile sizes;
// IsSupportedArguments below is the runtime gate - confirm what it checks.
constexpr bool kPadM = false;
constexpr bool kPadN = false;
constexpr bool kPadK = false;
constexpr bool TransposeC = false;
constexpr int kBlockPerCu = 1;
// Parameters of the spatially-local tile partitioner.
constexpr ck_tile::index_t TileParitionerGroupNum = 8;
constexpr ck_tile::index_t TileParitionerM01 = 4;
using GemmShape =
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
using TilePartitioner = ck_tile::
GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, ELayout>;
using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
kPadN,
kPadK,
DoubleSmemBuffer,
ALayout,
BLayout,
ELayout,
TransposeC>;
using Problem = ck_tile::BatchedContractionProblem<ADataType,
BDataType,
DsDataType,
EDataType,
NumDimG, // NumDimG
NumDimM, // NumDimM
NumDimN, // NumDimN
NumDimK, // NumDimK
DsDataType::size() // NumDTensor
>;
// GemmPipelineProblem is only used to select the base pipeline (hot-loop / tail queries)
// and for the log output; the launched pipeline uses UniversalGemmProblem inside Run.
using GemmPipelineProblem =
ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
// Product of all K extents, taken from the trailing NumDimK entries of A_dims.
ck_tile::index_t K_total = 1;
for(ck_tile::index_t i = NumDimG + NumDimM; i < NumDimG + NumDimM + NumDimK; ++i)
{
K_total *= args.A_dims[i];
}
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_total);
const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
float ave_time{0};
// Run receives the hot-loop flag and tail number as objects whose .value is usable in a
// constant expression, so both become template arguments of the pipeline problem.
const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
constexpr bool has_hot_loop_v = has_hot_loop_.value;
constexpr auto tail_number_v = tail_number_.value;
constexpr auto scheduler = GEMM_PIPELINE_SCHEDULER;
constexpr auto memory_operation =
ck_tile::memory_operation_enum::set; // Always set (no atomic_add)
using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
BDataType,
AccDataType,
GemmShape,
GemmUniversalTraits,
scheduler,
has_hot_loop_v,
tail_number_v>;
using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
using GemmEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<ADataType,
BDataType,
DsDataType,
AccDataType,
EDataType,
DsLayout,
ELayout,
CDEElementWise,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
M_Warp,
N_Warp,
M_Warp_Tile,
N_Warp_Tile,
K_Warp_Tile,
UniversalGemmProblem::TransposeC,
memory_operation>>;
using Kernel =
ck_tile::BatchedContractionKernel<Problem, TilePartitioner, GemmPipeline, GemmEpilogue>;
auto kargs = Kernel::MakeKernelArgs(args);
const dim3 grids = Kernel::GridSize(kargs);
const dim3 blocks = Kernel::GetBlockSize();
// Reject unsupported argument combinations before anything is launched.
if(!Kernel::IsSupportedArguments(kargs))
{
throw std::runtime_error("Wrong! Arguments not supported! Skipping contraction!\n");
}
if(s.log_level_ > 0)
{
std::cout << "Launching kernel with args: " << Kernel::GetKernelName() << '\n'
<< "shape: " << GemmShape::GetName() << '\n'
<< "problem: " << GemmPipelineProblem::GetName() << '\n'
<< "pipeline: " << GemmPipeline::GetName() << '\n'
<< "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
<< ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
<< std::endl;
}
auto kernel = ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs);
// The result is stored in the captured ave_time and also returned from the lambda.
ave_time = ck_tile::launch_kernel(s, kernel);
return ave_time;
};
// TailHandler invokes Run for the runtime (has_hot_loop, tail_num) pair; its own return
// value is ignored because Run has already written ave_time.
BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
return ave_time;
}
// HANDLE_CASE(G, M, N, K): if the runtime dimension counts equal (G, M, N, K), return from
// the enclosing function with the result of batched_contraction_impl instantiated for that
// combination. The expansion refers to the following names, which must be visible where
// the macro is used: num_g_dims, num_m_dims, num_n_dims, num_k_dims, args, s, and the
// types ADataType, BDataType, DsDataType, AccDataType, EDataType, ALayout, BLayout,
// DsLayout, ELayout, CDEElementWise.
#define HANDLE_CASE(G, M, N, K) \
if(num_g_dims == G && num_m_dims == M && num_n_dims == N && num_k_dims == K) \
{ \
return batched_contraction_impl<ADataType, \
BDataType, \
DsDataType, \
AccDataType, \
EDataType, \
ALayout, \
BLayout, \
DsLayout, \
ELayout, \
G, \
M, \
N, \
K, \
CDEElementWise>(args, s); \
}
/**
 * @brief Map runtime dimension counts onto a compiled batched_contraction_impl instance.
 *
 * Prints the requested counts, then tries each combination listed below via HANDLE_CASE.
 * The first match returns the result of the corresponding batched_contraction_impl call.
 *
 * @param args       Host-side contraction arguments forwarded to the implementation.
 * @param s          Stream configuration forwarded to the implementation.
 * @param num_g_dims Number of G dimensions.
 * @param num_m_dims Number of M dimensions.
 * @param num_n_dims Number of N dimensions.
 * @param num_k_dims Number of K dimensions.
 * @return Whatever the selected batched_contraction_impl instance returns.
 * @throws std::runtime_error if no listed combination matches.
 */
template <typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename AccDataType,
          typename EDataType,
          typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          typename CDEElementWise = ck_tile::element_wise::PassThrough>
float batched_contraction(const ck_tile::BatchedContractionHostArgs<DsDataType::size()>& args,
                          const ck_tile::stream_config& s,
                          ck_tile::index_t num_g_dims,
                          ck_tile::index_t num_m_dims,
                          ck_tile::index_t num_n_dims,
                          ck_tile::index_t num_k_dims)
{
    std::cout << "Dimensions: G=" << num_g_dims;
    std::cout << ", M=" << num_m_dims;
    std::cout << ", N=" << num_n_dims;
    std::cout << ", K=" << num_k_dims << std::endl;

    // Each HANDLE_CASE returns from this function when its (G, M, N, K) tuple matches.
    HANDLE_CASE(1, 1, 1, 1);
    HANDLE_CASE(2, 1, 1, 1);
    HANDLE_CASE(2, 2, 2, 1);
    HANDLE_CASE(1, 2, 1, 1);
    HANDLE_CASE(1, 1, 1, 2);
    HANDLE_CASE(2, 2, 2, 2);
    HANDLE_CASE(4, 4, 4, 4);

    // Nothing matched: report the requested combination.
    std::string message = "Unsupported dimension combination: G=";
    message += std::to_string(num_g_dims);
    message += ", M=";
    message += std::to_string(num_m_dims);
    message += ", N=";
    message += std::to_string(num_n_dims);
    message += ", K=";
    message += std::to_string(num_k_dims);
    message += ". Please add this combination to the kernel.";
    throw std::runtime_error(message);
}
#include "run_batched_contraction_example.inc"
/**
 * @brief Program entry point for the batched contraction example.
 *
 * run_batched_contraction_example returns -1 when argument parsing fails and otherwise
 * the verification result (non-zero on pass, 0 on failure). Only a positive result maps
 * to EXIT_SUCCESS; negating the value would turn the -1 error code into a successful
 * exit status.
 *
 * @return EXIT_SUCCESS if the example ran and passed, EXIT_FAILURE otherwise.
 */
int main(int argc, char* argv[])
{
    try
    {
        const int status = run_batched_contraction_example(argc, argv);
        return status > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
    }
    catch(const std::runtime_error& e)
    {
        std::cerr << "Runtime error: " << e.what() << '\n';
        return EXIT_FAILURE;
    }
    catch(const std::exception& e)
    {
        // Dimension parsing reports bad input with std::invalid_argument / std::out_of_range,
        // which are not runtime_errors; report them instead of terminating.
        std::cerr << "Error: " << e.what() << '\n';
        return EXIT_FAILURE;
    }
}

View File

@@ -0,0 +1,146 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
/// Element-wise functor: e = convert<E>(float(c) + sum of float(ds...)).
/// All operands are converted to float before they are added. The fold over `ds` has no
/// initial value, so at least one D operand must be supplied.
struct AddDs
{
    template <typename E, typename C, typename... Ds>
    CK_TILE_HOST_DEVICE void operator()(E& e, const C& c, const Ds&... ds) const
    {
        // Right fold over the D operands: d0 + (d1 + (...)).
        const float ds_sum = (ck_tile::type_convert<float>(ds) + ...);
        const float c_f    = ck_tile::type_convert<float>(c);
        e                  = ck_tile::type_convert<E>(c_f + ds_sum);
    }
};
// Pipeline selection for this example: the launched GEMM pipeline template, the matching
// base pipeline template (used for the hot-loop / tail-number queries), and the scheduler.
#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
/// Bundle of element types for one precision: A, B, D and E all use `DataType`,
/// while accumulation is always done in float.
template <typename DataType>
struct BatchedContractionTypeConfig
{
    // Accumulator type is independent of DataType.
    using AccDataType = float;

    // Input operands.
    using ADataType = DataType;
    using BDataType = DataType;
    using DDataType = DataType;

    // Output tensor.
    using EDataType = DataType;
};
// Global element types used by the example: everything is ck_tile::half_t except the
// accumulator, which BatchedContractionTypeConfig fixes to float.
using ContractionTypes = BatchedContractionTypeConfig<ck_tile::half_t>;
using ADataType = ContractionTypes::ADataType;
using BDataType = ContractionTypes::BDataType;
using AccDataType = ContractionTypes::AccDataType;
using EDataType = ContractionTypes::EDataType;
using DDataType = ContractionTypes::DDataType;
/// Register the example's command-line options with their defaults and parse argv.
/// @return std::tuple of (result of ArgParser::parse, a copy of the populated parser).
auto create_args(int argc, char* argv[])
{
    ck_tile::ArgParser arg_parser;

    // Problem shape: comma-separated extents per dimension group.
    arg_parser.insert("m_dims", "4,256", "M dimensions separated by comma (e.g., '16,32' for 2D M)")
        .insert("n_dims", "16,128", "N dimensions separated by comma (e.g., '32,32' for 2D N)")
        .insert("k_dims", "64", "K dimensions separated by comma (e.g., '64,32' for 2D K)")
        .insert(
            "g_dims", "1,2", "G dimensions separated by comma (e.g., '4,2' for 2D, '2,3,4' for 3D)");

    // Strides and layouts.
    arg_parser.insert("stride_a", "0", "Custom A tensor leading dimension stride (0 = auto)")
        .insert("stride_b", "0", "Custom B tensor leading dimension stride (0 = auto)")
        .insert("stride_e", "0", "Custom E tensor leading dimension stride (0 = auto)")
        .insert("a_layout", "R", "A tensor data layout - Row by default")
        .insert("b_layout", "C", "B tensor data layout - Col by default")
        .insert("e_layout", "R", "E tensor data layout - Row by default");

    // Validation, precision, benchmarking and logging controls.
    arg_parser.insert("v", "1", "0. No validation, 1. Validation on CPU")
        .insert("prec", "fp16", "data type. fp32/fp16/bf16")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
        .insert("repeat", "10", "number of iterations to benchmark the kernel")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("split_k", "1", "splitK value")
        .insert("log", "1", "log level for debugging");

    const bool parsed_ok = arg_parser.parse(argc, argv);
    return std::make_tuple(parsed_ok, arg_parser);
}
/**
 * @brief Parse a comma-separated list of dimension extents (e.g. "4,256").
 *
 * Every token must be a positive integer. Leading whitespace (skipped by std::stoi) and
 * trailing spaces/tabs are tolerated; any other trailing characters, empty tokens, and
 * zero or negative values are rejected rather than silently accepted.
 *
 * @param dims_str Comma-separated extents for one dimension group (G, M, N or K).
 * @return The parsed extents in input order; never empty.
 * @throws std::invalid_argument if the string yields no tokens or a token is not a
 *         positive integer.
 * @throws std::out_of_range if a token does not fit in int (propagated from std::stoi).
 */
std::vector<ck_tile::index_t> parse_dimensions(const std::string& dims_str)
{
    std::vector<ck_tile::index_t> dims;
    std::stringstream ss(dims_str);
    std::string token;
    while(std::getline(ss, token, ','))
    {
        std::size_t consumed = 0;
        int value            = 0;
        try
        {
            value = std::stoi(token, &consumed);
        }
        catch(const std::invalid_argument&)
        {
            // Replace the opaque "stoi" message with one that names the offending token.
            throw std::invalid_argument("Invalid dimension '" + token + "' in '" + dims_str +
                                        "'");
        }
        // std::stoi stops at the first non-digit; make sure nothing but blanks follows.
        if(token.find_first_not_of(" \t", consumed) != std::string::npos)
        {
            throw std::invalid_argument("Invalid dimension '" + token + "' in '" + dims_str +
                                        "'");
        }
        if(value <= 0)
        {
            throw std::invalid_argument("Dimension must be positive, got '" + token + "' in '" +
                                        dims_str + "'");
        }
        dims.push_back(value);
    }
    if(dims.empty())
    {
        throw std::invalid_argument("Dimensions cannot be empty");
    }
    return dims;
}
/// Multiply all extents in `dims` together; an empty vector yields 1.
ck_tile::index_t calculate_total_elements(const std::vector<ck_tile::index_t>& dims)
{
    ck_tile::index_t product = 1;
    for(auto it = dims.begin(); it != dims.end(); ++it)
    {
        product = product * (*it);
    }
    return product;
}
/**
 * @brief Join several dimension groups into one flat dimension list.
 *
 * The groups (for example the G, M and K extents of a tensor) are appended one after
 * another, keeping both the order of the groups and the order inside each group.
 *
 * Example:
 *   Input:  {{G0, G1}, {M0, M1}, {K0}}
 *   Output: {G0, G1, M0, M1, K0}
 *
 * @param dim_components Dimension groups to join, in the desired output order.
 * @return One vector holding every extent of every group, in order.
 */
std::vector<ck_tile::index_t>
concatenate_dim_components(const std::vector<std::vector<ck_tile::index_t>>& dim_components)
{
    std::vector<ck_tile::index_t> flat;
    for(const auto& group : dim_components)
    {
        // Append this group's extents element by element.
        for(const auto extent : group)
        {
            flat.push_back(extent);
        }
    }
    return flat;
}
/// Print `name: [d0,d1,...] ` to std::cout, followed by `(total=N)` when `total` is
/// non-zero, and end the line with std::endl.
void print_dims(const std::string& name,
                const std::vector<ck_tile::index_t>& dims,
                ck_tile::index_t total)
{
    std::cout << name << ": [";
    // Emit the separator before every element except the first.
    const char* separator = "";
    for(const auto extent : dims)
    {
        std::cout << separator << extent;
        separator = ",";
    }
    std::cout << "] ";
    if(total != 0)
    {
        std::cout << "(total=" << total << ")";
    }
    std::cout << std::endl;
}

View File

@@ -0,0 +1,405 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <algorithm>
#include <cmath>
#include "contraction_utils.hpp"
#include "ck_tile/host/reference/reference_batched_contraction.hpp"
/**
 * @brief Derive the relative and absolute tolerances used to compare GPU and CPU results.
 *
 * Two threshold pairs are computed with ck_tile::get_relative_threshold /
 * get_absolute_threshold: one for accumulating ceil(K / kbatch) products, and one for
 * combining `kbatch` partial results in EDataType. The larger value of each kind is kept.
 *
 * @param K                     Total reduction length.
 * @param kbatch                Split-K factor.
 * @param max_accumulated_value Largest value of the reference output.
 * @return ck_tile tuple (rtol, atol).
 */
template <typename ADataType, typename BDataType, typename EDataType, typename AccDataType>
auto calculate_rtol_atol(const ck_tile::index_t K,
                         const ck_tile::index_t kbatch,
                         const float max_accumulated_value)
{
    // The narrower of the two input types is used as the compute type.
    using ComputeType =
        std::conditional_t<(sizeof(ADataType) < sizeof(BDataType)), ADataType, BDataType>;

    // Reduction length handled by a single split.
    const auto k_per_split = ck_tile::integer_divide_ceil(K, kbatch);

    const auto rtol_accumulate =
        ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(k_per_split);
    const auto atol_accumulate =
        ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
            max_accumulated_value / kbatch, k_per_split);

    const auto rtol_reduce =
        ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
    const auto atol_reduce = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
        max_accumulated_value, kbatch);

    const auto rtol = std::max(rtol_accumulate, rtol_reduce);
    const auto atol = std::max(atol_accumulate, atol_reduce);
    return ck_tile::make_tuple(rtol, atol);
}
/**
 * @brief Pack the device pointers, dims and strides into BatchedContractionHostArgs and
 *        run the contraction through batched_contraction.
 *
 * The number of G/M/N/K dimensions handed to the dispatcher is the size of the
 * corresponding *_dims vector.
 *
 * @param a_full_dims_dev_buf Device pointer for A.
 * @param b_full_dims_dev_buf Device pointer for B.
 * @param ds_dev_buf          Device pointers for the D tensors.
 * @param e_full_dims_dev_buf Device pointer for E.
 * @param G_dims, M_dims, N_dims, K_dims Per-group extents; only their sizes are used here.
 * @param A_dims, B_dims, Ds_dims, E_dims Full per-tensor dimension lists.
 * @param A_strides, B_strides, Ds_strides, E_strides Full per-tensor stride lists.
 * @param kbatch   Split-K factor stored in the host args.
 * @param n_warmup Warm-up iteration count placed in the stream config.
 * @param n_repeat Repeat iteration count placed in the stream config.
 * @return The value returned by batched_contraction.
 */
template <typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename AccDataType,
          typename EDataType,
          typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          typename CDEElementWise = ck_tile::element_wise::PassThrough>
float invoke_batched_contraction_kernel(
    const void* a_full_dims_dev_buf,
    const void* b_full_dims_dev_buf,
    const std::array<const void*, DsDataType::size()>& ds_dev_buf,
    void* e_full_dims_dev_buf,
    const std::vector<ck_tile::index_t>& G_dims,
    const std::vector<ck_tile::index_t>& M_dims,
    const std::vector<ck_tile::index_t>& N_dims,
    const std::vector<ck_tile::index_t>& K_dims,
    const std::vector<ck_tile::index_t>& A_dims, // [G0,G1,..,M0,M1,..,K0,K1,..]
    const std::vector<ck_tile::index_t>& B_dims, // [G0,G1,..,N0,N1,..,K0,K1,..]
    const std::array<std::vector<ck_tile::index_t>, DsDataType::size()>&
        Ds_dims, // [G0, G1, ..., M0, M1, ... , N0, N1, ...][NumDTensor]
    const std::vector<ck_tile::index_t>& E_dims,    // [G0,G1,..,M0,M1,..,N0,N1,..]
    const std::vector<ck_tile::index_t>& A_strides, // [G0,G1,..,M0,M1,..,K0,K1,..]
    const std::vector<ck_tile::index_t>& B_strides, // [G0,G1,..,N0,N1,..,K0,K1,..]
    const std::array<std::vector<ck_tile::index_t>, DsDataType::size()>& Ds_strides,
    const std::vector<ck_tile::index_t>& E_strides, // [G0,G1,..,M0,M1,..,N0,N1,..]
    ck_tile::index_t kbatch,
    int n_warmup,
    int n_repeat)
{
    using HostArgs = ck_tile::BatchedContractionHostArgs<DsDataType::size()>;

    std::cout << "Creating BatchedContractionHostArgs..." << std::endl;
    // Positional order: a, b, ds, e pointers; k_batch; A/B/Ds/E dims; A/B/Ds/E strides.
    HostArgs args(a_full_dims_dev_buf,
                  b_full_dims_dev_buf,
                  ds_dev_buf,
                  e_full_dims_dev_buf,
                  kbatch,
                  A_dims,
                  B_dims,
                  Ds_dims,
                  E_dims,
                  A_strides,
                  B_strides,
                  Ds_strides,
                  E_strides);

    // Dimension counts per group, taken from the sizes of the group vectors.
    const auto g_rank = G_dims.size();
    const auto m_rank = M_dims.size();
    const auto n_rank = N_dims.size();
    const auto k_rank = K_dims.size();

    std::cout << "Calling batched_contraction with dimensions: G=" << g_rank << ", M=" << m_rank
              << ", N=" << n_rank << ", K=" << k_rank << std::endl;

    const ck_tile::stream_config stream_cfg{nullptr, true, 1, n_warmup, n_repeat};
    return batched_contraction<ADataType,
                               BDataType,
                               DsDataType,
                               AccDataType,
                               EDataType,
                               ALayout,
                               BLayout,
                               DsLayout,
                               ELayout,
                               CDEElementWise>(args, stream_cfg, g_rank, m_rank, n_rank, k_rank);
}
/**
 * @brief Run the batched contraction example end to end for one layout combination.
 *
 * Steps: parse the command line, build the A/B/Ds/E dimension and stride lists from the
 * G/M/N/K groups, fill host tensors with uniform random data, copy them to the device,
 * run the GPU kernel, print performance numbers and - when "-v=1" - compare the result
 * against a CPU reference and print ten random samples.
 *
 * The example always uses two D tensors (NumDTensor == 2) that share E's dims and strides.
 *
 * @param argc, argv Command line, parsed with create_args.
 * @param a_layout, b_layout, d_layout, e_layout Tag objects; only their types are used.
 * @return -1 if argument parsing fails; otherwise the verification result converted to
 *         int (1 = pass or validation disabled, 0 = mismatch).
 */
template <typename ALayout, typename BLayout, typename DLayout, typename ELayout>
int run_batched_contraction_example_with_layouts(
    int argc,
    char* argv[],
    [[maybe_unused]] const ALayout a_layout = ALayout{},
    [[maybe_unused]] const BLayout b_layout = BLayout{},
    [[maybe_unused]] const DLayout d_layout = DLayout{},
    [[maybe_unused]] const ELayout e_layout = ELayout{})
{
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
        return -1;

    // ---- Problem description -------------------------------------------------------
    std::vector<ck_tile::index_t> G_dims = parse_dimensions(arg_parser.get_str("g_dims"));
    std::vector<ck_tile::index_t> M_dims = parse_dimensions(arg_parser.get_str("m_dims"));
    std::vector<ck_tile::index_t> N_dims = parse_dimensions(arg_parser.get_str("n_dims"));
    std::vector<ck_tile::index_t> K_dims = parse_dimensions(arg_parser.get_str("k_dims"));

    constexpr ck_tile::index_t NumDTensor = 2;

    ck_tile::index_t G_total = calculate_total_elements(G_dims);
    ck_tile::index_t M_total = calculate_total_elements(M_dims);
    ck_tile::index_t N_total = calculate_total_elements(N_dims);
    ck_tile::index_t K_total = calculate_total_elements(K_dims);

    std::vector<ck_tile::index_t> A_dims =
        concatenate_dim_components({G_dims, M_dims, K_dims}); // [G0,G1,..,M0,M1,..,K0,K1,..]
    std::vector<ck_tile::index_t> B_dims =
        concatenate_dim_components({G_dims, N_dims, K_dims}); // [G0,G1,..,N0,N1,..,K0,K1,..]
    std::vector<ck_tile::index_t> E_dims =
        concatenate_dim_components({G_dims, M_dims, N_dims}); // [G0,G1,..,M0,M1,..,N0,N1,..]

    // Every D tensor has the same shape as E.
    std::array<std::vector<ck_tile::index_t>, NumDTensor> Ds_dims;
    for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
    {
        Ds_dims[d] = E_dims;
    }

    // HostTensorDescriptor reports strides as std::size_t; the kernel args use index_t.
    auto convert_strides = [](const std::vector<std::size_t>& strides) {
        std::vector<ck_tile::index_t> converted(strides.size());
        std::copy(strides.begin(), strides.end(), converted.begin());
        return converted;
    };

    ck_tile::HostTensorDescriptor a_desc(A_dims);
    ck_tile::HostTensorDescriptor b_desc(B_dims);
    ck_tile::HostTensorDescriptor e_desc(E_dims);

    // D descriptors reuse E's strides.
    std::array<ck_tile::HostTensorDescriptor, NumDTensor> ds_descs;
    for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
    {
        ds_descs[d] = ck_tile::HostTensorDescriptor(Ds_dims[d], e_desc.get_strides());
    }

    std::vector<ck_tile::index_t> A_strides = convert_strides(a_desc.get_strides());
    std::vector<ck_tile::index_t> B_strides = convert_strides(b_desc.get_strides());
    std::vector<ck_tile::index_t> E_strides = convert_strides(e_desc.get_strides());

    std::array<std::vector<ck_tile::index_t>, NumDTensor> Ds_strides;
    for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
    {
        Ds_strides[d] = convert_strides(ds_descs[d].get_strides());
    }

    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
    int n_warmup            = arg_parser.get_int("warmup");
    int n_repeat            = arg_parser.get_int("repeat");

    // ---- Log the configuration -----------------------------------------------------
    print_dims("G_dims", G_dims, G_total);
    print_dims("M_dims", M_dims, M_total);
    print_dims("N_dims", N_dims, N_total);
    print_dims("K_dims", K_dims, K_total);

    std::cout << "NumDTensor: " << NumDTensor << std::endl;

    std::cout << "\n=== Tensor Shapes for Kernel ===" << std::endl;
    print_dims("A_dims", A_dims, 0);
    print_dims("B_dims", B_dims, 0);
    print_dims("E_dims", E_dims, 0);
    for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
    {
        print_dims("Ds[" + std::to_string(d) + "]_dims", Ds_dims[d], 0);
    }

    std::cout << "\n=== Tensor Strides ===" << std::endl;
    print_dims("A_strides", A_strides, 0);
    print_dims("B_strides", B_strides, 0);
    print_dims("E_strides", E_strides, 0);
    for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
    {
        print_dims("Ds[" + std::to_string(d) + "]_strides", Ds_strides[d], 0);
    }
    std::cout << "===============================================\n" << std::endl;

    // ---- Host tensors and device buffers -------------------------------------------
    ck_tile::HostTensor<::ADataType> a_full_dims_host(a_desc);
    ck_tile::HostTensor<::BDataType> b_full_dims_host(b_desc);
    ck_tile::HostTensor<::EDataType> e_full_dims_host(e_desc);

    std::vector<ck_tile::HostTensor<::DDataType>> ds_full_dims_host;
    for(int d = 0; d < NumDTensor; ++d)
    {
        ds_full_dims_host.emplace_back(ck_tile::HostTensor<::DDataType>(ds_descs[d]));
    }

    ck_tile::FillUniformDistribution<::ADataType>{-5.f, 5.f, std::nullopt}(a_full_dims_host);
    ck_tile::FillUniformDistribution<::BDataType>{-5.f, 5.f, std::nullopt}(b_full_dims_host);

    ck_tile::DeviceMem a_full_dims_dev_buf(a_full_dims_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem b_full_dims_dev_buf(b_full_dims_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem e_full_dims_dev_buf(e_full_dims_host.get_element_space_size_in_bytes());

    a_full_dims_dev_buf.ToDevice(a_full_dims_host.data());
    b_full_dims_dev_buf.ToDevice(b_full_dims_host.data());

    for(int d = 0; d < NumDTensor; ++d)
    {
        ck_tile::FillUniformDistribution<::DDataType>{-2.f, 2.f, std::nullopt}(
            ds_full_dims_host[d]);
    }

    // One device buffer per D tensor, held through unique_ptr in a vector.
    std::vector<std::unique_ptr<ck_tile::DeviceMem>> ds_full_dims_dev_buf;
    for(int d = 0; d < NumDTensor; ++d)
    {
        ds_full_dims_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
            ds_full_dims_host[d].get_element_space_size_in_bytes()));
        ds_full_dims_dev_buf[d]->ToDevice(ds_full_dims_host[d].data());
    }

    std::array<const void*, NumDTensor> ds_ptr_buf;
    for(int d = 0; d < NumDTensor; ++d)
    {
        ds_ptr_buf[d] = ds_full_dims_dev_buf[d]->GetDeviceBuffer();
    }

    e_full_dims_dev_buf.SetZero();
    e_full_dims_host.SetZero();

    // ---- GPU run -------------------------------------------------------------------
    std::cout << "\n=== Running GPU Kernel ===" << std::endl;

    using DsDataType = ck_tile::tuple_array<::DDataType, NumDTensor>;
    using DsLayout   = ck_tile::tuple_array<DLayout, NumDTensor>;
    // AddDs needs at least one D operand, so fall back to PassThrough when there are none.
    using CDEElementWise =
        std::conditional_t<NumDTensor == 0, ck_tile::element_wise::PassThrough, AddDs>;

    float ave_time =
        invoke_batched_contraction_kernel<::ADataType,
                                          ::BDataType,
                                          DsDataType,
                                          ::AccDataType,
                                          ::EDataType,
                                          ALayout,
                                          BLayout,
                                          DsLayout,
                                          ELayout,
                                          CDEElementWise>(a_full_dims_dev_buf.GetDeviceBuffer(),
                                                          b_full_dims_dev_buf.GetDeviceBuffer(),
                                                          ds_ptr_buf,
                                                          e_full_dims_dev_buf.GetDeviceBuffer(),
                                                          G_dims,
                                                          M_dims,
                                                          N_dims,
                                                          K_dims,
                                                          A_dims,
                                                          B_dims,
                                                          Ds_dims,
                                                          E_dims,
                                                          A_strides,
                                                          B_strides,
                                                          Ds_strides,
                                                          E_strides,
                                                          kbatch,
                                                          n_warmup,
                                                          n_repeat);

    std::string op_name{
        "Multi-Dimensional Batched Contraction : G: " + std::to_string(G_dims.size()) +
        "D, M: " + std::to_string(M_dims.size()) + "D, N: " + std::to_string(N_dims.size()) +
        "D, K: " + std::to_string(K_dims.size()) + "D"};

    // ---- Performance figures -------------------------------------------------------
    // Number of output elements (G * M * N), computed in std::size_t so the products
    // below cannot overflow the 32-bit index type.
    const std::size_t output_elements =
        static_cast<std::size_t>(G_total) * static_cast<std::size_t>(M_total) *
        static_cast<std::size_t>(N_total);
    // 2 * K operations per output element for the contraction, plus one addition per
    // D tensor per output element for the element-wise epilogue.
    std::size_t flop = std::size_t(2) * output_elements * static_cast<std::size_t>(K_total) +
                       static_cast<std::size_t>(NumDTensor) * output_elements;
    std::size_t num_byte =
        sizeof(::ADataType) * G_total * M_total * K_total +              // A tensor size
        sizeof(::BDataType) * G_total * N_total * K_total +              // B tensor size
        sizeof(::DDataType) * NumDTensor * G_total * M_total * N_total + // D tensors
        sizeof(::EDataType) * G_total * M_total * N_total;               // E tensor size

    // ave_time is printed as milliseconds below, hence the 1e9 / 1e6 divisors.
    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time; // TFlops calculation
    float gb_per_sec = num_byte / 1.E6 / ave_time;                 // GB/s calculation

    print_dims("G_dims", G_dims, G_total);
    print_dims("M_dims", M_dims, M_total);
    print_dims("N_dims", N_dims, N_total);
    print_dims("K_dims", K_dims, K_total);
    std::cout << " Performance: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s" << std::endl;
    std::cout << "===============================================" << std::endl;

    e_full_dims_dev_buf.FromDevice(e_full_dims_host.data());
    std::cout << "GPU results retrieved from device." << std::endl;

    // ---- Optional CPU verification -------------------------------------------------
    bool pass = true;
    if(arg_parser.get_int("v") == 1)
    {
        std::cout << "Computing CPU reference..." << std::endl;

        ck_tile::HostTensor<::EDataType> e_full_dims_host_ref(
            ck_tile::HostTensorDescriptor(E_dims, E_strides));
        e_full_dims_host_ref.SetZero();

        auto start_time = std::chrono::high_resolution_clock::now();
        calculate_reference_flat_indexing<ADataType,
                                          BDataType,
                                          DDataType,
                                          EDataType,
                                          AccDataType,
                                          CDEElementWise>(a_full_dims_host,
                                                          b_full_dims_host,
                                                          ds_full_dims_host,
                                                          e_full_dims_host_ref,
                                                          G_total,
                                                          M_total,
                                                          N_total,
                                                          K_total,
                                                          CDEElementWise{});
        auto end_time = std::chrono::high_resolution_clock::now();
        auto duration =
            std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
        std::cout << "CPU reference completed in " << duration.count() << "ms" << std::endl;

        // The tolerances scale with the largest reference value.
        const float max_accumulated_value =
            *std::max_element(e_full_dims_host_ref.mData.begin(), e_full_dims_host_ref.mData.end());
        const auto rtol_atol =
            calculate_rtol_atol<::ADataType, ::BDataType, ::EDataType, ::AccDataType>(
                K_total, kbatch, max_accumulated_value);

        pass = ck_tile::check_err(e_full_dims_host,
                                  e_full_dims_host_ref,
                                  "Error: Incorrect results!",
                                  rtol_atol.at(ck_tile::number<0>{}),
                                  rtol_atol.at(ck_tile::number<1>{}));

        std::cout << "The CPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
        std::cout << "===============================================" << std::endl;

        std::cout << "\n=== Random Samples of Reference and Result ===" << std::endl;
        // Generate 10 random indices
        std::vector<std::size_t> random_indices;
        std::size_t total_elements = e_full_dims_host_ref.mData.size();
        std::mt19937 rng(std::random_device{}());
        std::uniform_int_distribution<std::size_t> dist(0, total_elements - 1);
        for(int i = 0; i < 10; ++i)
        {
            random_indices.push_back(dist(rng));
        }
        // Print the values at the random indices
        for(std::size_t idx : random_indices)
        {
            std::cout << "Index " << idx << ": "
                      << "ref=" << static_cast<float>(e_full_dims_host_ref.mData[idx]) << ", "
                      << "GPU=" << static_cast<float>(e_full_dims_host.mData[idx]) << std::endl;
        }
        std::cout << "===============================================" << std::endl;
    }
    return pass;
}
/**
 * @brief Parse the layout options and run the example for the supported layout combination.
 *
 * Only A = "R" together with B = "C" is accepted; the example is then run with
 * Row/Col/Row/Row tags for A/B/D/E.
 *
 * @return -1 if argument parsing fails, otherwise the result of
 *         run_batched_contraction_example_with_layouts.
 * @throws std::runtime_error for any other a_layout / b_layout combination.
 */
int run_batched_contraction_example(int argc, char* argv[])
{
    using Row = ck_tile::tensor_layout::gemm::RowMajor;
    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;

    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
    {
        return -1;
    }

    const std::string a_layout = arg_parser.get_str("a_layout");
    const std::string b_layout = arg_parser.get_str("b_layout");

    // Guard clause: reject everything except the single supported combination.
    if(a_layout != "R" || b_layout != "C")
    {
        throw std::runtime_error("Unsupported data layout configuration for A,B and E tensors! "
                                 "Only R-C-R supported for now.");
    }

    return run_batched_contraction_example_with_layouts(argc, argv, Row{}, Col{}, Row{}, Row{});
}

View File

@@ -27,3 +27,4 @@ add_subdirectory(36_pooling)
add_subdirectory(38_block_scale_gemm)
add_subdirectory(39_copy)
add_subdirectory(40_streamk_gemm)
add_subdirectory(41_batched_contraction)