mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
Reorganize project folders (#6)
This commit is contained in:
58
codegen/CMakeLists.txt
Normal file
58
codegen/CMakeLists.txt
Normal file
@@ -0,0 +1,58 @@
|
||||
cmake_minimum_required(VERSION 3.16)
|
||||
project(composable_kernel_host)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
|
||||
|
||||
find_package(ROCM)
|
||||
include(ROCMInstallTargets)
|
||||
include(ROCMTest)
|
||||
|
||||
rocm_setup_version(VERSION 1.0)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
|
||||
include(Embed)
|
||||
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
|
||||
${CK_ROOT}/include/ck/*.hpp)
|
||||
# printouts fot debug purposes
|
||||
# message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
|
||||
# message(STATUS "RELATIVE: ${CK_ROOT}/include")
|
||||
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
|
||||
|
||||
add_compile_options(-std=c++17)
|
||||
|
||||
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
|
||||
# TODO: Use object library
|
||||
add_library(ck_host STATIC ${SOURCES})
|
||||
target_link_libraries(ck_host PRIVATE ck_headers)
|
||||
|
||||
set_target_properties(ck_host PROPERTIES
|
||||
LINKER_LANGUAGE CXX
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
# target_include_directories(ck_host PUBLIC
|
||||
# $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
||||
# )
|
||||
|
||||
add_executable(ck-template-driver driver/main.cpp)
|
||||
target_link_libraries(ck-template-driver ck_host)
|
||||
|
||||
rocm_install_targets(
|
||||
TARGETS ck_host ck_headers
|
||||
EXPORT ck_host_targets
|
||||
INCLUDE include
|
||||
)
|
||||
rocm_export_targets(
|
||||
EXPORT ck_host_targets
|
||||
NAMESPACE composable_kernel::
|
||||
)
|
||||
|
||||
if(BUILD_TESTING)
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
|
||||
2
codegen/README.md
Normal file
2
codegen/README.md
Normal file
@@ -0,0 +1,2 @@
|
||||
[Back to the main page](../README.md)
|
||||
# Composable Kernel codegen
|
||||
107
codegen/driver/main.cpp
Normal file
107
codegen/driver/main.cpp
Normal file
@@ -0,0 +1,107 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include "ck/host/device_gemm_multiple_d/operation.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
|
||||
using ck::host::Transform;
|
||||
|
||||
struct Emitters
|
||||
{
|
||||
// retrieve the hard-coded instances provided, template them, and then store them in a map
|
||||
std::unordered_map<std::string, std::function<std::vector<std::string>()>> m;
|
||||
|
||||
template <class T>
|
||||
void Register(const std::string& name, const std::string& prologue, const std::string& epilogue)
|
||||
{
|
||||
m[name] = [&] {
|
||||
auto configs = T::CreateOperations(prologue, epilogue);
|
||||
|
||||
return Transform(configs, [](const auto& ops) { return ToTuple(ops); });
|
||||
};
|
||||
}
|
||||
|
||||
// takes in an operation instance and uses it to substitute the correct values into the template
|
||||
template <class T>
|
||||
static std::string ToTuple(const T& ops)
|
||||
{
|
||||
auto templates = Transform(
|
||||
ops, [](const auto& op) { return " " + op.ToSolution().ToTemplateString(); });
|
||||
return "std::tuple<\n" + ck::host::JoinStrings(templates, ",\n") + ">";
|
||||
}
|
||||
|
||||
// Join together all the strings in the map
|
||||
std::string Emit(const std::string& name) { return ck::host::JoinStrings(m.at(name)(), "\n"); }
|
||||
|
||||
std::vector<std::string> List() const
|
||||
{
|
||||
return Transform(m, [](auto&& p) { return p.first; });
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
std::string prog = argv[0];
|
||||
std::vector<std::string> args(argv + 1, argv + argc);
|
||||
|
||||
// Specify problem type and problem size
|
||||
ck::host::device_gemm_multiple_d::Problem prob;
|
||||
prob.M = 1024;
|
||||
prob.N = 1024;
|
||||
prob.K = 1024;
|
||||
|
||||
// user provided fusion
|
||||
std::string prologue = "";
|
||||
std::string epilogue = R"(
|
||||
struct Epilogue
|
||||
{
|
||||
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};)";
|
||||
|
||||
// Load in operations into the Register
|
||||
Emitters e;
|
||||
e.Register<ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle>(
|
||||
"DeviceGemmMultipleD_Xdl_CShuffle", prologue, epilogue);
|
||||
|
||||
if(args.empty() or std::any_of(args.begin(), args.end(), [](auto arg) {
|
||||
return arg == "-h" or arg == "--help";
|
||||
}))
|
||||
{
|
||||
std::cout << "USAGE:" << std::endl;
|
||||
std::cout << " " << prog << " [TEMPLATE]" << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "FLAGS:" << std::endl;
|
||||
std::cout << " -h, --help Show help" << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "TEMPLATES:" << std::endl;
|
||||
for(auto x : e.List())
|
||||
std::cout << " " << x << std::endl;
|
||||
std::cout << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// print out all the instances for the operation that was chosen at the command line
|
||||
for(auto name : args)
|
||||
std::cout << e.Emit(name) << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "ck/host/types.hpp"
|
||||
#include "ck/host/operation/gemm.hpp"
|
||||
#include "ck/host/device_batched_gemm_softmax_gemm/problem.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_batched_gemm_softmax_gemm {
|
||||
|
||||
// defines all values need for an instance of fwd conv
|
||||
struct Operation_Xdl_CShuffle
|
||||
{
|
||||
// returns a vector of instances, only given fusion operators: will use default problem spec
|
||||
static std::vector<std::vector<Operation_Xdl_CShuffle>>
|
||||
CreateOperations(const std::string& prologue, const std::string& epilogue);
|
||||
// returns a vector of instances, given a problem spec and fusion operators
|
||||
static std::vector<Operation_Xdl_CShuffle>
|
||||
CreateOperations(const Problem& prob, const std::string& prologue, const std::string& epilogue);
|
||||
TensorDesc A{};
|
||||
TensorDesc B{};
|
||||
TensorDesc B1{};
|
||||
TensorDesc C{};
|
||||
DataType acc = DataType::Float;
|
||||
DataType cs_type = DataType::Half;
|
||||
std::string a_elem_op = PassThrough;
|
||||
std::string b_elem_op = PassThrough;
|
||||
std::string b1_elem_op = PassThrough;
|
||||
std::string c_elem_op = PassThrough;
|
||||
std::string acc_elem_op = Scale;
|
||||
std::string prologue = "";
|
||||
std::string epilogue = "";
|
||||
std::string gemm_specialization = "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
// tuning parameters
|
||||
operation::TileDescGemmGemm tile_desc{};
|
||||
operation::BlockTransferDesc a_block_transfer{};
|
||||
operation::BlockTransferDesc b0_block_transfer{};
|
||||
operation::BlockTransferDesc b1_block_transfer{};
|
||||
operation::CShuffleDesc cshuffle{};
|
||||
operation::CBlockTransferDesc c_block_transfer{};
|
||||
|
||||
bool mask_out_upper_triangle = false;
|
||||
|
||||
// functions to update fusion operators if provided
|
||||
void update_prologue(const std::string& prologue);
|
||||
void update_epilogue(const std::string& epilogue);
|
||||
/**constexpr**/ bool
|
||||
IsSupported(std::size_t MRaw_, std::size_t NRaw_, std::size_t KRaw_, std::size_t Gemm1NRaw_);
|
||||
// returns a templated instance
|
||||
Solution ToSolution() const;
|
||||
};
|
||||
|
||||
} // namespace device_batched_gemm_softmax_gemm
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,48 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "ck/host/types.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_batched_gemm_softmax_gemm {
|
||||
|
||||
// defines the problem specification for a GEMM operation
|
||||
struct Problem
|
||||
{
|
||||
std::size_t M = 0;
|
||||
std::size_t N = 0;
|
||||
std::size_t K = 0;
|
||||
std::size_t O = 0;
|
||||
bool TransA = false;
|
||||
bool TransB = false;
|
||||
bool TransB1 = false;
|
||||
bool TransC = false;
|
||||
DataType ADataType = DataType::Half;
|
||||
DataType BDataType = DataType::Half;
|
||||
DataType B1DataType = DataType::Half;
|
||||
DataType CDataType = DataType::Half;
|
||||
std::string AElementOp = PassThrough;
|
||||
std::string BElementOp = PassThrough;
|
||||
std::string B1ElementOp = PassThrough;
|
||||
std::string CElementOp = PassThrough;
|
||||
std::string AccElementOp = Scale;
|
||||
bool MaskOutUpperTriangle = false;
|
||||
|
||||
// returns the correct device op file for the operation
|
||||
std::string GetIncludeHeader() const;
|
||||
|
||||
// returns a list of instances based on the problem spec and provided fusion operations
|
||||
std::vector<Solution> GetSolutions(const std::string& arch,
|
||||
const std::string& prologue = "",
|
||||
const std::string& epilogue = "") const;
|
||||
};
|
||||
|
||||
} // namespace device_batched_gemm_softmax_gemm
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
42
codegen/include/ck/host/device_gemm_multiple_d.hpp
Normal file
42
codegen/include/ck/host/device_gemm_multiple_d.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include "ck/host/types.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
struct Problem
|
||||
{
|
||||
std::size_t M = 0;
|
||||
std::size_t N = 0;
|
||||
std::size_t K = 0;
|
||||
bool TransA = false;
|
||||
bool TransB = false;
|
||||
bool TransE = false;
|
||||
std::vector<bool> DsTrans = {};
|
||||
DataType ADataType = DataType::Half;
|
||||
DataType BDataType = DataType::Half;
|
||||
DataType EDataType = DataType::Half;
|
||||
std::vector<DataType> DsDataType = {};
|
||||
std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string CDEElementOp = "ck::Tuple<>";
|
||||
|
||||
std::string GetIncludeHeader() const;
|
||||
|
||||
std::vector<Solution> GetSolutions(const std::string& arch) const;
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
57
codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
Normal file
57
codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
Normal file
@@ -0,0 +1,57 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "ck/host/types.hpp"
|
||||
#include "ck/host/operation/gemm.hpp"
|
||||
#include "ck/host/device_gemm_multiple_d/problem.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
// defines all values need for an instance of fwd conv
|
||||
struct Operation_Xdl_CShuffle
|
||||
{
|
||||
// returns a vector of instances, only given fusion operators: will use default problem spec
|
||||
static std::vector<std::vector<Operation_Xdl_CShuffle>>
|
||||
CreateOperations(const std::string& prologue, const std::string& epilogue);
|
||||
// returns a vector of instances, given a problem spec and fusion operators
|
||||
static std::vector<Operation_Xdl_CShuffle>
|
||||
CreateOperations(const Problem& prob, const std::string& prologue, const std::string& epilogue);
|
||||
TensorDesc A{};
|
||||
TensorDesc B{};
|
||||
DataType acc = DataType::Float;
|
||||
DataType cs_type = DataType::Half;
|
||||
std::vector<TensorDesc> Ds = {};
|
||||
TensorDesc E{};
|
||||
std::string a_elem_op = PassThrough;
|
||||
std::string b_elem_op = PassThrough;
|
||||
std::string cde_elem_op = Bilinear;
|
||||
std::string prologue = "";
|
||||
std::string epilogue = "";
|
||||
std::string gemm_specialization = "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
// tuning parameters
|
||||
operation::TileDesc tile_desc{};
|
||||
operation::BlockTransferDesc a_block_transfer{};
|
||||
operation::BlockTransferDesc b_block_transfer{};
|
||||
operation::CShuffleDesc cshuffle{};
|
||||
operation::CBlockTransferDesc c_block_transfer{};
|
||||
LoopScheduler loop_scheduler{};
|
||||
PipelineVersion pipeline_version{};
|
||||
|
||||
// functions to update fusion operators if provided
|
||||
void update_prologue(const std::string& prologue);
|
||||
void update_epilogue(const std::string& epilogue);
|
||||
/**constexpr**/ bool IsSupported(std::size_t MRaw_, std::size_t NRaw_, std::size_t KRaw_);
|
||||
// returns a templated instance
|
||||
Solution ToSolution() const;
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
46
codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
Normal file
46
codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "ck/host/types.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
// defines the problem specification for a GEMM operation
|
||||
struct Problem
|
||||
{
|
||||
// dimensions for GEMM operation
|
||||
std::size_t M = 0;
|
||||
std::size_t N = 0;
|
||||
std::size_t K = 0;
|
||||
// layouts for tensors
|
||||
bool TransA = false;
|
||||
bool TransB = false;
|
||||
bool TransE = false;
|
||||
std::vector<bool> DsTrans = {};
|
||||
DataType ADataType = DataType::Half;
|
||||
DataType BDataType = DataType::Half;
|
||||
DataType EDataType = DataType::Half;
|
||||
std::vector<DataType> DsDataType = {};
|
||||
std::string AElementOp = PassThrough;
|
||||
std::string BElementOp = PassThrough;
|
||||
std::string CDEElementOp = PassThrough;
|
||||
|
||||
// returns the correct device op file for the operation
|
||||
std::string GetIncludeHeader() const;
|
||||
|
||||
// returns a list of instances based on the problem spec and provided fusion operations
|
||||
std::vector<Solution> GetSolutions(const std::string& arch,
|
||||
const std::string& prologue = "",
|
||||
const std::string& epilogue = "") const;
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,60 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "ck/host/types.hpp"
|
||||
#include "ck/host/operation/gemm.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace conv {
|
||||
|
||||
// defines the values needed for an instance of forward convolution and functions to return
|
||||
// (templated) instances
|
||||
struct Operation_Conv_Fwd_Xdl_Cshuffle
|
||||
{
|
||||
// returns a vector of instances given the fusion operations, uses default values for problem
|
||||
// spec
|
||||
static std::vector<Operation_Conv_Fwd_Xdl_Cshuffle>
|
||||
CreateOperations(const std::string& prologue, const std::string& epilogue);
|
||||
// returns a vector of instances, provided with a problem spec and fusion operations
|
||||
static std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> CreateOperations(
|
||||
const Problem_Conv_Fwd& prob, const std::string& prologue, const std::string& epilogue);
|
||||
std::size_t NumDim;
|
||||
TensorDesc A{};
|
||||
TensorDesc B{};
|
||||
DataType acc = DataType::Float;
|
||||
DataType cs_type = DataType::Half;
|
||||
std::vector<TensorDesc> Ds = {};
|
||||
TensorDesc E{};
|
||||
std::string a_elem_op = PassThrough;
|
||||
std::string b_elem_op = PassThrough;
|
||||
std::string cde_elem_op = PassThrough;
|
||||
std::string prologue = "";
|
||||
std::string epilogue = "";
|
||||
std::string conv_specialization =
|
||||
"ck::tensor_operation::device::ConvolutionForwardSpecialization::Default";
|
||||
std::string gemm_specialization =
|
||||
"ck::tensor_operation::device::GemmSpecialization::MNKPadding";
|
||||
// tuning parameters
|
||||
operation::TileDesc tile_desc{};
|
||||
operation::BlockTransferDesc a_block_transfer{};
|
||||
operation::BlockTransferDesc b_block_transfer{};
|
||||
operation::CShuffleDesc cshuffle{};
|
||||
operation::CBlockTransferDesc c_block_transfer{};
|
||||
|
||||
// functions to update fusion operations if they are provided
|
||||
void update_prologue(const std::string& prologue);
|
||||
void update_epilogue(const std::string& epilogue);
|
||||
// returns a templated instance
|
||||
Solution ToSolution() const;
|
||||
};
|
||||
|
||||
} // namespace conv
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,56 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include "ck/host/types.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace conv {
|
||||
|
||||
// defines the problem specification for a forward convolution operation
|
||||
struct Problem_Conv_Fwd
|
||||
{
|
||||
std::size_t NumDim = 0;
|
||||
// size of a forward convolution operation
|
||||
std::size_t G = 0;
|
||||
std::size_t N = 0;
|
||||
std::size_t C = 0;
|
||||
std::size_t Hi = 0;
|
||||
std::size_t Wi = 0;
|
||||
std::size_t Ho = 0;
|
||||
std::size_t Wo = 0;
|
||||
std::size_t K = 0;
|
||||
std::size_t Y = 0;
|
||||
std::size_t X = 0;
|
||||
Layout ALayout = Layout::NHWGC;
|
||||
Layout BLayout = Layout::GKYXC;
|
||||
Layout ELayout = Layout::NHWGK;
|
||||
std::vector<Layout> DsLayout = {};
|
||||
DataType ADataType = DataType::Half;
|
||||
DataType BDataType = DataType::Half;
|
||||
DataType EDataType = DataType::Half;
|
||||
std::vector<DataType> DsDataType = {};
|
||||
std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string CDEElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
|
||||
// returns the correct device op file for the operation
|
||||
std::string GetIncludeHeader() const;
|
||||
|
||||
// returns a list of instances based on the problem spec and provided fusion operations
|
||||
std::vector<Solution> GetSolutions(const std::string& arch,
|
||||
const std::string& prologue,
|
||||
const std::string& epilogue) const;
|
||||
};
|
||||
|
||||
} // namespace conv
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
17
codegen/include/ck/host/headers.hpp
Normal file
17
codegen/include/ck/host/headers.hpp
Normal file
@@ -0,0 +1,17 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
std::unordered_map<std::string_view, std::string_view> GetHeaders();
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
69
codegen/include/ck/host/operation/gemm.hpp
Normal file
69
codegen/include/ck/host/operation/gemm.hpp
Normal file
@@ -0,0 +1,69 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace operation {
|
||||
|
||||
struct TileDesc
|
||||
{
|
||||
int block_size = 0;
|
||||
int m_per_block = 0;
|
||||
int n_per_block = 0;
|
||||
int k_per_block = 0;
|
||||
int ak1 = 0;
|
||||
int bk1 = 0;
|
||||
int m_per_XDL = 0;
|
||||
int n_per_XDL = 0;
|
||||
int m_Xdl_per_wave = 0;
|
||||
int n_Xdl_per_wave = 0;
|
||||
int num_gemmk_prefetch_stage = 0;
|
||||
};
|
||||
|
||||
struct TileDescGemmGemm
|
||||
{
|
||||
int block_size = 0;
|
||||
int gemm01_m_per_block = 0;
|
||||
int gemm0_n_per_block = 0;
|
||||
int gemm0_k_per_block = 0;
|
||||
int gemm1_n_per_block = 0;
|
||||
int gemm1_k_per_block = 0;
|
||||
int ak1 = 0;
|
||||
int bk1 = 0;
|
||||
int b1k1 = 0;
|
||||
int m_per_XDL = 0;
|
||||
int n_per_XDL = 0;
|
||||
int gemm0_m_Xdl_per_wave = 0;
|
||||
int gemm0_n_Xdl_per_wave = 0;
|
||||
int gemm1_n_Xdl_per_wave = 0;
|
||||
int num_gemmk_prefetch_stage = 0;
|
||||
};
|
||||
|
||||
struct BlockTransferDesc
|
||||
{
|
||||
std::string thread_cluster_length = "";
|
||||
std::string thread_cluster_arrange_order = "";
|
||||
std::string src_access_order = "";
|
||||
int src_vec_dim = 0;
|
||||
int src_scalar_per_vector = 0;
|
||||
int dst_scalar_per_vector_k1 = 0;
|
||||
int lds_add_extra_dim = 0;
|
||||
};
|
||||
struct CShuffleDesc
|
||||
{
|
||||
int m_Xdl_per_wave_per_shuffle = 0;
|
||||
int n_Xdl_per_wave_per_shuffle = 0;
|
||||
};
|
||||
struct CBlockTransferDesc
|
||||
{
|
||||
std::string cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl = "";
|
||||
int scalar_per_vector_n_wave_n_per_Xdl = 0;
|
||||
};
|
||||
|
||||
} // namespace operation
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
104
codegen/include/ck/host/stringutils.hpp
Normal file
104
codegen/include/ck/host/stringutils.hpp
Normal file
@@ -0,0 +1,104 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
template <class F>
|
||||
std::string trim(const std::string& s, F f)
|
||||
{
|
||||
auto start = std::find_if_not(s.begin(), s.end(), f);
|
||||
auto last = std::find_if_not(s.rbegin(), std::string::const_reverse_iterator(start), f).base();
|
||||
return {start, last};
|
||||
}
|
||||
|
||||
inline std::string trim(const std::string& s)
|
||||
{
|
||||
return trim(s, [](unsigned char c) { return std::isspace(c); });
|
||||
}
|
||||
|
||||
template <class Strings>
|
||||
inline std::string JoinStrings(Strings strings, const std::string& delim)
|
||||
{
|
||||
auto it = strings.begin();
|
||||
if(it == strings.end())
|
||||
return "";
|
||||
|
||||
auto nit = std::next(it);
|
||||
return std::accumulate(nit, strings.end(), *it, [&](std::string x, std::string y) {
|
||||
return std::move(x) + delim + std::move(y);
|
||||
});
|
||||
}
|
||||
|
||||
template <class F>
|
||||
inline std::string
|
||||
InterpolateString(const std::string& input, F f, std::string start = "${", std::string end = "}")
|
||||
{
|
||||
std::string result = "";
|
||||
result.reserve(input.size());
|
||||
auto it = input.begin();
|
||||
while(it != input.end())
|
||||
{
|
||||
auto next_start = std::search(it, input.end(), start.begin(), start.end());
|
||||
auto next_end = std::search(next_start, input.end(), end.begin(), end.end());
|
||||
result.append(it, next_start);
|
||||
if(next_start == input.end())
|
||||
break;
|
||||
if(next_end == input.end())
|
||||
{
|
||||
throw std::runtime_error("Unbalanced brackets");
|
||||
}
|
||||
auto r = f(next_start + start.size(), next_end);
|
||||
result.append(r.begin(), r.end());
|
||||
it = next_end + end.size();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
inline std::string InterpolateString(const std::string& input,
|
||||
const std::unordered_map<std::string, std::string>& vars,
|
||||
std::string start = "${",
|
||||
std::string end = "}")
|
||||
{
|
||||
return InterpolateString(
|
||||
input,
|
||||
[&](auto start_it, auto last_it) {
|
||||
auto key = trim({start_it, last_it});
|
||||
auto it = vars.find(key);
|
||||
if(it == vars.end())
|
||||
throw std::runtime_error("Unknown key: " + key);
|
||||
return it->second;
|
||||
},
|
||||
std::move(start),
|
||||
std::move(end));
|
||||
}
|
||||
|
||||
template <class Range, class F>
|
||||
inline auto Transform(const Range& r, F f) -> std::vector<decltype(f(*r.begin()))>
|
||||
{
|
||||
std::vector<decltype(f(*r.begin()))> result;
|
||||
std::transform(r.begin(), r.end(), std::back_inserter(result), f);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class Range1, class Range2, class F>
|
||||
inline auto Transform(const Range1& r1, const Range2& r2, F f)
|
||||
-> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
|
||||
{
|
||||
std::vector<decltype(f(*r1.begin(), *r2.begin()))> result;
|
||||
assert(std::distance(r1.begin(), r1.end()) == std::distance(r2.begin(), r2.end()));
|
||||
std::transform(r1.begin(), r1.end(), r2.begin(), std::back_inserter(result), f);
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
104
codegen/include/ck/host/types.hpp
Normal file
104
codegen/include/ck/host/types.hpp
Normal file
@@ -0,0 +1,104 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
// holds the templated instance, substitues values into template from instancess
|
||||
struct Solution
|
||||
{
|
||||
|
||||
Solution() = default;
|
||||
Solution(std::string str, std::unordered_map<std::string, std::string> values);
|
||||
std::string ToTemplateString() const;
|
||||
std::string GetTemplateParameter(const std::string& name) const;
|
||||
template <class T>
|
||||
T GetTemplateParameter(const std::string& name) const
|
||||
{
|
||||
T result;
|
||||
std::stringstream ss(GetTemplateParameter(name));
|
||||
ss >> result;
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string template_str;
|
||||
std::unordered_map<std::string, std::string> template_values;
|
||||
};
|
||||
|
||||
// supported data types
|
||||
enum class DataType
|
||||
{
|
||||
Half,
|
||||
Float,
|
||||
Int8,
|
||||
Int32
|
||||
};
|
||||
std::string ToString(DataType dt);
|
||||
|
||||
// supported layouts: gemm and fwd conv
|
||||
enum class Layout
|
||||
{
|
||||
Row,
|
||||
Column,
|
||||
GKYXC,
|
||||
GKCYX,
|
||||
GNHWK,
|
||||
GNHWC,
|
||||
NHWGC,
|
||||
NHWGK
|
||||
};
|
||||
std::string ToString(Layout dl);
|
||||
Layout ToLayout(bool Trans); // returns the layout for gemm
|
||||
|
||||
// supported GEMM types
|
||||
enum class GemmType
|
||||
{
|
||||
Default
|
||||
};
|
||||
std::string ToString(GemmType gt);
|
||||
|
||||
enum class LoopScheduler
|
||||
{
|
||||
Default,
|
||||
Interwave,
|
||||
};
|
||||
std::string ToString(LoopScheduler ls);
|
||||
|
||||
enum class PipelineVersion
|
||||
{
|
||||
v1,
|
||||
v2
|
||||
};
|
||||
std::string ToString(PipelineVersion pv);
|
||||
|
||||
struct TensorDesc
|
||||
{
|
||||
DataType element;
|
||||
Layout layout;
|
||||
};
|
||||
|
||||
std::string SequenceStr(const std::vector<int>& v);
|
||||
|
||||
std::string MakeTuple(const std::vector<std::string>& v);
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wglobal-constructors"
|
||||
template <int... xs>
|
||||
const std::string S = SequenceStr({xs...});
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough";
|
||||
constexpr const char* Bilinear = "ck::tensor_operation::element_wise::Bilinear";
|
||||
constexpr const char* Scale = "ck::tensor_operation::element_wise::Scale";
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
18
codegen/include/ck/host/utils.hpp
Normal file
18
codegen/include/ck/host/utils.hpp
Normal file
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <unordered_set>
|
||||
#include <numeric>
|
||||
#include <iterator>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
std::size_t integer_divide_ceil(std::size_t x, std::size_t y);
|
||||
|
||||
const std::unordered_set<std::string>& get_xdlop_archs();
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
38
codegen/src/device_batched_gemm_softmax_gemm.cpp
Normal file
38
codegen/src/device_batched_gemm_softmax_gemm.cpp
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_batched_gemm_softmax_gemm/problem.hpp"
|
||||
#include "ck/host/device_batched_gemm_softmax_gemm/operation.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include <algorithm>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_batched_gemm_softmax_gemm {
|
||||
|
||||
// return the relevant device op file based on the operation
|
||||
std::string Problem::GetIncludeHeader() const
|
||||
{
|
||||
return "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp";
|
||||
}
|
||||
|
||||
// returns templated instances when provided with a problem specification
|
||||
std::vector<Solution> Problem::GetSolutions(const std::string& arch,
|
||||
const std::string& prologue,
|
||||
const std::string& epilogue) const
|
||||
{
|
||||
if(get_xdlop_archs().count(arch) == 0)
|
||||
return {};
|
||||
auto ops = ck::host::device_batched_gemm_softmax_gemm::Operation_Xdl_CShuffle::CreateOperations(
|
||||
*this, prologue, epilogue); // obtains vector of instances
|
||||
std::vector<Solution> result;
|
||||
std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) {
|
||||
return op.ToSolution(); // template instance with correct values
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace device_batched_gemm_softmax_gemm
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,412 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_batched_gemm_softmax_gemm/operation.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include <cassert>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_batched_gemm_softmax_gemm {
|
||||
|
||||
// calculate appropriate Gemm Specification based on input tensor dimensions
|
||||
std::string GetGemmSpec(const std::size_t m,
|
||||
const std::size_t n,
|
||||
const std::size_t k,
|
||||
const std::size_t n1,
|
||||
const std::size_t m_per_block,
|
||||
const std::size_t n_per_block,
|
||||
const std::size_t k_per_block,
|
||||
const std::size_t n1_per_block)
|
||||
{
|
||||
std::string spec = "";
|
||||
if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
|
||||
spec += "M";
|
||||
if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
|
||||
spec += "N";
|
||||
if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
|
||||
spec += "K";
|
||||
if(integer_divide_ceil(n1, n1_per_block) * n1_per_block - n1 != 0)
|
||||
spec += "O";
|
||||
if(spec == "")
|
||||
return "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
|
||||
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
|
||||
}
|
||||
|
||||
// function to update prologue/epilogue with user provided operation
|
||||
void Operation_Xdl_CShuffle::update_prologue(const std::string& pro)
|
||||
{
|
||||
if(!prologue.empty())
|
||||
{
|
||||
this->prologue = pro;
|
||||
}
|
||||
else
|
||||
{
|
||||
this->prologue = "";
|
||||
}
|
||||
}
|
||||
|
||||
void Operation_Xdl_CShuffle::update_epilogue(const std::string& epi)
|
||||
{
|
||||
if(!epilogue.empty())
|
||||
{
|
||||
this->epilogue = epi;
|
||||
}
|
||||
else
|
||||
{
|
||||
this->epilogue = "";
|
||||
}
|
||||
}
|
||||
|
||||
// accounts for all possible combinations of Row/Col major
|
||||
static Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; }
|
||||
|
||||
// Hard-code tuning parameters in modularized fashion, string them together into a vector of
// instances.
// NOTE(review): the five tables below are index-aligned - row i of every table
// describes the same kernel instance, and the asserts after the tables enforce
// equal lengths. Keep the "Padded fallback kernel" and "Irregular k" groups at
// the same row positions when editing any one table.
std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
    const Problem& prob, const std::string& prologue, const std::string& epilogue)
{
    std::vector<Operation_Xdl_CShuffle> result;

    std::vector<operation::TileDescGemmGemm> tile_descriptions = {
        // clang-format off
        // Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| NumGemmK|
        //  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl| Prefetch|
        //      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per|    Stage|
        //      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|         |
        { 256, 256, 128, 32, 64, 32, 8, 8, 2, 32, 32, 2, 4, 2, 1},
        { 256, 256, 128, 32, 128, 32, 8, 8, 2, 32, 32, 2, 4, 4, 1},
        { 256, 128, 256, 32, 64, 32, 8, 8, 2, 32, 32, 1, 8, 2, 1},
        { 256, 128, 256, 32, 128, 32, 8, 8, 2, 32, 32, 1, 8, 4, 1},
        { 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, 1},
        { 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, 1},
        { 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, 1},
        { 256, 128, 128, 32, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, 1},
        { 256, 64, 256, 32, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, 1},
        { 256, 64, 256, 32, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, 1},
        { 256, 64, 256, 64, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, 1},
        { 256, 64, 256, 64, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, 1},
        // Padded fallback kernel
        { 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, 1},
        { 256, 128, 64, 32, 128, 32, 8, 8, 2, 32, 32, 1, 2, 4, 1},
        // Irregular k
        { 256, 256, 128, 40, 64, 32, 4, 4, 2, 32, 32, 2, 4, 2, 1},
        { 256, 256, 128, 40, 128, 32, 4, 4, 2, 32, 32, 2, 4, 4, 1},
        { 256, 128, 256, 40, 64, 32, 4, 4, 2, 32, 32, 1, 8, 2, 1},
        { 256, 128, 256, 40, 128, 32, 4, 4, 2, 32, 32, 1, 8, 4, 1},
        { 256, 128, 128, 40, 64, 32, 4, 4, 2, 32, 32, 1, 4, 2, 1},
        { 256, 128, 128, 40, 128, 32, 4, 4, 2, 32, 32, 1, 4, 4, 1},
        // clang-format on
    };

    const std::vector<operation::BlockTransferDesc> a_block_descriptions = {
        // clang-format off
        // ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
        //   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|
        // Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |
        //                |               |               |               |               |               |          |
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        { S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        // Padded fallback kernel
        { S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true},
        // Irregular k
        { S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false},
        { S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false},
        { S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false},
        { S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false},
        { S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false},
        { S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false},
        // clang-format on
    };

    const std::vector<operation::BlockTransferDesc> b1_block_descriptions = {
        // clang-format off
        // B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|
        //    ThreadCluster|   ThreadCluster| SrcAccessOrder|     SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|
        //  Lengths_K0_N_K1|    ArrangeOrder|               |                |       PerVector|    PerVector_K1|           |
        //                 |                |               |                |                |                |           |
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        // Padded fallback kernel
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        // Irregular k
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        { S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false},
        // clang-format on
    };

    std::vector<operation::CShuffleDesc> cshuffle_descriptions = {
        // clang-format off
        //    CShuffle|    CShuffle|
        // MXdlPerWave| NXdlPerWave|
        //  PerShuffle|  PerShuffle|
        //            |            |
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 8},
        { 1, 4},
        { 1, 8},
        { 1, 4},
        // Padded fallback kernel
        { 1, 2},
        { 1, 2},
        // Irregular k
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        { 1, 2},
        // clang-format on
    };

    std::vector<operation::CBlockTransferDesc> c_block_descriptions = {
        // clang-format off
        // CBlockTransferClusterLengths|  CBlockTransfer
        //         _MBlock_MWaveMPerXdl| ScalarPerVector
        //         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl
        //                             |
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 16, 1,16>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 16, 1,16>, 8},
        { S<1, 32, 1, 8>, 8},
        // Padded fallback kernel
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        // Irregular k
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        // clang-format on
    };

    // All tables must describe the same number of instances.
    assert(tile_descriptions.size() == a_block_descriptions.size());
    assert(tile_descriptions.size() == b1_block_descriptions.size());
    assert(tile_descriptions.size() == cshuffle_descriptions.size());
    assert(tile_descriptions.size() == c_block_descriptions.size());

    // Put all values together into a single operation > store into the result vector
    for(std::size_t i = 0; i < tile_descriptions.size(); i++)
    {
        Operation_Xdl_CShuffle x;
        x.tile_desc         = tile_descriptions[i];
        x.a_block_transfer  = a_block_descriptions[i];
        x.b0_block_transfer = a_block_descriptions[i]; // b0 same as a
        x.b1_block_transfer = b1_block_descriptions[i];
        x.cshuffle          = cshuffle_descriptions[i];
        x.c_block_transfer  = c_block_descriptions[i];
        // Derive layout from the transpose flags of the problem.
        x.A  = TensorDesc{prob.ADataType, ToLayout(prob.TransA)};
        x.B  = TensorDesc{prob.BDataType, ToLayout(prob.TransB)};
        x.B1 = TensorDesc{prob.B1DataType, ToLayout(prob.TransB1)};
        x.C  = TensorDesc{prob.CDataType, ToLayout(prob.TransC)};
        x.a_elem_op   = prob.AElementOp;
        x.b_elem_op   = prob.BElementOp;
        x.b1_elem_op  = prob.B1ElementOp;
        x.c_elem_op   = prob.CElementOp;
        x.acc_elem_op = prob.AccElementOp;
        // Padding spec is derived from the problem dims vs this row's tile sizes.
        x.gemm_specialization = GetGemmSpec(prob.M,
                                            prob.N,
                                            prob.K,
                                            prob.O,
                                            x.tile_desc.gemm01_m_per_block,
                                            x.tile_desc.gemm0_n_per_block,
                                            x.tile_desc.gemm0_k_per_block,
                                            x.tile_desc.gemm1_n_per_block);
        x.update_prologue(prologue);
        x.update_epilogue(epilogue);
        x.mask_out_upper_triangle = prob.MaskOutUpperTriangle;
        result.push_back(x);
    }
    return result;
}
|
||||
|
||||
// set up instances when not provided with a problem specification, use default operation values and
|
||||
// all possible layout combinations
|
||||
std::vector<std::vector<Operation_Xdl_CShuffle>>
|
||||
Operation_Xdl_CShuffle::CreateOperations(const std::string& prologue, const std::string& epilogue)
|
||||
{
|
||||
std::vector<Problem> problems;
|
||||
|
||||
Problem prob;
|
||||
prob.TransA = false;
|
||||
prob.TransB = true;
|
||||
prob.TransB1 = false;
|
||||
prob.TransC = false;
|
||||
problems.push_back(prob);
|
||||
|
||||
prob.MaskOutUpperTriangle = true;
|
||||
problems.push_back(prob);
|
||||
|
||||
return Transform(problems,
|
||||
[&](const Problem& p) { return CreateOperations(p, prologue, epilogue); });
|
||||
}
|
||||
|
||||
// Interpolation template for one DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
// instance; each ${...} placeholder is replaced via InterpolateString() with
// the matching entry of the map built in ToSolution().
static const char* const DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffleTemplate =
    "ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<${LayoutA}, "
    "${LayoutB0}, ${LayoutB1}, ${LayoutC}, ${ADataType}, ${B0DataType}, ${B1DataType}, "
    "${CDataType}, ${AccDataType}, ${CShuffleDataType}, ${AElementwiseOperation}, "
    "${B0ElementwiseOperation}, ${Acc0ElementwiseOperation}, ${B1ElementwiseOperation}, "
    "${CElementwiseOperation}, ${GemmSpecialization}, ${NumGemmkPrefetchStage}, ${BlockSize}, "
    "${Gemm01MPerBlock}, ${Gemm0NPerBlock}, ${Gemm0KPerBlock}, ${Gemm1NPerBlock}, "
    "${Gemm1KPerBlock}, ${AK1}, ${BK1}, ${B1K1}, ${MPerXDL}, ${NPerXDL}, ${Gemm0MXdlPerWave}, "
    "${Gemm0NXdlPerWave}, ${Gemm1NXdlPerWave}, ${ABlockTransferThreadClusterLengths_AK0_M_AK1}, "
    "${ABlockTransferThreadClusterArrangeOrder}, ${ABlockTransferSrcAccessOrder}, "
    "${ABlockTransferSrcVectorDim}, ${ABlockTransferSrcScalarPerVector}, "
    "${ABlockTransferDstScalarPerVector_AK1}, ${ABlockLdsExtraM}, "
    "${B0BlockTransferThreadClusterLengths_BK0_N_BK1}, "
    "${B0BlockTransferThreadClusterArrangeOrder}, ${B0BlockTransferSrcAccessOrder}, "
    "${B0BlockTransferSrcVectorDim}, ${B0BlockTransferSrcScalarPerVector}, "
    "${B0BlockTransferDstScalarPerVector_BK1}, ${B0BlockLdsExtraN}, "
    "${B1BlockTransferThreadClusterLengths_BK0_N_BK1}, "
    "${B1BlockTransferThreadClusterArrangeOrder}, ${B1BlockTransferSrcAccessOrder}, "
    "${B1BlockTransferSrcVectorDim}, ${B1BlockTransferSrcScalarPerVector}, "
    "${B1BlockTransferDstScalarPerVector_BK1}, ${B1BlockLdsExtraN}, "
    "${CShuffleMXdlPerWavePerShuffle}, ${CShuffleNXdlPerWavePerShuffle}, "
    "${CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl}, "
    "${CBlockTransferScalarPerVector_NWaveNPerXdl}, ${MaskOutUpperTriangle}>";
|
||||
|
||||
// use hardcoded instances from vector of operations to substitute values into instance template.
// Every key below must correspond to a ${...} placeholder in
// DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffleTemplate; "name" is an extra
// human-readable identifier built from the tile parameters.
Solution Operation_Xdl_CShuffle::ToSolution() const
{
    std::unordered_map<std::string, std::string> values = {
        {"name",
         std::to_string(this->tile_desc.block_size) + "_" +
             std::to_string(this->tile_desc.gemm01_m_per_block) + "_" +
             std::to_string(this->tile_desc.gemm0_n_per_block) + "_" +
             std::to_string(this->tile_desc.gemm0_k_per_block) + "_" +
             std::to_string(this->tile_desc.gemm1_n_per_block) + "_" +
             std::to_string(this->tile_desc.gemm1_k_per_block) + "_" +
             std::to_string(this->tile_desc.ak1) + "_" + std::to_string(this->tile_desc.bk1) + "_" +
             std::to_string(this->tile_desc.b1k1) + "_" +
             std::to_string(this->tile_desc.m_per_XDL) + "_" +
             std::to_string(this->tile_desc.n_per_XDL) + "_" +
             std::to_string(this->tile_desc.gemm0_m_Xdl_per_wave) + "_" +
             std::to_string(this->tile_desc.gemm0_n_Xdl_per_wave) + "_" +
             std::to_string(this->tile_desc.gemm1_n_Xdl_per_wave)},
        {"LayoutA", ToString(this->A.layout)},
        {"LayoutB0", ToString(this->B.layout)},
        {"LayoutB1", ToString(this->B1.layout)},
        {"LayoutC", ToString(this->C.layout)},
        {"ADataType", ToString(this->A.element)},
        {"B0DataType", ToString(this->B.element)},
        {"B1DataType", ToString(this->B1.element)},
        {"CDataType", ToString(this->C.element)},
        {"AccDataType", ToString(this->acc)},
        {"CShuffleDataType", ToString(this->cs_type)},
        {"AElementwiseOperation", this->a_elem_op},
        {"B0ElementwiseOperation", this->b_elem_op},
        {"Acc0ElementwiseOperation", this->acc_elem_op},
        {"B1ElementwiseOperation", this->b1_elem_op},
        {"CElementwiseOperation", this->c_elem_op},
        {"GemmSpecialization", this->gemm_specialization},
        {"NumGemmkPrefetchStage", std::to_string(this->tile_desc.num_gemmk_prefetch_stage)},
        {"BlockSize", std::to_string(this->tile_desc.block_size)},
        {"Gemm01MPerBlock", std::to_string(this->tile_desc.gemm01_m_per_block)},
        {"Gemm0NPerBlock", std::to_string(this->tile_desc.gemm0_n_per_block)},
        {"Gemm0KPerBlock", std::to_string(this->tile_desc.gemm0_k_per_block)},
        {"Gemm1NPerBlock", std::to_string(this->tile_desc.gemm1_n_per_block)},
        {"Gemm1KPerBlock", std::to_string(this->tile_desc.gemm1_k_per_block)},
        {"AK1", std::to_string(this->tile_desc.ak1)},
        {"BK1", std::to_string(this->tile_desc.bk1)},
        {"B1K1", std::to_string(this->tile_desc.b1k1)},
        {"MPerXDL", std::to_string(this->tile_desc.m_per_XDL)},
        {"NPerXDL", std::to_string(this->tile_desc.n_per_XDL)},
        {"Gemm0MXdlPerWave", std::to_string(this->tile_desc.gemm0_m_Xdl_per_wave)},
        {"Gemm0NXdlPerWave", std::to_string(this->tile_desc.gemm0_n_Xdl_per_wave)},
        {"Gemm1NXdlPerWave", std::to_string(this->tile_desc.gemm1_n_Xdl_per_wave)},
        {"ABlockTransferThreadClusterLengths_AK0_M_AK1",
         this->a_block_transfer.thread_cluster_length},
        {"ABlockTransferThreadClusterArrangeOrder",
         this->a_block_transfer.thread_cluster_arrange_order},
        {"ABlockTransferSrcAccessOrder", this->a_block_transfer.src_access_order},
        {"ABlockTransferSrcVectorDim", std::to_string(this->a_block_transfer.src_vec_dim)},
        {"ABlockTransferSrcScalarPerVector",
         std::to_string(this->a_block_transfer.src_scalar_per_vector)},
        {"ABlockTransferDstScalarPerVector_AK1",
         std::to_string(this->a_block_transfer.dst_scalar_per_vector_k1)},
        {"ABlockLdsExtraM", std::to_string(this->a_block_transfer.lds_add_extra_dim)},
        {"B0BlockTransferThreadClusterLengths_BK0_N_BK1",
         this->b0_block_transfer.thread_cluster_length},
        {"B0BlockTransferThreadClusterArrangeOrder",
         this->b0_block_transfer.thread_cluster_arrange_order},
        {"B0BlockTransferSrcAccessOrder", this->b0_block_transfer.src_access_order},
        {"B0BlockTransferSrcVectorDim", std::to_string(this->b0_block_transfer.src_vec_dim)},
        {"B0BlockTransferSrcScalarPerVector",
         std::to_string(this->b0_block_transfer.src_scalar_per_vector)},
        {"B0BlockTransferDstScalarPerVector_BK1",
         std::to_string(this->b0_block_transfer.dst_scalar_per_vector_k1)},
        {"B0BlockLdsExtraN", std::to_string(this->b0_block_transfer.lds_add_extra_dim)},
        {"B1BlockTransferThreadClusterLengths_BK0_N_BK1",
         this->b1_block_transfer.thread_cluster_length},
        {"B1BlockTransferThreadClusterArrangeOrder",
         this->b1_block_transfer.thread_cluster_arrange_order},
        {"B1BlockTransferSrcAccessOrder", this->b1_block_transfer.src_access_order},
        {"B1BlockTransferSrcVectorDim", std::to_string(this->b1_block_transfer.src_vec_dim)},
        {"B1BlockTransferSrcScalarPerVector",
         std::to_string(this->b1_block_transfer.src_scalar_per_vector)},
        {"B1BlockTransferDstScalarPerVector_BK1",
         std::to_string(this->b1_block_transfer.dst_scalar_per_vector_k1)},
        {"B1BlockLdsExtraN", std::to_string(this->b1_block_transfer.lds_add_extra_dim)},
        {"CShuffleMXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.m_Xdl_per_wave_per_shuffle)},
        {"CShuffleNXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.n_Xdl_per_wave_per_shuffle)},
        {"CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl",
         this->c_block_transfer.cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl},
        {"CBlockTransferScalarPerVector_NWaveNPerXdl",
         std::to_string(this->c_block_transfer.scalar_per_vector_n_wave_n_per_Xdl)},
        {"MaskOutUpperTriangle", std::to_string(this->mask_out_upper_triangle)},
    };

    return Solution{InterpolateString(DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffleTemplate, values),
                    std::move(values)};
}
|
||||
|
||||
} // namespace device_batched_gemm_softmax_gemm
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
38
codegen/src/device_gemm_multiple_d.cpp
Normal file
38
codegen/src/device_gemm_multiple_d.cpp
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_gemm_multiple_d/problem.hpp"
|
||||
#include "ck/host/device_gemm_multiple_d/operation.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include <algorithm>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
// return the relevant device op file based on the operation
|
||||
std::string Problem::GetIncludeHeader() const
|
||||
{
|
||||
return "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp";
|
||||
}
|
||||
|
||||
// returns templated instances when provided with a problem specification
|
||||
std::vector<Solution> Problem::GetSolutions(const std::string& arch,
|
||||
const std::string& prologue,
|
||||
const std::string& epilogue) const
|
||||
{
|
||||
if(get_xdlop_archs().count(arch) == 0)
|
||||
return {};
|
||||
auto ops = ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle::CreateOperations(
|
||||
*this, prologue, epilogue); // obtains vector of instances
|
||||
std::vector<Solution> result;
|
||||
std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) {
|
||||
return op.ToSolution(); // template instance with correct values
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
379
codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
Normal file
379
codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
Normal file
@@ -0,0 +1,379 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_gemm_multiple_d/operation.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/types.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include <cassert>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
// calculate appropriate Gemm Specification based on input tensor dimensions
|
||||
static std::string GetGemmSpec(const std::size_t m,
|
||||
const std::size_t n,
|
||||
const std::size_t k,
|
||||
const std::size_t m_per_block,
|
||||
const std::size_t n_per_block,
|
||||
const std::size_t k_per_block)
|
||||
{
|
||||
std::string spec = "";
|
||||
if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
|
||||
spec += "M";
|
||||
if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
|
||||
spec += "N";
|
||||
if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
|
||||
spec += "K";
|
||||
if(spec == "")
|
||||
return "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
|
||||
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
|
||||
}
|
||||
|
||||
// function to update prologue/epilogue with user provided operation
|
||||
void Operation_Xdl_CShuffle::update_prologue(const std::string& pro)
|
||||
{
|
||||
if(!pro.empty())
|
||||
{
|
||||
this->prologue = pro;
|
||||
this->cde_elem_op = "CDEElementOp";
|
||||
}
|
||||
else
|
||||
{
|
||||
this->prologue = "";
|
||||
}
|
||||
}
|
||||
|
||||
void Operation_Xdl_CShuffle::update_epilogue(const std::string& epi)
|
||||
{
|
||||
if(!epi.empty())
|
||||
{
|
||||
this->epilogue = epi;
|
||||
this->cde_elem_op = "CDEElementOp";
|
||||
}
|
||||
else
|
||||
{
|
||||
this->epilogue = "";
|
||||
}
|
||||
}
|
||||
|
||||
// accounts for all possible combinations of Row/Col major
|
||||
static Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; }
|
||||
|
||||
// clang-format off
|
||||
// DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1,
|
||||
|
||||
// DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1>
|
||||
// clang-format on
|
||||
|
||||
// Hard-code tuning parameters in modularized fashion, string them together into a vector of
|
||||
// instances
|
||||
std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
|
||||
const Problem& prob, const std::string& prologue, const std::string& epilogue)
|
||||
{
|
||||
std::vector<Operation_Xdl_CShuffle> result;
|
||||
|
||||
std::vector<operation::TileDesc> tile_descriptions = {
|
||||
// clang-format off
|
||||
// Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| NumGemmK|
|
||||
// Size| Block| Block| Block| | | XDL| XDL| Per| Per| Prefetch|
|
||||
// | | | | | | | | Wave| Wave| Stage|
|
||||
// | | | | | | | | | | |
|
||||
{ 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 1},
|
||||
{ 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 1},
|
||||
{ 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 1},
|
||||
{ 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 1},
|
||||
{ 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, 1},
|
||||
{ 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 1},
|
||||
{ 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 1},
|
||||
{ 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 1},
|
||||
// Irregular tile
|
||||
{ 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, 1},
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
std::vector<operation::BlockTransferDesc> a_block_descriptions_rowmajor = {
|
||||
// clang-format off
|
||||
// ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
|
||||
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM|
|
||||
// Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
|
||||
// | | | | | | |
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
// Irregular tile
|
||||
{ S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
std::vector<operation::BlockTransferDesc> a_block_descriptions_colmajor = {
|
||||
// clang-format off
|
||||
// ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
|
||||
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM|
|
||||
// Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
|
||||
// | | | | | | |
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
|
||||
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
|
||||
// Irregular tile
|
||||
{ S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
std::vector<operation::BlockTransferDesc> b_block_descriptions_rowmajor = {
|
||||
// clang-format off
|
||||
// BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
|
||||
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|
|
||||
// Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
|
||||
// | | | | | | |
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
|
||||
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
|
||||
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
|
||||
// Irregular tile
|
||||
{ S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
std::vector<operation::BlockTransferDesc> b_block_descriptions_colmajor = {
|
||||
// clang-format off
|
||||
// BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
|
||||
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|
|
||||
// Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
|
||||
// | | | | | | |
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
|
||||
// Irregular tile
|
||||
{ S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
std::vector<operation::CShuffleDesc> cshuffle_descriptions = {
|
||||
// clang-format off
|
||||
// CShuffle| CShuffle|
|
||||
// MXdlPerWave| NXdlPerWave|
|
||||
// PerShuffle| PerShuffle|
|
||||
// | |
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
{ 1, 1},
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
std::vector<operation::CBlockTransferDesc> c_block_descriptions = {
|
||||
// clang-format off
|
||||
// CBlockTransferClusterLengths| CBlockTransfer
|
||||
// _MBlock_MWaveMPerXdl| ScalarPerVector
|
||||
// _NBlock_NWaveNPerXdl| _NWaveNPerXdl
|
||||
// |
|
||||
{ S<1, 32, 1, 8>, 8},
|
||||
{ S<1, 32, 1, 8>, 8},
|
||||
{ S<1, 16, 1, 8>, 8},
|
||||
{ S<1, 32, 1, 8>, 8},
|
||||
{ S<1, 32, 1, 4>, 8},
|
||||
{ S<1, 16, 1, 8>, 8},
|
||||
{ S<1, 32, 1, 8>, 8},
|
||||
{ S<1, 32, 1, 8>, 8},
|
||||
// Irregular tile
|
||||
{ S<1, 16, 1, 4>, 1},
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
// choose correct arrangement of tuning parameters based on the layout of each tensor
|
||||
const auto a_block_descriptions =
|
||||
prob.TransA ? a_block_descriptions_colmajor : a_block_descriptions_rowmajor;
|
||||
const auto b_block_descriptions =
|
||||
prob.TransB ? b_block_descriptions_colmajor : b_block_descriptions_rowmajor;
|
||||
|
||||
assert(tile_descriptions.size() == a_block_descriptions.size());
|
||||
assert(tile_descriptions.size() == b_block_descriptions.size());
|
||||
assert(tile_descriptions.size() == cshuffle_descriptions.size());
|
||||
assert(tile_descriptions.size() == c_block_descriptions.size());
|
||||
|
||||
const std::vector<std::tuple<LoopScheduler, PipelineVersion>> scheduler_pipeline_descriptions =
|
||||
{
|
||||
{LoopScheduler::Default, PipelineVersion::v1},
|
||||
{LoopScheduler::Interwave, PipelineVersion::v1},
|
||||
{LoopScheduler::Default, PipelineVersion::v2},
|
||||
};
|
||||
for(auto [loop_scheduler, pipeline_version] : scheduler_pipeline_descriptions)
|
||||
{
|
||||
        // Put all values together into a single operation -> store into the result vector
|
||||
for(std::size_t i = 0; i < tile_descriptions.size(); i++)
|
||||
{
|
||||
Operation_Xdl_CShuffle x;
|
||||
x.tile_desc = tile_descriptions[i];
|
||||
x.a_block_transfer = a_block_descriptions[i];
|
||||
x.b_block_transfer = b_block_descriptions[i];
|
||||
x.cshuffle = cshuffle_descriptions[i];
|
||||
x.c_block_transfer = c_block_descriptions[i];
|
||||
x.A = TensorDesc{prob.ADataType, ToLayout(prob.TransA)};
|
||||
x.B = TensorDesc{prob.BDataType, ToLayout(prob.TransB)};
|
||||
x.E = TensorDesc{prob.EDataType, ToLayout(prob.TransE)};
|
||||
x.Ds = Transform(prob.DsTrans, prob.DsDataType, [](auto trans, auto dt) {
|
||||
return TensorDesc{dt, ToLayout(trans)};
|
||||
});
|
||||
x.a_elem_op = prob.AElementOp;
|
||||
x.b_elem_op = prob.BElementOp;
|
||||
x.cde_elem_op = prob.CDEElementOp;
|
||||
x.gemm_specialization = GetGemmSpec(prob.M,
|
||||
prob.N,
|
||||
prob.K,
|
||||
x.tile_desc.m_per_block,
|
||||
x.tile_desc.n_per_block,
|
||||
x.tile_desc.k_per_block);
|
||||
x.loop_scheduler = loop_scheduler;
|
||||
x.pipeline_version = pipeline_version;
|
||||
x.update_prologue(prologue);
|
||||
x.update_epilogue(epilogue);
|
||||
result.push_back(x);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// set up instances when not provided with a problem specification, use default operation values and
|
||||
// all possible layout combinations
|
||||
std::vector<std::vector<Operation_Xdl_CShuffle>>
|
||||
Operation_Xdl_CShuffle::CreateOperations(const std::string& prologue, const std::string& epilogue)
|
||||
{
|
||||
std::vector<Problem> problems;
|
||||
for(bool TransA : {true, false})
|
||||
for(bool TransB : {true, false})
|
||||
{
|
||||
Problem prob;
|
||||
prob.TransA = TransA;
|
||||
prob.TransB = TransB;
|
||||
problems.push_back(prob);
|
||||
}
|
||||
return Transform(problems,
|
||||
[&](const Problem& p) { return CreateOperations(p, prologue, epilogue); });
|
||||
}
|
||||
|
||||
// Instantiation template for ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle.
// Every ${...} placeholder is substituted by InterpolateString() with the values assembled
// in Operation_Xdl_CShuffle::ToSolution().
static const char* const DeviceGemmMultipleD_Xdl_CShuffleTemplate =
    "ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<${LayoutA}, ${LayoutB}, "
    "${LayoutDs}, ${LayoutE}, ${ADataType}, ${BDataType}, ${AccDataType}, ${CShuffleDataType}, "
    "${DsDataType}, ${EDataType}, ${AElementwiseOperation}, ${BElementwiseOperation}, "
    "${CDEElementwiseOperation}, ${GemmSpecialization}, ${NumGemmkPrefetchStage}, ${BlockSize}, "
    "${MPerBlock}, ${NPerBlock}, ${KPerBlock}, ${AK1}, ${BK1}, ${MPerXDL}, ${NPerXDL}, "
    "${MXdlPerWave}, ${NXdlPerWave}, ${ABlockTransferThreadClusterLengths_AK0_M_AK1}, "
    "${ABlockTransferThreadClusterArrangeOrder}, ${ABlockTransferSrcAccessOrder}, "
    "${ABlockTransferSrcVectorDim}, ${ABlockTransferSrcScalarPerVector}, "
    "${ABlockTransferDstScalarPerVector_AK1}, ${ABlockLdsExtraM}, "
    "${BBlockTransferThreadClusterLengths_BK0_N_BK1}, ${BBlockTransferThreadClusterArrangeOrder}, "
    "${BBlockTransferSrcAccessOrder}, ${BBlockTransferSrcVectorDim}, "
    "${BBlockTransferSrcScalarPerVector}, ${BBlockTransferDstScalarPerVector_BK1}, "
    "${BBlockLdsExtraN}, ${CShuffleMXdlPerWavePerShuffle}, ${CShuffleNXdlPerWavePerShuffle}, "
    "${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, "
    "${CDEBlockTransferScalarPerVector_NPerBlock}, ${LoopScheduler}, ${PipelineVersion}>";
|
||||
|
||||
// use hardcoded instances from vector of operations to substitute values into instance template
|
||||
// Substitute this instance's hard-coded tuning values into
// DeviceGemmMultipleD_Xdl_CShuffleTemplate and return the resulting Solution.
Solution Operation_Xdl_CShuffle::ToSolution() const
{
    // Template-parameter name -> substitution value.
    std::unordered_map<std::string, std::string> values = {
        // Human-readable kernel identifier derived from the tile shape.
        {"name",
         std::to_string(this->tile_desc.block_size) + "_" +
             std::to_string(this->tile_desc.m_per_block) + "_" +
             std::to_string(this->tile_desc.n_per_block) + "_" +
             std::to_string(this->tile_desc.k_per_block) + "_" +
             std::to_string(this->tile_desc.ak1) + "_" + std::to_string(this->tile_desc.bk1) + "_" +
             std::to_string(this->tile_desc.m_per_XDL) + "_" +
             std::to_string(this->tile_desc.n_per_XDL) + "_" +
             std::to_string(this->tile_desc.m_Xdl_per_wave) + "_" +
             std::to_string(this->tile_desc.n_Xdl_per_wave)},
        // Tensor layouts and element types.
        {"LayoutA", ToString(this->A.layout)},
        {"LayoutB", ToString(this->B.layout)},
        {"LayoutDs",
         MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.layout); }))},
        {"LayoutE", ToString(this->E.layout)},
        {"ADataType", ToString(this->A.element)},
        {"BDataType", ToString(this->B.element)},
        {"AccDataType", ToString(this->acc)},
        {"CShuffleDataType", ToString(this->cs_type)},
        {"DsDataType",
         MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.element); }))},
        {"EDataType", ToString(this->E.element)},
        // Elementwise operations applied to A, B, and the C/D/E outputs.
        {"AElementwiseOperation", this->a_elem_op},
        {"BElementwiseOperation", this->b_elem_op},
        {"CDEElementwiseOperation", this->cde_elem_op},
        {"GemmSpecialization", this->gemm_specialization},
        // Tile / blocking parameters.
        {"NumGemmkPrefetchStage", std::to_string(this->tile_desc.num_gemmk_prefetch_stage)},
        {"BlockSize", std::to_string(this->tile_desc.block_size)},
        {"MPerBlock", std::to_string(this->tile_desc.m_per_block)},
        {"NPerBlock", std::to_string(this->tile_desc.n_per_block)},
        {"KPerBlock", std::to_string(this->tile_desc.k_per_block)},
        {"AK1", std::to_string(this->tile_desc.ak1)},
        {"BK1", std::to_string(this->tile_desc.bk1)},
        {"MPerXDL", std::to_string(this->tile_desc.m_per_XDL)},
        {"NPerXDL", std::to_string(this->tile_desc.n_per_XDL)},
        {"MXdlPerWave", std::to_string(this->tile_desc.m_Xdl_per_wave)},
        {"NXdlPerWave", std::to_string(this->tile_desc.n_Xdl_per_wave)},
        // A-tensor block transfer parameters.
        {"ABlockTransferThreadClusterLengths_AK0_M_AK1",
         this->a_block_transfer.thread_cluster_length},
        {"ABlockTransferThreadClusterArrangeOrder",
         this->a_block_transfer.thread_cluster_arrange_order},
        {"ABlockTransferSrcAccessOrder", this->a_block_transfer.src_access_order},
        {"ABlockTransferSrcVectorDim", std::to_string(this->a_block_transfer.src_vec_dim)},
        {"ABlockTransferSrcScalarPerVector",
         std::to_string(this->a_block_transfer.src_scalar_per_vector)},
        {"ABlockTransferDstScalarPerVector_AK1",
         std::to_string(this->a_block_transfer.dst_scalar_per_vector_k1)},
        {"ABlockLdsExtraM", std::to_string(this->a_block_transfer.lds_add_extra_dim)},
        // B-tensor block transfer parameters.
        {"BBlockTransferThreadClusterLengths_BK0_N_BK1",
         this->b_block_transfer.thread_cluster_length},
        {"BBlockTransferThreadClusterArrangeOrder",
         this->b_block_transfer.thread_cluster_arrange_order},
        {"BBlockTransferSrcAccessOrder", this->b_block_transfer.src_access_order},
        {"BBlockTransferSrcVectorDim", std::to_string(this->b_block_transfer.src_vec_dim)},
        {"BBlockTransferSrcScalarPerVector",
         std::to_string(this->b_block_transfer.src_scalar_per_vector)},
        {"BBlockTransferDstScalarPerVector_BK1",
         std::to_string(this->b_block_transfer.dst_scalar_per_vector_k1)},
        {"BBlockLdsExtraN", std::to_string(this->b_block_transfer.lds_add_extra_dim)},
        // C-shuffle / output block transfer parameters.
        {"CShuffleMXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.m_Xdl_per_wave_per_shuffle)},
        {"CShuffleNXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.n_Xdl_per_wave_per_shuffle)},
        {"CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock",
         this->c_block_transfer.cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl},
        {"CDEBlockTransferScalarPerVector_NPerBlock",
         std::to_string(this->c_block_transfer.scalar_per_vector_n_wave_n_per_Xdl)},
        // Scheduling configuration.
        {"LoopScheduler", ToString(this->loop_scheduler)},
        {"PipelineVersion", ToString(this->pipeline_version)},
    };

    return Solution{InterpolateString(DeviceGemmMultipleD_Xdl_CShuffleTemplate, values),
                    std::move(values)};
}
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
42
codegen/src/device_grouped_conv_fwd_multiple_abd.cpp
Normal file
42
codegen/src/device_grouped_conv_fwd_multiple_abd.cpp
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace conv {
|
||||
|
||||
// return the relevant device op file based on the operation
|
||||
// NOTE: this is a modified version of the original CK file that calls the kernel from a device
|
||||
// function and makes the Argument class accessible on the device
|
||||
std::string Problem_Conv_Fwd::GetIncludeHeader() const
|
||||
{
|
||||
return "ck/tensor_operation/gpu/device/impl/"
|
||||
"codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp";
|
||||
}
|
||||
|
||||
// return vector of forward convolution instances when provided with a problem instance
|
||||
std::vector<Solution> Problem_Conv_Fwd::GetSolutions(const std::string& arch,
|
||||
const std::string& prologue,
|
||||
const std::string& epilogue) const
|
||||
{
|
||||
if(get_xdlop_archs().count(arch) == 0)
|
||||
return {};
|
||||
auto ops = ck::host::conv::Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations(
|
||||
*this, prologue, epilogue);
|
||||
std::vector<Solution> result;
|
||||
std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) {
|
||||
return op.ToSolution();
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,352 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include <iostream>
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/types.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include <cassert>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace conv {
|
||||
|
||||
// NOTE: in CK, MNKPadding is always used for forward convolution, so didn't
|
||||
// add GemmSpec function here
|
||||
|
||||
// function to update prologue/epilogue with user provided operation
|
||||
void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& pro)
|
||||
{
|
||||
if(!pro.empty())
|
||||
{
|
||||
this->prologue = pro;
|
||||
this->cde_elem_op = "CDEElementOp";
|
||||
}
|
||||
else
|
||||
{
|
||||
this->prologue = "";
|
||||
}
|
||||
}
|
||||
|
||||
void Operation_Conv_Fwd_Xdl_Cshuffle::update_epilogue(const std::string& epi)
|
||||
{
|
||||
if(!epi.empty())
|
||||
{
|
||||
this->epilogue = epi;
|
||||
this->cde_elem_op = "CDEElementOp";
|
||||
}
|
||||
else
|
||||
{
|
||||
this->epilogue = "";
|
||||
}
|
||||
}
|
||||
|
||||
// Hard-code tuning parameters in modularized fashion, string them together into a vector of
|
||||
// instances
|
||||
// Build the full set of hard-coded forward-convolution instances for the given
// problem. Entry i of every tuning table below describes instance i; the tables
// must therefore stay index-aligned (checked by the asserts before the loop).
std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations(
    const Problem_Conv_Fwd& prob, const std::string& prologue, const std::string& epilogue)
{
    std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> result;

    // Thread-block and XDL tile shapes.
    std::vector<operation::TileDesc> tile_descriptions = {
        // clang-format off
        // Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| NumGemmK|
        //  Size| Block| Block| Block| | | XDL| XDL| Per| Per| Prefetch|
        //      | | | | | | | | Wave| Wave| Stage|
        //      | | | | | | | | | | |
        { 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 1},
        { 256, 128, 256, 32, 8, 8, 32, 32, 4, 2, 1},
        { 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 1},
        { 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, 1},
        { 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 1},
        { 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 1}
        // clang-format on
    };

    // Per-instance A-tensor LDS block-transfer configurations.
    std::vector<operation::BlockTransferDesc> a_block_descriptions = {
        // clang-format off
        // ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
        //  ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM|
        // Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
        //                | | | | | | |
        { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}
        // clang-format on
    };

    // Per-instance B-tensor LDS block-transfer configurations.
    std::vector<operation::BlockTransferDesc> b_block_descriptions = {
        // clang-format off
        // BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
        //  ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|
        // Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
        //                | | | | | | |
        { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1},
        { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
        { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}
        // clang-format on
    };

    // Per-instance C-shuffle repeat counts.
    std::vector<operation::CShuffleDesc> cshuffle_descriptions = {
        // clang-format off
        // CShuffle| CShuffle|
        // MXdlPerWave| NXdlPerWave|
        // PerShuffle| PerShuffle|
        //           | |
        { 1, 1},
        { 1, 1},
        { 1, 1},
        { 1, 1},
        { 1, 1},
        { 1, 1}
        // clang-format on
    };

    // Per-instance output (C/D/E) block-transfer configurations.
    std::vector<operation::CBlockTransferDesc> c_block_descriptions = {
        // clang-format off
        // CBlockTransferClusterLengths| CBlockTransfer
        // _MBlock_MWaveMPerXdl| ScalarPerVector
        // _NBlock_NWaveNPerXdl| _NWaveNPerXdl
        //                     |
        { S<1, 16, 1, 4>, 1},
        { S<1, 32, 1, 8>, 8},
        { S<1, 32, 1, 8>, 8},
        { S<1, 16, 1, 4>, 1},
        { S<1, 32, 1, 8>, 8},
        { S<1, 16, 1, 8>, 8}
        // clang-format on
    };

    // All tuning tables are indexed in lock-step; they must have equal length.
    assert(tile_descriptions.size() == a_block_descriptions.size());
    assert(tile_descriptions.size() == b_block_descriptions.size());
    assert(tile_descriptions.size() == cshuffle_descriptions.size());
    assert(tile_descriptions.size() == c_block_descriptions.size());

    // Put all values together into a single operation -> store into the result vector
    for(std::size_t i = 0; i < tile_descriptions.size(); i++)
    {
        Operation_Conv_Fwd_Xdl_Cshuffle x;
        x.NumDim           = prob.NumDim;
        x.tile_desc        = tile_descriptions[i];
        x.a_block_transfer = a_block_descriptions[i];
        x.b_block_transfer = b_block_descriptions[i];
        x.cshuffle         = cshuffle_descriptions[i];
        x.c_block_transfer = c_block_descriptions[i];
        // Tensor descriptors come straight from the problem specification.
        x.A  = TensorDesc{prob.ADataType, prob.ALayout};
        x.B  = TensorDesc{prob.BDataType, prob.BLayout};
        x.E  = TensorDesc{prob.EDataType, prob.ELayout};
        x.Ds = Transform(prob.DsLayout, prob.DsDataType, [](auto lo, auto dt) {
            return TensorDesc{dt, lo};
        });
        x.a_elem_op   = prob.AElementOp;
        x.b_elem_op   = prob.BElementOp;
        x.cde_elem_op = prob.CDEElementOp;
        // A non-empty prologue/epilogue overrides cde_elem_op (see update_prologue).
        x.update_prologue(prologue);
        x.update_epilogue(epilogue);
        result.push_back(x);
    }
    return result;
}
|
||||
|
||||
// set up instances when not provided with a problem specification, use default operation values
|
||||
std::vector<Operation_Conv_Fwd_Xdl_Cshuffle>
|
||||
Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations(const std::string& prologue,
|
||||
const std::string& epilogue)
|
||||
{
|
||||
Problem_Conv_Fwd prob;
|
||||
return CreateOperations(prob, prologue, epilogue);
|
||||
}
|
||||
|
||||
// Kernel-source template for the grouped forward-convolution device op. The ${...}
// placeholders are substituted by InterpolateString() with the values assembled in
// Operation_Conv_Fwd_Xdl_Cshuffle::ToSolution().
// Fix: the emitted diagnostic previously read "Arguement is not supported." (typo).
static const char* const CopyDevice_ConvTemplate =
    R"(
${Prologue}
${Epilogue}

using CDEElementOp = Epilogue;
using DeviceConv = ck::tensor_operation::device::CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<${NumDim}, ${LayoutA}, ${LayoutB}, ${LayoutDs}, ${LayoutE}, ${ADataType}, ${BDataType}, ${AccDataType}, ${CShuffleDataType}, ${DsDataType}, ${EDataType}, ${AElementwiseOperation}, ${BElementwiseOperation}, ${CDEElementwiseOperation}, ${ConvSpecialization}, ${GemmSpecialization}, ${NumGemmkPrefetchStage}, ${BlockSize}, ${MPerBlock}, ${NPerBlock}, ${KPerBlock}, ${AK1}, ${BK1}, ${MPerXDL}, ${NPerXDL}, ${MXdlPerWave}, ${NXdlPerWave}, ${ABlockTransferThreadClusterLengths_AK0_M_AK1}, ${ABlockTransferThreadClusterArrangeOrder}, ${ABlockTransferSrcAccessOrder}, ${ABlockTransferSrcVectorDim}, ${ABlockTransferSrcScalarPerVector}, ${ABlockTransferDstScalarPerVector_AK1}, ${ABlockLdsExtraM}, ${BBlockTransferThreadClusterLengths_BK0_N_BK1}, ${BBlockTransferThreadClusterArrangeOrder}, ${BBlockTransferSrcAccessOrder}, ${BBlockTransferSrcVectorDim}, ${BBlockTransferSrcScalarPerVector}, ${BBlockTransferDstScalarPerVector_BK1}, ${BBlockLdsExtraN}, ${CShuffleMXdlPerWavePerShuffle}, ${CShuffleNXdlPerWavePerShuffle}, ${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, ${CDEBlockTransferScalarPerVector_NPerBlock}>;

constexpr ck::index_t NumATensor = ck::tensor_operation::device::GetNumABTensors<false, ${ADataType}>();
constexpr ck::index_t NumBTensor = ck::tensor_operation::device::GetNumABTensors<false, ${BDataType}>();

extern "C" __global__ void run_${name}(
const ${ADataType}* in_dev,
const ${BDataType}* wei_dev,
${EDataType}* __restrict__ out_dev,
ck::Array<ck::index_t, ${NumDim} + 3> in_lengths,
ck::Array<ck::index_t, ${NumDim} + 3> in_strides,
ck::Array<ck::index_t, ${NumDim} + 3> wei_lengths,
ck::Array<ck::index_t, ${NumDim} + 3> wei_strides,
ck::Array<ck::index_t, ${NumDim} + 3> out_lengths,
ck::Array<ck::index_t, ${NumDim} + 3> out_strides,
ck::Array<ck::index_t, ${NumDim}> conv_filter_strides,
ck::Array<ck::index_t, ${NumDim}> conv_filter_dilations,
ck::Array<ck::index_t, ${NumDim}> input_left_pads,
ck::Array<ck::index_t, ${NumDim}> input_right_pads,
const ${AElementwiseOperation} a_element_op,
const ${BElementwiseOperation} b_element_op,
const ${CDEElementwiseOperation} cde_element_op
){


auto arg = DeviceConv::Argument(in_dev,
                                wei_dev,
                                ck::Array<const void*, 0>{},
                                out_dev,
                                in_lengths,
                                in_strides,
                                wei_lengths,
                                wei_strides,
                                ck::Array<ck::Array<ck::index_t, ${NumDim} + 3>, 0>{},
                                ck::Array<ck::Array<ck::index_t, ${NumDim} + 3>, 0>{},
                                out_lengths,
                                out_strides,
                                conv_filter_strides,
                                conv_filter_dilations,
                                input_left_pads,
                                input_right_pads,
                                ${AElementwiseOperation}{},
                                ${BElementwiseOperation}{},
                                ${CDEElementwiseOperation}{1.0f, 1.0f});

if(!DeviceConv::IsSupportedArgument(arg))
{
    printf("Argument is not supported.\n");
    return;
};

constexpr ck::LoopScheduler LoopSched = ck::make_default_loop_scheduler();

// GridwiseGemm
using GridwiseGemm = DeviceConv::GridwiseGemm;

static constexpr auto I0 = ck::Number<0>{};

ck::tensor_operation::device::device_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
    GridwiseGemm,
    const ${ADataType}*,
    const ${BDataType}*,
    typename GridwiseGemm::DsGridPointer,
    ${EDataType},
    ${AElementwiseOperation},
    ${BElementwiseOperation},
    ${CDEElementwiseOperation},
    DeviceConv::AGridDesc_AK0_M_AK1,
    DeviceConv::BGridDesc_BK0_N_BK1,
    DeviceConv::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
    DeviceConv::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
    DeviceConv::Block2ETileMap,
    ck::tensor_operation::device::ComputePtrOffsetOfStridedBatch<NumATensor, NumBTensor, 0>,
    ck::integral_constant<bool, true>{},
    false,
    false>
    (
    arg.p_as_grid_.At(I0),
    arg.p_bs_grid_.At(I0),
    arg.p_ds_grid_,
    arg.p_e_grid_,
    arg.a_element_op_,
    arg.b_element_op_,
    arg.cde_element_op_,
    arg.a_g_n_c_wis_lengths_[0], // Group count
    arg.a_grid_desc_ak0_m_ak1_,
    arg.b_grid_desc_bk0_n_bk1_,
    arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
    arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
    arg.block_2_etile_map_,
    arg.compute_ptr_offset_of_batch_
    );

}
)";
|
||||
|
||||
// use hardcoded instances from vector of operations to substitute values into instance template
|
||||
// Substitute this instance's hard-coded tuning values into CopyDevice_ConvTemplate
// and return the resulting Solution (full kernel source + parameter map).
Solution Operation_Conv_Fwd_Xdl_Cshuffle::ToSolution() const
{
    // Template-parameter name -> substitution value.
    std::unordered_map<std::string, std::string> values = {
        // Kernel identifier derived from the tile shape (used in run_${name}).
        {"name",
         std::to_string(this->tile_desc.block_size) + "_" +
             std::to_string(this->tile_desc.m_per_block) + "_" +
             std::to_string(this->tile_desc.n_per_block) + "_" +
             std::to_string(this->tile_desc.k_per_block) + "_" +
             std::to_string(this->tile_desc.ak1) + "_" + std::to_string(this->tile_desc.bk1) + "_" +
             std::to_string(this->tile_desc.m_per_XDL) + "_" +
             std::to_string(this->tile_desc.n_per_XDL) + "_" +
             std::to_string(this->tile_desc.m_Xdl_per_wave) + "_" +
             std::to_string(this->tile_desc.n_Xdl_per_wave)},
        {"NumDim", std::to_string(this->NumDim)},
        // Tensor layouts and element types.
        {"LayoutA", ToString(this->A.layout)},
        {"LayoutB", ToString(this->B.layout)},
        {"LayoutDs",
         MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.layout); }))},
        {"LayoutE", ToString(this->E.layout)},
        {"ADataType", ToString(this->A.element)},
        {"BDataType", ToString(this->B.element)},
        {"AccDataType", ToString(this->acc)},
        // NOTE(review): ComputeDataType is not referenced by CopyDevice_ConvTemplate;
        // it mirrors the A element type — confirm whether another consumer needs it.
        {"ComputeDataType", ToString(this->A.element)},
        {"CShuffleDataType", ToString(this->cs_type)},
        {"DsDataType",
         MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.element); }))},
        {"EDataType", ToString(this->E.element)},
        // Elementwise operations and user-supplied code sections.
        {"AElementwiseOperation", this->a_elem_op},
        {"BElementwiseOperation", this->b_elem_op},
        {"CDEElementwiseOperation", this->cde_elem_op},
        {"Prologue", this->prologue},
        {"Epilogue", this->epilogue},
        {"ConvSpecialization", this->conv_specialization},
        {"GemmSpecialization", this->gemm_specialization},
        // Tile / blocking parameters.
        {"NumGemmkPrefetchStage", std::to_string(this->tile_desc.num_gemmk_prefetch_stage)},
        {"BlockSize", std::to_string(this->tile_desc.block_size)},
        {"MPerBlock", std::to_string(this->tile_desc.m_per_block)},
        {"NPerBlock", std::to_string(this->tile_desc.n_per_block)},
        {"KPerBlock", std::to_string(this->tile_desc.k_per_block)},
        {"AK1", std::to_string(this->tile_desc.ak1)},
        {"BK1", std::to_string(this->tile_desc.bk1)},
        {"MPerXDL", std::to_string(this->tile_desc.m_per_XDL)},
        {"NPerXDL", std::to_string(this->tile_desc.n_per_XDL)},
        {"MXdlPerWave", std::to_string(this->tile_desc.m_Xdl_per_wave)},
        {"NXdlPerWave", std::to_string(this->tile_desc.n_Xdl_per_wave)},
        // A-tensor block transfer parameters.
        {"ABlockTransferThreadClusterLengths_AK0_M_AK1",
         this->a_block_transfer.thread_cluster_length},
        {"ABlockTransferThreadClusterArrangeOrder",
         this->a_block_transfer.thread_cluster_arrange_order},
        {"ABlockTransferSrcAccessOrder", this->a_block_transfer.src_access_order},
        {"ABlockTransferSrcVectorDim", std::to_string(this->a_block_transfer.src_vec_dim)},
        {"ABlockTransferSrcScalarPerVector",
         std::to_string(this->a_block_transfer.src_scalar_per_vector)},
        {"ABlockTransferDstScalarPerVector_AK1",
         std::to_string(this->a_block_transfer.dst_scalar_per_vector_k1)},
        {"ABlockLdsExtraM", std::to_string(this->a_block_transfer.lds_add_extra_dim)},
        // B-tensor block transfer parameters.
        {"BBlockTransferThreadClusterLengths_BK0_N_BK1",
         this->b_block_transfer.thread_cluster_length},
        {"BBlockTransferThreadClusterArrangeOrder",
         this->b_block_transfer.thread_cluster_arrange_order},
        {"BBlockTransferSrcAccessOrder", this->b_block_transfer.src_access_order},
        {"BBlockTransferSrcVectorDim", std::to_string(this->b_block_transfer.src_vec_dim)},
        {"BBlockTransferSrcScalarPerVector",
         std::to_string(this->b_block_transfer.src_scalar_per_vector)},
        {"BBlockTransferDstScalarPerVector_BK1",
         std::to_string(this->b_block_transfer.dst_scalar_per_vector_k1)},
        {"BBlockLdsExtraN", std::to_string(this->b_block_transfer.lds_add_extra_dim)},
        // C-shuffle / output block transfer parameters.
        {"CShuffleMXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.m_Xdl_per_wave_per_shuffle)},
        {"CShuffleNXdlPerWavePerShuffle",
         std::to_string(this->cshuffle.n_Xdl_per_wave_per_shuffle)},
        {"CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock",
         this->c_block_transfer.cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl},
        {"CDEBlockTransferScalarPerVector_NPerBlock",
         std::to_string(this->c_block_transfer.scalar_per_vector_n_wave_n_per_Xdl)},
    };

    return Solution{InterpolateString(CopyDevice_ConvTemplate, values), std::move(values)};
}
|
||||
|
||||
} // namespace conv
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
23
codegen/src/headers.cpp
Normal file
23
codegen/src/headers.cpp
Normal file
@@ -0,0 +1,23 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck_headers.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
// Stand-in for the generated ck/config.h contents (currently empty); injected
// into the embedded header map by GetHeaders().
const std::string config_header = "";
#pragma clang diagnostic pop
|
||||
|
||||
std::unordered_map<std::string_view, std::string_view> GetHeaders()
|
||||
{
|
||||
auto headers = ck_headers();
|
||||
headers.insert(std::make_pair("ck/config.h", config_header));
|
||||
return headers;
|
||||
}
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
94
codegen/src/types.cpp
Normal file
94
codegen/src/types.cpp
Normal file
@@ -0,0 +1,94 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/types.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
// Bundle a fully-substituted template string with the parameter values that
// produced it, so individual template parameters can be queried later.
Solution::Solution(std::string str, std::unordered_map<std::string, std::string> values)
    : template_str(std::move(str)), template_values(std::move(values))
{
}
|
||||
|
||||
// Return the full, substituted instantiation string for this solution.
std::string Solution::ToTemplateString() const { return this->template_str; }

// Look up a single template parameter by name.
// Throws std::out_of_range if the name was never recorded.
std::string Solution::GetTemplateParameter(const std::string& name) const
{
    return this->template_values.at(name);
}
|
||||
|
||||
std::string ToString(DataType dt)
|
||||
{
|
||||
switch(dt)
|
||||
{
|
||||
case DataType::Float: return "float";
|
||||
case DataType::Half: return "ck::half_t";
|
||||
case DataType::Int8: return "int8_t";
|
||||
case DataType::Int32: return "int32_t";
|
||||
}
|
||||
throw std::runtime_error("Incorrect data type");
|
||||
}
|
||||
|
||||
// Map a transpose flag onto the GEMM layout convention: transposed is column-major.
Layout ToLayout(bool Trans)
{
    if(Trans)
    {
        return Layout::Column;
    }
    return Layout::Row;
}
|
||||
|
||||
std::string ToString(Layout dl)
|
||||
{
|
||||
switch(dl)
|
||||
{
|
||||
case Layout::Row: return "ck::tensor_layout::gemm::RowMajor";
|
||||
case Layout::Column: return "ck::tensor_layout::gemm::ColumnMajor";
|
||||
case Layout::GKCYX: return "ck::tensor_layout::convolution::GKCYX";
|
||||
case Layout::GKYXC: return "ck::tensor_layout::convolution::GKYXC";
|
||||
case Layout::GNHWK: return "ck::tensor_layout::convolution::GNHWK";
|
||||
case Layout::GNHWC: return "ck::tensor_layout::convolution::GNHWC";
|
||||
case Layout::NHWGC: return "ck::tensor_layout::convolution::NHWGC";
|
||||
case Layout::NHWGK: return "ck::tensor_layout::convolution::NHWGK";
|
||||
}
|
||||
throw std::runtime_error("Incorrect layout");
|
||||
}
|
||||
|
||||
std::string ToString(GemmType gt)
|
||||
{
|
||||
switch(gt)
|
||||
{
|
||||
case GemmType::Default: return "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
}
|
||||
throw std::runtime_error("Incorrect gemm type");
|
||||
}
|
||||
|
||||
std::string ToString(LoopScheduler ls)
|
||||
{
|
||||
switch(ls)
|
||||
{
|
||||
case LoopScheduler::Default: return "ck::LoopScheduler::Default";
|
||||
case LoopScheduler::Interwave: return "ck::LoopScheduler::Interwave";
|
||||
}
|
||||
throw std::runtime_error("Incorrect LoopScheduler type");
|
||||
}
|
||||
|
||||
std::string ToString(PipelineVersion pv)
|
||||
{
|
||||
switch(pv)
|
||||
{
|
||||
case PipelineVersion::v1: return "ck::PipelineVersion::v1";
|
||||
case PipelineVersion::v2: return "ck::PipelineVersion::v2";
|
||||
}
|
||||
throw std::runtime_error("Incorrect PipelineVersion type");
|
||||
}
|
||||
|
||||
// Render a vector of integers as a "ck::Sequence<...>" template literal,
// with elements separated by ", ".
std::string SequenceStr(const std::vector<int>& v)
{
    std::string elems;
    for(std::size_t i = 0; i < v.size(); ++i)
    {
        if(i != 0)
            elems += ", ";
        elems += std::to_string(v[i]);
    }
    return "ck::Sequence<" + elems + ">";
}
|
||||
|
||||
// Render a vector of strings as a "ck::Tuple<...>" template literal,
// with elements separated by ", ".
std::string MakeTuple(const std::vector<std::string>& v)
{
    std::string elems;
    for(std::size_t i = 0; i < v.size(); ++i)
    {
        if(i != 0)
            elems += ", ";
        elems += v[i];
    }
    return "ck::Tuple<" + elems + ">";
}
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
21
codegen/src/utils.cpp
Normal file
21
codegen/src/utils.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/utils.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
// Ceiling division for unsigned sizes: the smallest q such that q * y >= x.
// Fix: the previous form (x + y - 1) / y silently wrapped around when x was
// close to SIZE_MAX; the quotient/remainder form cannot overflow.
// Precondition: y != 0 (division by zero is undefined, as before).
std::size_t integer_divide_ceil(std::size_t x, std::size_t y)
{
    const std::size_t quotient = x / y;
    return (x % y == 0) ? quotient : quotient + std::size_t{1};
}
|
||||
|
||||
// Set of GPU architectures with XDL (matrix-core) support that this code
// generator can target. Built once on first use.
const std::unordered_set<std::string>& get_xdlop_archs()
{
    static const std::unordered_set<std::string> supported_archs = {
        "gfx908",
        "gfx90a",
        "gfx942",
    };
    return supported_archs;
}
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
25
codegen/test/CMakeLists.txt
Normal file
25
codegen/test/CMakeLists.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
# Make the ROCm install discoverable for find_package(hip) below.
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc)
# Every .cpp in this directory becomes a codegen_test_* executable.
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)

# TODO: These tests need to be refactored to remove dependency on main ck
# headers and device compilation.
set(TESTS_REQUIRE_DEVICE_COMPILE
    grouped_conv_fwd_multiple_d_v1
    grouped_conv_fwd_multiple_d_v2
    grouped_conv_fwd_multiple_d_v3
    grouped_conv_fwd_multiple_d_v4
)
find_package(hip)

foreach(TEST_SRC ${TEST_SRCS})
    get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
    rocm_add_test_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
    target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
    target_include_directories(codegen_test_${BASE_NAME} PUBLIC include)
    # Tests listed above additionally need the HIP device compiler and the
    # full CK header trees.
    if(BASE_NAME IN_LIST TESTS_REQUIRE_DEVICE_COMPILE)
        target_link_libraries(codegen_test_${BASE_NAME} hip::device)
        target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
        target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
    endif()
endforeach()
|
||||
82
codegen/test/batched_gemm_softmax_gemm.cpp
Normal file
82
codegen/test/batched_gemm_softmax_gemm.cpp
Normal file
@@ -0,0 +1,82 @@
|
||||
#include "ck/host/device_batched_gemm_softmax_gemm/problem.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "common.hpp"
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <test.hpp>
|
||||
#include <cmath>
|
||||
|
||||
using half = _Float16;
|
||||
|
||||
// Kernel-source template used for the RTC compile-and-run check.
// The ${...} placeholders (instance include header, instance template
// string, and the M/N/K/O problem sizes) are substituted via
// ck::host::InterpolateString before runtime compilation.
const std::string gemm_compile_check = R"__ck__(
#include <${include}>

extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, const ck::half_t* b1, ck::half_t* c) {
    using G = ${template};
    constexpr auto desc = G::make_descriptor(ck::make_naive_tensor_descriptor(ck::make_tuple(${m}, ${k}), ck::make_tuple(${m}, 1)),
                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(${n}, 1)),
                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${o}), ck::make_tuple(1, ${n})),
                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${m}, ${o}), ck::make_tuple(${m}, 1)));

    static_assert(desc.IsValid(), "Invalid ck gemm.");

    if constexpr(desc.IsValid())
    {
        ${template}::Run(desc,
                         1.0,
                         a,
                         b,
                         b1,
                         c);
    }
}

)__ck__";
|
||||
|
||||
// Compile-and-run check for every batched-gemm+softmax+gemm solution that the
// host API generates for a 1024x1024x1024x1024 problem on gfx90a.
TEST_CASE(test_problem_kernel)
{
    // Problem specification: (MxK) * (KxN) -> softmax -> (MxN) * (NxO).
    ck::host::device_batched_gemm_softmax_gemm::Problem prob;
    prob.M      = 1024;
    prob.N      = 1024;
    prob.K      = 1024;
    prob.O      = 1024;
    prob.TransB = true;
    check_all<half> check;
    // Device buffers, deterministically seeded (seed is the last argument).
    auto a  = to_gpu(generate_buffer<half>(1024 * 1024, 0));
    auto b  = to_gpu(generate_buffer<half>(1024 * 1024, 1));
    auto b1 = to_gpu(generate_buffer<half>(1024 * 1024, 2));
    auto c  = to_gpu(generate_buffer<half>(1024 * 1024, 3));

    auto solutions = prob.GetSolutions("gfx90a");
    std::cout << "Num solutions: " << solutions.size() << std::endl;
    // Use std::size_t for the loop index: solutions.size() is unsigned, and
    // `auto i = 0` deduces int, causing a signed/unsigned comparison.
    for(std::size_t i = 0; i < solutions.size(); ++i)
    {
        std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
        auto&& solution = solutions[i];
        // Substitute the instance template and problem sizes into the kernel
        // source, then JIT-compile it.
        auto src  = ck::host::InterpolateString(gemm_compile_check,
                                                {{"include", prob.GetIncludeHeader()},
                                                 {"template", solution.ToTemplateString()},
                                                 {"m", std::to_string(prob.M)},
                                                 {"n", std::to_string(prob.N)},
                                                 {"k", std::to_string(prob.K)},
                                                 {"o", std::to_string(prob.O)}});
        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        options.kernel_name = "f";
        auto k = rtc::compile_kernel(srcs, options);
        // One workgroup per (Gemm01MPerBlock x Gemm1NPerBlock) output tile.
        auto block_size  = solution.GetTemplateParameter<std::size_t>("BlockSize");
        auto m_per_block = solution.GetTemplateParameter<std::size_t>("Gemm01MPerBlock");
        auto n_per_block = solution.GetTemplateParameter<std::size_t>("Gemm1NPerBlock");
        auto grid_size   = ck::host::integer_divide_ceil(prob.M, m_per_block) *
                         ck::host::integer_divide_ceil(prob.N, n_per_block);
        k.launch(nullptr, grid_size * block_size, block_size)(
            a.data(), b.data(), b1.data(), c.data());

        // NOTE: Solutions where MaskOutUpperTriangle is True don't produce consistent results
        CHECK(report(solution, check(rtc::from_gpu(c))));
    }
}
|
||||
|
||||
// Test-driver entry point: dispatches to all registered TEST_CASEs.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
84
codegen/test/gemm_multiple_d.cpp
Normal file
84
codegen/test/gemm_multiple_d.cpp
Normal file
@@ -0,0 +1,84 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_gemm_multiple_d/problem.hpp"
|
||||
#include "ck/host/device_gemm_multiple_d/operation.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "common.hpp"
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <test.hpp>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <random>
|
||||
|
||||
using half = _Float16;
|
||||
|
||||
// Kernel-source template used for the RTC compile-and-run check.
// The ${...} placeholders (instance include header, instance template
// string, and the M/N/K problem sizes) are substituted via
// ck::host::InterpolateString before runtime compilation.
const std::string gemm_compile_check = R"__ck__(
#include <${include}>

extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, ck::half_t* c) {
    using G = ${template};
    constexpr auto desc = G::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${k})),
                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(1, ${n})),
                                             ck::make_tuple(),
                                             ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${n})));

    static_assert(desc.IsValid(), "Invalid ck gemm.");

    if constexpr(desc.IsValid())
    {
        ${template}::Run(desc,
                         a,
                         b,
                         ck::make_tuple(),
                         c);
    }
}

)__ck__";
|
||||
|
||||
// Compile-and-run check for every gemm-multiple-d solution the host API
// generates for a 1024x1024x1024 half-precision problem on gfx90a.
TEST_CASE(test_problem_kernel)
{
    ck::host::device_gemm_multiple_d::Problem prob;
    prob.M = 1024;
    prob.N = 1024;
    prob.K = 1024;
    check_all<half> check;
    // Device buffers, deterministically seeded (seed is the last argument).
    auto a = to_gpu(generate_buffer<half>(1024 * 1024, 0));
    auto b = to_gpu(generate_buffer<half>(1024 * 1024, 1));
    auto c = to_gpu(generate_buffer<half>(1024 * 1024, 2));

    auto solutions = prob.GetSolutions("gfx90a");
    std::cout << "Num solutions: " << solutions.size() << std::endl;
    // Use std::size_t for the loop index: solutions.size() is unsigned, and
    // `auto i = 0` deduces int, causing a signed/unsigned comparison.
    for(std::size_t i = 0; i < solutions.size(); ++i)
    {
        std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
        auto&& solution = solutions[i];
        // Substitute the instance template and problem sizes into the kernel
        // source, then JIT-compile it.
        auto src  = ck::host::InterpolateString(gemm_compile_check,
                                                {{"include", prob.GetIncludeHeader()},
                                                 {"template", solution.ToTemplateString()},
                                                 {"m", std::to_string(prob.M)},
                                                 {"n", std::to_string(prob.N)},
                                                 {"k", std::to_string(prob.K)}});
        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        options.kernel_name = "f";
        auto k = rtc::compile_kernel(srcs, options);
        // One workgroup per (MPerBlock x NPerBlock) output tile.
        auto block_size  = solution.GetTemplateParameter<std::size_t>("BlockSize");
        auto m_per_block = solution.GetTemplateParameter<std::size_t>("MPerBlock");
        auto n_per_block = solution.GetTemplateParameter<std::size_t>("NPerBlock");
        auto grid_size   = ck::host::integer_divide_ceil(prob.M, m_per_block) *
                         ck::host::integer_divide_ceil(prob.N, n_per_block);
        k.launch(nullptr, grid_size * block_size, block_size)(a.data(), b.data(), c.data());

        CHECK(report(solution, check(rtc::from_gpu(c))));
    }
}
|
||||
|
||||
// Test-driver entry point: dispatches to all registered TEST_CASEs.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
210
codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
Normal file
210
codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
Normal file
@@ -0,0 +1,210 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include "common.hpp"
|
||||
#include <fstream>
|
||||
|
||||
// Need this for verification
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
// Kernel-source template: only declares the interpolated instance; the
// actual kernel entry point ("run_<name>") is emitted by the instance's own
// generated code. Placeholders are filled by ck::host::InterpolateString.
const std::string conv_compile_check = R"__ck__(
#include <${include}>

${template};

)__ck__";
|
||||
|
||||
// Compile-and-run check for the grouped conv-fwd instances with a
// user-supplied epilogue, stride {2,2} and pads {1,1}.
TEST_CASE(test_problem_kernel)
{
    // set up problem specification
    // NOTE(review): with Hi=Wi=28, a 3x3 filter, stride {2,2} and pads {1,1},
    // the conventional output extent would be 14, not 28 — confirm Ho/Wo=28
    // is intentional for this test.
    ck::host::conv::Problem_Conv_Fwd prob;
    prob.NumDim = 2;
    prob.G      = 32;
    prob.N      = 256;
    prob.C      = 32;
    prob.K      = 64;
    prob.Y      = 3;
    prob.X      = 3;
    prob.Hi     = 28;
    prob.Wi     = 28;
    prob.Ho     = 28;
    prob.Wo     = 28;
    check_all<ck::half_t> check;

    // user provided fusion operations
    std::string epilogue = R"(
struct Epilogue
{
    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};

    template <typename E, typename D>
    __host__ __device__ constexpr void operator()(E& e, const D& d) const;

    template <>
    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
                                                                          const ck::half_t& d) const
    {
        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
    }

    float alpha_;
    float beta_;
};
)";
    std::string prologue = "";

    // length+stride arrays: logical G,N,C/K,H,W lengths with channels-last
    // style physical strides
    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
                                         static_cast<int>(prob.N),
                                         static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi),
                                         static_cast<int>(prob.Wi)};
    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.N),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho),
                                          static_cast<int>(prob.Wo)};
    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.C),
                                          static_cast<int>(prob.Y),
                                          static_cast<int>(prob.X)};

    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
                                         1,
                                         static_cast<int>(prob.Wi * prob.G * prob.C),
                                         static_cast<int>(prob.G * prob.C)};
    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
                                          1,
                                          static_cast<int>(prob.Wo * prob.G * prob.K),
                                          static_cast<int>(prob.G * prob.K)};
    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
                                          static_cast<int>(prob.Y * prob.X * prob.C),
                                          1,
                                          static_cast<int>(prob.X * prob.C),
                                          static_cast<int>(prob.C)};

    ck::Array<ck::index_t, 2> conv_filter_strides   = {2, 2};
    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
    ck::Array<ck::index_t, 2> input_left_pads       = {1, 1};
    ck::Array<ck::index_t, 2> input_right_pads      = {1, 1};

    // move the data onto the device (seed is the last argument)
    auto in_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
    auto wei_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
    auto out_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));

    // CK Verification: Reference Kernel (disabled)
    /**bool pass = true;
    Tensor<ck::half_t> in_host(in_lengths, in_strides);
    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> out_host(out_lengths, out_strides);

    std::vector<ck::index_t> conv_filter_strides_   = {2, 2};
    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
    std::vector<ck::index_t> input_left_pads_       = {1, 1};
    std::vector<ck::index_t> input_right_pads_      = {1, 1};

    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
        2,
        ck::half_t,
        ck::half_t,
        ck::half_t,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough,
        Epilogue>();

    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(in_host,
                                              wei_host,
                                              out_host,
                                              conv_filter_strides_,
                                              conv_filter_dilations_,
                                              input_left_pads_,
                                              input_right_pads_,
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/

    // Bind with auto&& instead of by value: `for(auto solution : ...)` copied
    // each Solution object once per iteration for no benefit.
    for(auto&& solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});

        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        auto name           = solution.GetTemplateParameter<std::string>("name");
        options.kernel_name = "run_" + name;
        auto k              = rtc::compile_kernel(srcs, options);

        // Grid size calculation
        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
        auto tmp        = get_launch_params(solution, out_lengths, out_strides);
        auto grid_size  = tmp * in_lengths[1];

        // launch the kernel with arguments needed for the argument pointer
        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
                                                              wei_dev.data(),
                                                              out_dev.data(),
                                                              in_lengths,
                                                              in_strides,
                                                              wei_lengths,
                                                              wei_strides,
                                                              out_lengths,
                                                              out_strides,
                                                              conv_filter_strides,
                                                              conv_filter_dilations,
                                                              input_left_pads,
                                                              input_right_pads);

        // auto res = rtc::from_gpu(out_dev);
        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
        // assert(pass);

        // Simple check: this checks that the output from each instance matches
        // the output from the first instance
        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
    }
}
|
||||
|
||||
// Test-driver entry point: dispatches to all registered TEST_CASEs.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
210
codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
Normal file
210
codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
Normal file
@@ -0,0 +1,210 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
// need this for validation
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
// Kernel-source template: only declares the interpolated instance; the
// actual kernel entry point ("run_<name>") is emitted by the instance's own
// generated code. Placeholders are filled by ck::host::InterpolateString.
const std::string conv_compile_check = R"__ck__(
#include <${include}>

${template};

)__ck__";
|
||||
|
||||
// Compile-and-run check for the grouped conv-fwd instances with a
// user-supplied epilogue, stride {1,1} and no padding.
TEST_CASE(test_problem_kernel)
{
    // set up problem specification
    // NOTE(review): with Hi=Wi=28, a 3x3 filter, stride {1,1} and no padding,
    // the conventional output extent would be 26, not 28 — confirm Ho/Wo=28
    // is intentional for this test.
    ck::host::conv::Problem_Conv_Fwd prob;
    prob.NumDim = 2;
    prob.G      = 32;
    prob.N      = 256;
    prob.C      = 32;
    prob.K      = 64;
    prob.Y      = 3;
    prob.X      = 3;
    prob.Hi     = 28;
    prob.Wi     = 28;
    prob.Ho     = 28;
    prob.Wo     = 28;
    check_all<ck::half_t> check;

    // user provided fusion operations
    std::string epilogue = R"(
struct Epilogue
{
    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};

    template <typename E, typename D>
    __host__ __device__ constexpr void operator()(E& e, const D& d) const;

    template <>
    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
                                                                          const ck::half_t& d) const
    {
        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
    }

    float alpha_;
    float beta_;
};
)";
    std::string prologue = "";

    // length+stride arrays: logical G,N,C/K,H,W lengths with channels-last
    // style physical strides
    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
                                         static_cast<int>(prob.N),
                                         static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi),
                                         static_cast<int>(prob.Wi)};
    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.N),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho),
                                          static_cast<int>(prob.Wo)};
    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.C),
                                          static_cast<int>(prob.Y),
                                          static_cast<int>(prob.X)};

    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
                                         1,
                                         static_cast<int>(prob.Wi * prob.G * prob.C),
                                         static_cast<int>(prob.G * prob.C)};
    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
                                          1,
                                          static_cast<int>(prob.Wo * prob.G * prob.K),
                                          static_cast<int>(prob.G * prob.K)};
    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
                                          static_cast<int>(prob.Y * prob.X * prob.C),
                                          1,
                                          static_cast<int>(prob.X * prob.C),
                                          static_cast<int>(prob.C)};

    ck::Array<ck::index_t, 2> conv_filter_strides   = {1, 1};
    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
    ck::Array<ck::index_t, 2> input_left_pads       = {0, 0};
    ck::Array<ck::index_t, 2> input_right_pads      = {0, 0};

    // move the data onto the device (seed is the last argument)
    auto in_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
    auto wei_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
    auto out_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));

    // CK Verification: Reference Kernel (disabled)
    /**bool pass = true;
    Tensor<ck::half_t> in_host(in_lengths, in_strides);
    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> out_host(out_lengths, out_strides);

    std::vector<ck::index_t> conv_filter_strides_   = {1, 1};
    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
    std::vector<ck::index_t> input_left_pads_       = {0, 0};
    std::vector<ck::index_t> input_right_pads_      = {0, 0};

    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
        2,
        ck::half_t,
        ck::half_t,
        ck::half_t,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough,
        Epilogue>();

    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(in_host,
                                              wei_host,
                                              out_host,
                                              conv_filter_strides_,
                                              conv_filter_dilations_,
                                              input_left_pads_,
                                              input_right_pads_,
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/

    // Bind with auto&& instead of by value: `for(auto solution : ...)` copied
    // each Solution object once per iteration for no benefit.
    for(auto&& solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});

        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        auto name           = solution.GetTemplateParameter<std::string>("name");
        options.kernel_name = "run_" + name;
        auto k              = rtc::compile_kernel(srcs, options);

        // Grid size calculation
        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
        auto tmp        = get_launch_params(solution, out_lengths, out_strides);
        auto grid_size  = tmp * in_lengths[1];

        // launch the kernel with arguments needed for the argument pointer
        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
                                                              wei_dev.data(),
                                                              out_dev.data(),
                                                              in_lengths,
                                                              in_strides,
                                                              wei_lengths,
                                                              wei_strides,
                                                              out_lengths,
                                                              out_strides,
                                                              conv_filter_strides,
                                                              conv_filter_dilations,
                                                              input_left_pads,
                                                              input_right_pads);

        // auto res = rtc::from_gpu(out_dev);
        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
        // assert(pass);

        // Simple check: this checks that the output from each instance matches
        // the output from the first instance
        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
    }
}
|
||||
|
||||
// Test-driver entry point: dispatches to all registered TEST_CASEs.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
210
codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
Normal file
210
codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
Normal file
@@ -0,0 +1,210 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "common.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
// need this for verification
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
// Kernel-source template: only declares the interpolated instance; the
// actual kernel entry point ("run_<name>") is emitted by the instance's own
// generated code. Placeholders are filled by ck::host::InterpolateString.
const std::string conv_compile_check = R"__ck__(
#include <${include}>

${template};

)__ck__";
|
||||
|
||||
// Compile-and-run check for the grouped conv-fwd instances with a
// user-supplied epilogue, stride {2,2} and no padding.
TEST_CASE(test_problem_kernel)
{
    // set up problem specification
    // NOTE(review): with Hi=Wi=28, a 3x3 filter, stride {2,2} and no padding,
    // the conventional output extent would be 13, not 28 — confirm Ho/Wo=28
    // is intentional for this test.
    ck::host::conv::Problem_Conv_Fwd prob;
    prob.NumDim = 2;
    prob.G      = 32;
    prob.N      = 256;
    prob.C      = 32;
    prob.K      = 64;
    prob.Y      = 3;
    prob.X      = 3;
    prob.Hi     = 28;
    prob.Wi     = 28;
    prob.Ho     = 28;
    prob.Wo     = 28;
    check_all<ck::half_t> check;

    // user provided fusion operations
    std::string epilogue = R"(
struct Epilogue
{
    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};

    template <typename E, typename D>
    __host__ __device__ constexpr void operator()(E& e, const D& d) const;

    template <>
    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
                                                                          const ck::half_t& d) const
    {
        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
    }

    float alpha_;
    float beta_;
};
)";
    std::string prologue = "";

    // length+stride arrays: logical G,N,C/K,H,W lengths with channels-last
    // style physical strides
    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
                                         static_cast<int>(prob.N),
                                         static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi),
                                         static_cast<int>(prob.Wi)};
    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.N),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho),
                                          static_cast<int>(prob.Wo)};
    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.C),
                                          static_cast<int>(prob.Y),
                                          static_cast<int>(prob.X)};

    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
                                         1,
                                         static_cast<int>(prob.Wi * prob.G * prob.C),
                                         static_cast<int>(prob.G * prob.C)};
    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
                                          1,
                                          static_cast<int>(prob.Wo * prob.G * prob.K),
                                          static_cast<int>(prob.G * prob.K)};
    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
                                          static_cast<int>(prob.Y * prob.X * prob.C),
                                          1,
                                          static_cast<int>(prob.X * prob.C),
                                          static_cast<int>(prob.C)};

    ck::Array<ck::index_t, 2> conv_filter_strides   = {2, 2};
    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
    ck::Array<ck::index_t, 2> input_left_pads       = {0, 0};
    ck::Array<ck::index_t, 2> input_right_pads      = {0, 0};

    // move the data onto the device (seed is the last argument)
    auto in_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
    auto wei_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
    auto out_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));

    // CK Verification: Reference Kernel (disabled)
    /**bool pass = true;
    Tensor<ck::half_t> in_host(in_lengths, in_strides);
    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> out_host(out_lengths, out_strides);

    std::vector<ck::index_t> conv_filter_strides_   = {2, 2};
    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
    std::vector<ck::index_t> input_left_pads_       = {0, 0};
    std::vector<ck::index_t> input_right_pads_      = {0, 0};

    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
        2,
        ck::half_t,
        ck::half_t,
        ck::half_t,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough,
        Epilogue>();

    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(in_host,
                                              wei_host,
                                              out_host,
                                              conv_filter_strides_,
                                              conv_filter_dilations_,
                                              input_left_pads_,
                                              input_right_pads_,
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/

    // Bind with auto&& instead of by value: `for(auto solution : ...)` copied
    // each Solution object once per iteration for no benefit.
    for(auto&& solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});

        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        auto name           = solution.GetTemplateParameter<std::string>("name");
        options.kernel_name = "run_" + name;
        auto k              = rtc::compile_kernel(srcs, options);

        // Grid size calculation
        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
        auto tmp        = get_launch_params(solution, out_lengths, out_strides);
        auto grid_size  = tmp * in_lengths[1];

        // launch the kernel with arguments needed for the argument pointer
        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
                                                              wei_dev.data(),
                                                              out_dev.data(),
                                                              in_lengths,
                                                              in_strides,
                                                              wei_lengths,
                                                              wei_strides,
                                                              out_lengths,
                                                              out_strides,
                                                              conv_filter_strides,
                                                              conv_filter_dilations,
                                                              input_left_pads,
                                                              input_right_pads);

        // auto res = rtc::from_gpu(out_dev);
        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
        // assert(pass);

        // Simple check: this checks that the output from each instance matches
        // the output from the first instance
        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
    }
}
|
||||
|
||||
// Test-driver entry point: dispatches to all registered TEST_CASEs.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
|
||||
210
codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
Normal file
210
codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
Normal file
@@ -0,0 +1,210 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
|
||||
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
|
||||
#include "ck/host/headers.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
#include "ck/host/utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/helper.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "common.hpp"
|
||||
#include <test.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <fstream>
|
||||
|
||||
// need this for verification
|
||||
/**struct Epilogue
|
||||
{
|
||||
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
|
||||
const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
|
||||
}
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};**/
|
||||
// Minimal translation unit handed to hipRTC for each generated instance:
// ${include} expands to the op's include header and ${template} to the fully
// instantiated device-op declaration produced by the codegen.
const std::string conv_compile_check = R"__ck__(
#include <${include}>

${template};

)__ck__";
|
||||
|
||||
// Code-generates, hipRTC-compiles, and launches every available grouped
// conv-fwd instance for one fixed 2D problem, checking that all instances
// produce matching output (first instance acts as the reference).
TEST_CASE(test_problem_kernel)
{
    // set up problem specification
    ck::host::conv::Problem_Conv_Fwd prob;
    prob.NumDim = 2;
    prob.G      = 32;
    prob.N      = 256;
    prob.C      = 32;
    prob.K      = 64;
    prob.Y      = 3;
    prob.X      = 3;
    prob.Hi     = 28;
    prob.Wi     = 28;
    prob.Ho     = 28;
    prob.Wo     = 28;
    check_all<ck::half_t> check;

    // user provided fusion operations
    // NOTE(review): this epilogue source is injected verbatim into the RTC
    // translation unit; it applies e = alpha*e + beta*d per element.
    std::string epilogue = R"(
struct Epilogue
{
    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};

    template <typename E, typename D>
    __host__ __device__ constexpr void operator()(E& e, const D& d) const;

    template <>
    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
                                                                          const ck::half_t& d) const
    {
        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
    }

    float alpha_;
    float beta_;
};
)";
    std::string prologue = "";

    // length+stride arrays
    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
                                         static_cast<int>(prob.N),
                                         static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi),
                                         static_cast<int>(prob.Wi)};
    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.N),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho),
                                          static_cast<int>(prob.Wo)};
    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
                                          static_cast<int>(prob.K),
                                          static_cast<int>(prob.C),
                                          static_cast<int>(prob.Y),
                                          static_cast<int>(prob.X)};

    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
                                         1,
                                         static_cast<int>(prob.Wi * prob.G * prob.C),
                                         static_cast<int>(prob.G * prob.C)};
    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
                                          1,
                                          static_cast<int>(prob.Wo * prob.G * prob.K),
                                          static_cast<int>(prob.G * prob.K)};
    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
                                          static_cast<int>(prob.Y * prob.X * prob.C),
                                          1,
                                          static_cast<int>(prob.X * prob.C),
                                          static_cast<int>(prob.C)};

    ck::Array<ck::index_t, 2> conv_filter_strides   = {1, 1};
    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
    ck::Array<ck::index_t, 2> input_left_pads       = {1, 1};
    ck::Array<ck::index_t, 2> input_right_pads      = {1, 1};

    // move the data onto the device (distinct seeds so in/wei/out differ)
    auto in_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
    auto wei_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
    auto out_dev =
        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));

    // CK Verficiation: Reference Kernel
    /**bool pass = true;
    Tensor<ck::half_t> in_host(in_lengths, in_strides);
    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
    Tensor<ck::half_t> out_host(out_lengths, out_strides);

    std::vector<ck::index_t> conv_filter_strides_   = {1, 1};
    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
    std::vector<ck::index_t> input_left_pads_       = {1, 1};
    std::vector<ck::index_t> input_right_pads_      = {1, 1};

    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
        2,
        ck::half_t,
        ck::half_t,
        ck::half_t,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough,
        Epilogue>();

    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(in_host,
                                              wei_host,
                                              out_host,
                                              conv_filter_strides_,
                                              conv_filter_dilations_,
                                              input_left_pads_,
                                              input_right_pads_,
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              ck::tensor_operation::element_wise::PassThrough{},
                                              Epilogue{1.0f, 1.0f});
    out_host.SetZero();
    ref_invoker.Run(ref_argument);**/

    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
    {
        // substitute instance values into the template
        auto src = ck::host::InterpolateString(
            conv_compile_check,
            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});

        auto srcs = get_headers_for_test();
        srcs.push_back({"main.cpp", src});
        rtc::compile_options options;
        auto name           = solution.GetTemplateParameter<std::string>("name");
        options.kernel_name = "run_" + name;
        auto k              = rtc::compile_kernel(srcs, options);

        // Grid size calculation
        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");

        auto tmp = get_launch_params(solution, out_lengths, out_strides);

        // presumably tmp is blocks-per-batch and in_lengths[1] is N — TODO confirm
        auto grid_size = tmp * in_lengths[1];

        // launch the kernel with arguments needed for the argument pointer
        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
                                                              wei_dev.data(),
                                                              out_dev.data(),
                                                              in_lengths,
                                                              in_strides,
                                                              wei_lengths,
                                                              wei_strides,
                                                              out_lengths,
                                                              out_strides,
                                                              conv_filter_strides,
                                                              conv_filter_dilations,
                                                              input_left_pads,
                                                              input_right_pads);

        // auto res = rtc::from_gpu(out_dev);
        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
        // assert(pass);

        // Simple check: this checks that the output from each instance matches the output from the
        // first instance
        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
    }
}
|
||||
|
||||
// Entry point: hand the command line to the shared test driver.
int main(int argc, const char* argv[])
{
    test::run(argc, argv);
}
|
||||
154
codegen/test/include/common.hpp
Normal file
154
codegen/test/include/common.hpp
Normal file
@@ -0,0 +1,154 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/host/headers.hpp"
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <test.hpp>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
#include <unordered_set>
|
||||
|
||||
inline std::vector<rtc::src_file> create_headers_for_test()
|
||||
{
|
||||
auto ck_headers = ck::host::GetHeaders();
|
||||
std::vector<rtc::src_file> result;
|
||||
std::transform(ck_headers.begin(), ck_headers.end(), std::back_inserter(result), [](auto& p) {
|
||||
std::string content;
|
||||
content.reserve(p.second.size() + 1);
|
||||
content.push_back(' '); // We need a whitespace before the content for hipRTC to work
|
||||
content.append(p.second.data(), p.second.size());
|
||||
return rtc::src_file{p.first, std::move(content)};
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
inline const std::vector<rtc::src_file>& get_headers_for_test()
|
||||
{
|
||||
static const std::vector<rtc::src_file> headers = create_headers_for_test();
|
||||
return headers;
|
||||
}
|
||||
|
||||
// Number of elements needed to back a strided view described by
// lengths/strides: one past the largest reachable linear offset.
// Zero-length dimensions contribute nothing.
template <typename V>
std::size_t GetSize(V mLens, V mStrides)
{
    std::size_t last_offset = 0; // largest reachable linear offset
    const std::size_t rank  = mLens.Size();
    for(std::size_t d = 0; d < rank; ++d)
    {
        if(mLens[d] != 0)
            last_offset += (mLens[d] - 1) * mStrides[d];
    }
    return last_offset + 1;
}
|
||||
|
||||
template <class T>
|
||||
rtc::buffer<T> generate_buffer(std::size_t n, std::size_t seed = 0)
|
||||
{
|
||||
rtc::buffer<T> result(n);
|
||||
std::mt19937 gen(seed);
|
||||
std::uniform_real_distribution<double> dis(-1.0);
|
||||
std::generate(result.begin(), result.end(), [&] { return dis(gen); });
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class T, typename V>
|
||||
std::enable_if_t<!std::is_integral_v<V>, rtc::buffer<T>>
|
||||
generate_buffer(V mLens, V mStrides, std::size_t seed = 0)
|
||||
{
|
||||
std::size_t space = GetSize(mLens, mStrides);
|
||||
return generate_buffer<T>(space, seed);
|
||||
}
|
||||
|
||||
// Element-wise approximate equality: |x - y| < atol + rtol*|y| for every pair.
// Returns false when the ranges have different lengths.
template <class T, class U>
bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
{
    auto near = [&](double x, double y) { return std::fabs(x - y) < atol + rtol * std::fabs(y); };
    return std::equal(a.begin(), a.end(), b.begin(), b.end(), near);
}
|
||||
|
||||
// Human-readable name for the floating-point category of x.
inline std::string classify(double x)
{
    const int kind = std::fpclassify(x);
    if(kind == FP_INFINITE)
        return "inf";
    if(kind == FP_NAN)
        return "nan";
    if(kind == FP_NORMAL)
        return "normal";
    if(kind == FP_SUBNORMAL)
        return "subnormal";
    if(kind == FP_ZERO)
        return "zero";
    return "unknown";
}
|
||||
|
||||
// Prints the set of distinct FP categories (normal/zero/nan/...) present in x.
template <class Buffer>
void print_classification(const Buffer& x)
{
    std::unordered_set<std::string> categories;
    for(const auto& value : x)
        categories.insert(classify(value));
    for(const auto& name : categories)
        std::cout << name << ", ";
    std::cout << std::endl;
}
|
||||
|
||||
// Prints min, max, mean, and population standard deviation of the buffer.
template <class Buffer>
void print_statistics(const Buffer& x)
{
    std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
    std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
    const double count = x.size();
    double total       = 0.0;
    for(double v : x)
        total += v;
    const double mean = total / count;
    double sum_sq_dev = 0.0;
    for(double v : x)
        sum_sq_dev += std::pow(v - mean, 2.0);
    const double stddev = std::sqrt(sum_sq_dev / count);
    std::cout << "Mean: " << mean << ", ";
    std::cout << "StdDev: " << stddev << "\n";
}
|
||||
|
||||
// Prints the whole buffer when it is small; otherwise the first and last five
// elements with an ellipsis in between.
template <class Buffer>
void print_preview(const Buffer& x)
{
    const auto emit = [](double value) { std::cout << value << ", "; };
    if(x.size() <= 10)
    {
        std::for_each(x.begin(), x.end(), emit);
    }
    else
    {
        std::for_each(x.begin(), x.begin() + 5, emit);
        std::cout << "..., ";
        std::for_each(x.end() - 5, x.end(), emit);
    }
    std::cout << std::endl;
}
|
||||
|
||||
template <class T>
|
||||
struct check_all
|
||||
{
|
||||
rtc::buffer<T> data{};
|
||||
bool operator()(const rtc::buffer<T>& x)
|
||||
{
|
||||
if(data.empty())
|
||||
{
|
||||
data = x;
|
||||
return true;
|
||||
}
|
||||
return allclose(data, x);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Solution>
|
||||
auto report(const Solution& solution, bool pass)
|
||||
{
|
||||
return test::make_predicate(solution.ToTemplateString(), [=] { return pass; });
|
||||
}
|
||||
848
codegen/test/include/test.hpp
Normal file
848
codegen/test/include/test.hpp
Normal file
@@ -0,0 +1,848 @@
|
||||
/*
|
||||
* The MIT License (MIT)
|
||||
*
|
||||
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <atomic>
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <chrono>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#ifdef __linux__
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#ifndef MIGRAPHX_GUARD_TEST_TEST_HPP
|
||||
#define MIGRAPHX_GUARD_TEST_TEST_HPP
|
||||
|
||||
namespace test {
|
||||
// X-macro lists of the operators the expression framework supports. Each entry
// m(op, name) is expanded by the *_OPERATOR_OBJECT macros below into a struct
// `name` with a static call() applying `op` and an as_string() for printing.
// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_BINARY_OPERATORS(m) \
    m(==, equal) \
    m(!=, not_equal) \
    m(<=, less_than_equal) \
    m(>=, greater_than_equal) \
    m(<, less_than) \
    m(>, greater_than) \
    m(and, and_op) \
    m(or, or_op)
// clang-format on

// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_UNARY_OPERATORS(m) \
    m(not, not_op)
// clang-format on

// Generates one operator-object struct per binary operator.
// NOLINTNEXTLINE
#define TEST_EACH_BINARY_OPERATOR_OBJECT(op, name)     \
    struct name                                        \
    {                                                  \
        static std::string as_string() { return #op; } \
        template <class T, class U>                    \
        static decltype(auto) call(T&& x, U&& y)       \
        {                                              \
            return x op y;                             \
        }                                              \
    };

// Generates one operator-object struct per unary operator.
// NOLINTNEXTLINE
#define TEST_EACH_UNARY_OPERATOR_OBJECT(op, name)      \
    struct name                                        \
    {                                                  \
        static std::string as_string() { return #op; } \
        template <class T>                             \
        static decltype(auto) call(T&& x)              \
        {                                              \
            return op x;                               \
        }                                              \
    };

TEST_FOREACH_BINARY_OPERATORS(TEST_EACH_BINARY_OPERATOR_OBJECT)
TEST_FOREACH_UNARY_OPERATORS(TEST_EACH_UNARY_OPERATOR_OBJECT)
|
||||
|
||||
// Identity operator object: forwards its argument unchanged and renders as
// nothing in printed expressions.
struct nop
{
    static std::string as_string() { return {}; }

    template <class T>
    static auto call(T&& x)
    {
        // Preserve the value category while returning by value (auto decays).
        return static_cast<T&&>(x);
    }
};
|
||||
|
||||
// Invocation operator object: evaluates a nullary callable; prints as nothing.
struct function
{
    static std::string as_string() { return {}; }

    template <class T>
    static decltype(auto) call(T&& f)
    {
        return f();
    }
};
|
||||
|
||||
// Streaming helpers: comma-separated range printing plus an operator<< for any
// non-string range (printed as "{ a, b, c}") and for nullptr.
template <class Stream, class Iterator>
Stream& stream_range(Stream& s, Iterator start, Iterator last);

template <class Stream>
inline Stream& operator<<(Stream& s, std::nullptr_t)
{
    s << "nullptr";
    return s;
}

template <class Stream,
          class Range,
          class = typename std::enable_if<not std::is_convertible<Range, std::string>{}>::type>
inline auto operator<<(Stream& s, const Range& v) -> decltype(stream_range(s, v.begin(), v.end()))
{
    s << "{ ";
    stream_range(s, v.begin(), v.end());
    s << "}";
    return s;
}

template <class Stream, class Iterator>
inline Stream& stream_range(Stream& s, Iterator start, Iterator last)
{
    // Guard clause: nothing to print for an empty range.
    if(start == last)
        return s;
    s << *start;
    std::for_each(std::next(start), last, [&](auto&& x) { s << ", " << x; });
    return s;
}
|
||||
|
||||
// Terminal case for expression evaluation: a plain value is its own value.
// (Expression types provide friend overloads that evaluate recursively.)
template <class T>
const T& get_value(const T& x)
{
    return x;
}
|
||||
|
||||
// Forward declarations for the expression-capture machinery.
template <class T, class Operator = nop>
struct lhs_expression;

template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs);

template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator);

// Expanded inside expression/lhs_expression: each supported binary operator
// builds a new expression node recording both operands and the operator.
// NOLINTNEXTLINE
#define TEST_EXPR_BINARY_OPERATOR(op, name)                   \
    template <class V>                                        \
    auto operator op(const V& rhs2) const                     \
    {                                                         \
        return make_expression(*this, rhs2, name{}); /* NOLINT */ \
    }

// Expanded inside expression/lhs_expression: unary operators wrap the lhs.
// NOLINTNEXTLINE
#define TEST_EXPR_UNARY_OPERATOR(op, name) \
    auto operator op() const { return make_lhs_expression(lhs, name{}); /* NOLINT */ }
|
||||
|
||||
// Binary expression node: stores both operands and evaluates them lazily via
// Operator::call. Streams as "lhs <op> rhs" so failed checks print the
// original expression text.
template <class T, class U, class Operator>
struct expression
{
    T lhs;
    U rhs;

    friend std::ostream& operator<<(std::ostream& s, const expression& self)
    {
        s << self.lhs << " " << Operator::as_string() << " " << self.rhs;
        return s;
    }

    friend decltype(auto) get_value(const expression& e) { return e.value(); }

    // Recursively evaluates both sides, then applies the operator.
    decltype(auto) value() const { return Operator::call(get_value(lhs), get_value(rhs)); };

    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
};
|
||||
|
||||
// Factory helpers deducing the node types from the operands.
// TODO: Remove rvalue references
template <class T, class U, class Operator>
expression<T, U, Operator> make_expression(T&& rhs, U&& lhs, Operator)
{
    return {std::forward<T>(rhs), std::forward<U>(lhs)};
}

// TODO: Remove rvalue reference
template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs)
{
    return lhs_expression<T>{std::forward<T>(lhs)};
}

// Overload tagging the captured value with an operator (e.g. nop/function).
template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator)
{
    return lhs_expression<T, Operator>{std::forward<T>(lhs)};
}
|
||||
|
||||
// Unary/leaf expression node: holds the left-hand operand (optionally under a
// unary Operator). Comparison operators on it build full binary expression
// nodes; arithmetic operators fold eagerly into a new leaf.
template <class T, class Operator>
struct lhs_expression
{
    T lhs;
    explicit lhs_expression(T e) : lhs(e) {}

    friend std::ostream& operator<<(std::ostream& s, const lhs_expression& self)
    {
        std::string op = Operator::as_string();
        if(not op.empty())
            s << Operator::as_string() << " ";
        s << self.lhs;
        return s;
    }

    friend decltype(auto) get_value(const lhs_expression& e) { return e.value(); }

    decltype(auto) value() const { return Operator::call(get_value(lhs)); }

    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)

// Arithmetic/bitwise operators evaluate immediately and re-wrap the result.
// NOLINTNEXTLINE
#define TEST_LHS_REOPERATOR(op)                 \
    template <class U>                          \
    auto operator op(const U& rhs) const        \
    {                                           \
        return make_lhs_expression(lhs op rhs); \
    }
    TEST_LHS_REOPERATOR(+)
    TEST_LHS_REOPERATOR(-)
    TEST_LHS_REOPERATOR(*)
    TEST_LHS_REOPERATOR(/)
    TEST_LHS_REOPERATOR(%)
    TEST_LHS_REOPERATOR(&)
    TEST_LHS_REOPERATOR(|)
    TEST_LHS_REOPERATOR(^)
};
|
||||
|
||||
// A labeled nullary callable: streams as its message (so failures print a
// human-readable description) and evaluates by invoking the stored function,
// both explicitly and via implicit conversion to f()'s result type.
template <class F>
struct predicate
{
    std::string msg;
    F f;

    friend std::ostream& operator<<(std::ostream& s, const predicate& self)
    {
        s << self.msg;
        return s;
    }

    decltype(auto) operator()() const { return f(); }

    operator decltype(auto)() const { return f(); }
};
|
||||
|
||||
// Wraps f into an lhs_expression tagged with `function` so evaluating the
// expression calls f, while printing it shows msg.
template <class F>
auto make_predicate(const std::string& msg, F f)
{
    return make_lhs_expression(predicate<F>{msg, f}, function{});
}
|
||||
|
||||
// Boolean overload: prints "true"/"false" instead of 1/0.
inline std::string as_string(bool x) { return x ? "true" : "false"; }
|
||||
|
||||
// Generic stringification of any streamable value via operator<<.
template <class T>
std::string as_string(const T& x)
{
    std::stringstream buffer;
    buffer << x;
    return buffer.str();
}
|
||||
|
||||
// Renders [start, last) as a comma-separated list (no braces).
template <class Iterator>
std::string as_string(Iterator start, Iterator last)
{
    std::stringstream ss;
    stream_range(ss, start, last);
    return ss.str();
}
|
||||
|
||||
// Lifts f into a predicate factory: calling the result with arguments yields a
// lazily-evaluated predicate labeled "name(arg1, arg2, ...)".
template <class F>
auto make_function(const std::string& name, F f)
{
    return [=](auto&&... xs) {
        // Stringify the arguments up front so the label survives evaluation.
        std::vector<std::string> args = {as_string(xs)...};
        return make_predicate(name + "(" + as_string(args.begin(), args.end()) + ")",
                              [=] { return f(xs...); });
    };
}
|
||||
|
||||
// Entry point of the expression-capture trick: `capture{}->* expr` lifts the
// left operand into an lhs_expression; already-lifted expressions pass through.
struct capture
{
    template <class T>
    auto operator->*(const T& x) const
    {
        return make_lhs_expression(x);
    }

    template <class T, class Operator>
    auto operator->*(const lhs_expression<T, Operator>& x) const
    {
        return x;
    }
};
|
||||
|
||||
// ANSI SGR codes used to colorize terminal output.
enum class color
{
    reset      = 0,
    bold       = 1,
    underlined = 4,
    fg_red     = 31,
    fg_green   = 32,
    fg_yellow  = 33,
    fg_blue    = 34,
    fg_default = 39,
    bg_red     = 41,
    bg_green   = 42,
    bg_yellow  = 43,
    bg_blue    = 44,
    bg_default = 49
};
// Emits the escape sequence only when stdout is a TTY; no-op on Windows.
inline std::ostream& operator<<(std::ostream& os, const color& c)
{
#ifndef _WIN32
    static const bool use_color = isatty(STDOUT_FILENO) != 0;
    if(use_color)
        return os << "\033[" << static_cast<std::size_t>(c) << "m";
#else
    (void)c;
#endif
    return os;
}
|
||||
|
||||
// Per-process failure counter for the currently running test case.
// Function-local static so every translation unit shares one instance.
inline std::atomic<int>& failures()
{
    // NOLINTNEXTLINE
    static std::atomic<int> count = 0;
    return count;
}
|
||||
|
||||
// Records a failed check: when the captured expression x evaluates falsy,
// bumps the failure counter, prints the location and the stringified
// expression, then invokes the continuation f (used by abort-on-failure
// variants to stop the test case).
template <class T, class F>
void failed(T x, const char* msg, const char* func, const char* file, int line, F f)
{
    if(not bool(x.value()))
    {
        failures()++;
        std::cout << func << std::endl;
        std::cout << file << ":" << line << ":" << std::endl;
        std::cout << color::bold << color::fg_red << "    FAILED: " << color::reset << msg << " "
                  << "[ " << x << " ]" << std::endl;
        f();
    }
}
|
||||
|
||||
// True iff invoking f throws anything at all.
template <class F>
bool throws(F f)
{
    try
    {
        f();
    }
    catch(...)
    {
        return true;
    }
    return false;
}
|
||||
|
||||
// True iff f throws an Exception whose what() contains msg.
// Other exception types are not caught and propagate to the caller.
template <class Exception, class F>
bool throws(F f, const std::string& msg = "")
{
    try
    {
        f();
    }
    catch(const Exception& ex)
    {
        const std::string what = ex.what();
        return what.find(msg) != std::string::npos;
    }
    return false;
}
|
||||
|
||||
// Labeled predicate asserting |px - py| < ptol; prints as "near(px, py, ptol)".
template <class T, class U>
auto within_abs(T px, U py, double ptol = 1e-6f)
{
    return make_function("near", [](auto x, auto y, auto tol) { return std::abs(x - y) < tol; })(
        px, py, ptol);
}
|
||||
|
||||
// This implements the basic globbing algorithm where `*` matches any number
|
||||
// of characters(including none) and `?` matches any single character. It
|
||||
// doesnt support character classes.
|
||||
//
|
||||
// This is a simple recursive implementation that scans the string where the
|
||||
// string and pattern matches. When a `*` is found in the pattern, the
|
||||
// `glob_match` function is called recursively to compare the rest of the
|
||||
// pattern to the rest of the string. If the recursive call returns true,
|
||||
// then we have a match. However, if it returns false, then we advance one
|
||||
// character and call the recusrsive call again. This is referred to as a
|
||||
// star-loop, which will consume zero or more characters.
|
||||
//
|
||||
// This simple recursive implementation works well for short string and
|
||||
// patterns with few stars. First, it is unlikely to use many stars to glob
|
||||
// test names. Secondly, using many stars is still signficantly faster than
|
||||
// using the equivalent std::regex, which has a much slower time complexity.
|
||||
// Recursive glob matcher: '*' matches zero or more characters, '?' exactly
// one; character classes are not supported (see the comment block above).
template <class Iterator1, class Iterator2>
bool glob_match(Iterator1 start, Iterator1 last, Iterator2 pattern_start, Iterator2 pattern_last)
{
    // Consume the prefix where text and pattern agree ('?' matches anything;
    // stop at '*' so the star-loop below can take over).
    std::tie(start, pattern_start) =
        std::mismatch(start, last, pattern_start, pattern_last, [](auto c, auto m) {
            if(m == '?')
                return true;
            if(m == '*')
                return false;
            return c == m;
        });
    // Pattern exhausted: match only if the text is exhausted too.
    if(pattern_start == pattern_last)
        return start == last;
    // Stopped on a non-star pattern character: plain mismatch.
    if(*pattern_start != '*')
        return false;
    // A run of stars is equivalent to one star.
    pattern_start = std::find_if(pattern_start, pattern_last, [](auto c) { return c != '*'; });
    // Trailing star matches the remainder unconditionally.
    if(pattern_start == pattern_last)
        return true;
    // Star-loop: let the star absorb characters until the tail matches.
    while(not glob_match(start, last, pattern_start, pattern_last) and start != last)
        ++start;
    return start != last;
}
|
||||
|
||||
using string_map = std::unordered_map<std::string, std::vector<std::string>>;

// Splits an argv-style token list into flag -> values buckets. `keyword`
// classifies a token: an empty result means "positional value for the current
// flag"; otherwise front() is the canonical flag name (always inserted) and
// back() becomes the bucket for subsequent values. Leading positionals land
// under the "" key.
template <class Keyword>
string_map generic_parse(std::vector<std::string> as, Keyword keyword)
{
    string_map result;
    std::string current;
    for(auto&& token : as)
    {
        const auto names = keyword(token);
        if(names.empty())
        {
            result[current].push_back(token);
        }
        else
        {
            result[names.front()]; // ensure the canonical flag key exists
            current = names.back();
        }
    }
    return result;
}
|
||||
|
||||
using test_case = std::function<void()>;

// Global registry of (name, body) pairs, populated at static-init time by the
// TEST_CASE machinery and iterated by the driver.
inline auto& get_test_cases()
{
    // NOLINTNEXTLINE
    static std::vector<std::pair<std::string, test_case>> registry;
    return registry;
}

// Appends one test case to the registry.
inline void add_test_case(std::string name, test_case f)
{
    get_test_cases().emplace_back(std::move(name), std::move(f));
}
|
||||
|
||||
// Registers a test case during static initialization; a namespace-scope
// instance of this type is what a TEST_CASE macro expands to.
struct auto_register_test_case
{
    template <class F>
    auto_register_test_case(const char* name, F f) noexcept
    {
        add_test_case(name, f);
    }
};
|
||||
|
||||
// Sentinel exception used to unwind out of a test case on fatal failure; the
// driver catches it and moves on to the next case.
struct failure_error
{
};

// Aborts the current test case by throwing the sentinel.
[[noreturn]] inline void fail() { throw failure_error{}; }
|
||||
struct driver
|
||||
{
|
||||
driver()
|
||||
{
|
||||
add_flag({"--help", "-h"}, "Show help");
|
||||
add_flag({"--list", "-l"}, "List all test cases");
|
||||
add_flag({"--continue", "-c"}, "Continue after failure");
|
||||
add_flag({"--quiet", "-q"}, "Don't print out extra output");
|
||||
}
|
||||
struct argument
|
||||
{
|
||||
std::vector<std::string> flags = {};
|
||||
std::string help = "";
|
||||
int nargs = 1;
|
||||
};
|
||||
|
||||
void add_arg(const std::vector<std::string>& flags, const std::string& help = "")
|
||||
{
|
||||
arguments.push_back(argument{flags, help, 1});
|
||||
}
|
||||
|
||||
void add_flag(const std::vector<std::string>& flags, const std::string& help = "")
|
||||
{
|
||||
arguments.push_back(argument{flags, help, 0});
|
||||
}
|
||||
|
||||
static void wrap(std::ostream& os,
|
||||
const std::string& text,
|
||||
const std::string& prefix = "",
|
||||
unsigned int line_length = 80)
|
||||
{
|
||||
std::istringstream iss(text);
|
||||
std::string line = prefix;
|
||||
do
|
||||
{
|
||||
std::string word;
|
||||
iss >> word;
|
||||
if(line.length() + word.length() > line_length)
|
||||
{
|
||||
os << line << std::endl;
|
||||
line = prefix;
|
||||
}
|
||||
line += word + " ";
|
||||
} while(iss);
|
||||
if(not line.empty())
|
||||
os << line << std::endl;
|
||||
}
|
||||
|
||||
void show_help(const std::string& exe) const
|
||||
{
|
||||
const std::string prefix = " ";
|
||||
std::cout << std::endl;
|
||||
std::cout << color::fg_yellow << "USAGE:" << color::reset << std::endl;
|
||||
std::cout << " ";
|
||||
std::cout << exe << " <test-case>... <options>" << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << color::fg_yellow << "ARGS:" << color::reset << std::endl;
|
||||
std::cout << " ";
|
||||
std::cout << color::fg_green << "<test-case>..." << color::reset;
|
||||
std::cout << std::endl;
|
||||
|
||||
wrap(std::cout,
|
||||
"Test cases to run. A test case can be either the exact test case name or a glob. A "
|
||||
"glob expression uses a '*' to select zero or more characters or a '?' to select any "
|
||||
"single character.",
|
||||
prefix + prefix);
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << color::fg_yellow << "OPTIONS:" << color::reset << std::endl;
|
||||
for(auto&& arg : arguments)
|
||||
{
|
||||
std::cout << color::fg_green;
|
||||
std::string arg_prefix = prefix;
|
||||
for(const std::string& a : arg.flags)
|
||||
{
|
||||
std::cout << arg_prefix;
|
||||
std::cout << a;
|
||||
arg_prefix = ", ";
|
||||
}
|
||||
std::cout << color::reset << std::endl;
|
||||
wrap(std::cout, arg.help, prefix + prefix);
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& out() const
|
||||
{
|
||||
struct null_buffer : std::streambuf
|
||||
{
|
||||
virtual int overflow(int c) override { return c; }
|
||||
};
|
||||
static null_buffer buffer;
|
||||
static std::ostream null_stream(&buffer);
|
||||
if(quiet)
|
||||
return null_stream;
|
||||
return std::cout;
|
||||
}
|
||||
|
||||
string_map parse(int argc, const char* argv[]) const
|
||||
{
|
||||
std::vector<std::string> args(argv + 1, argv + argc);
|
||||
string_map keys;
|
||||
for(auto&& arg : arguments)
|
||||
{
|
||||
for(auto&& flag : arg.flags)
|
||||
{
|
||||
keys[flag] = {arg.flags.front()};
|
||||
if(arg.nargs == 0)
|
||||
keys[flag].push_back("");
|
||||
}
|
||||
}
|
||||
auto result = generic_parse(args, [&](auto&& s) -> std::vector<std::string> {
|
||||
if(keys.count(s) > 0)
|
||||
return keys[s];
|
||||
else
|
||||
return {};
|
||||
});
|
||||
result["__exe__"].push_back(argv[0]);
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string create_command(const string_map& args)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << args.at("__exe__").front();
|
||||
if(args.count("") > 0)
|
||||
{
|
||||
for(auto&& arg : args.at(""))
|
||||
ss << " \"" << arg << "\"";
|
||||
}
|
||||
for(auto&& p : args)
|
||||
{
|
||||
if(p.first == "__exe__")
|
||||
continue;
|
||||
if(p.first.empty())
|
||||
continue;
|
||||
ss << " " << p.first;
|
||||
for(auto&& arg : p.second)
|
||||
ss << " \"" << arg << "\"";
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
static std::string fork(const std::string& name, string_map args)
|
||||
{
|
||||
std::string msg;
|
||||
args[""] = {name};
|
||||
args.erase("--continue");
|
||||
args["--quiet"];
|
||||
auto cmd = create_command(args);
|
||||
auto r = std::system(cmd.c_str()); // NOLINT
|
||||
if(r != 0)
|
||||
msg = "Exited with " + std::to_string(r);
|
||||
return msg;
|
||||
}
|
||||
|
||||
static std::vector<std::pair<std::string, test_case>> glob_tests(const std::string& pattern)
|
||||
{
|
||||
std::vector<std::pair<std::string, test_case>> result;
|
||||
std::copy_if(get_test_cases().begin(),
|
||||
get_test_cases().end(),
|
||||
std::back_inserter(result),
|
||||
[&](auto&& p) {
|
||||
return glob_match(
|
||||
p.first.begin(), p.first.end(), pattern.begin(), pattern.end());
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
// Execute one test case and print a gtest-style RUN/COMPLETE/FAILED report
// with elapsed time. With --continue the test runs in a child process so a
// crash is reported instead of aborting the whole run; otherwise it runs
// in-process and failures are counted via the global failures() counter.
void run_test_case(const std::string& name, const test_case& f, const string_map& args)
{
    ran++;
    out() << color::fg_green << "[ RUN ] " << color::reset << color::bold << name
          << color::reset << std::endl;
    // Non-empty msg at the end means the test failed.
    std::string msg;
    auto start = std::chrono::steady_clock::now();
    if(args.count("--continue") > 0)
    {
        // Isolate the test in a subprocess; fork() returns an error message
        // (exit status) on failure.
        msg = fork(name, args);
    }
    else
    {
        try
        {
            // Reset the per-test failure counter before running the body.
            failures() = 0;
            f();
        }
        // failure_error is thrown to abort the test early; the failure has
        // already been recorded in failures(), so nothing to do here.
        // cppcheck-suppress migraphx-EmptyCatchStatement
        catch(const failure_error&)
        {
        }
    }
    auto finish = std::chrono::steady_clock::now();
    auto elapsed_ms =
        std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(finish - start)
            .count();
    // In-process run: translate the failure count into a message.
    if(msg.empty() and failures() != 0)
    {
        if(failures() == 1)
            msg = "Test failure";
        else
            msg = std::to_string(failures()) + " test failures";
    }
    if(msg.empty())
    {
        out() << color::fg_green << "[ COMPLETE ] " << color::reset;
    }
    else
    {
        failed.push_back(name);
        out() << color::fg_red << "[ FAILED ] " << color::reset;
    }
    out() << color::bold << name << color::reset;
    out() << color::fg_blue << " (" << elapsed_ms << "ms)" << color::reset;
    if(not msg.empty())
        out() << ": " << color::fg_yellow << msg << color::reset;
    out() << std::endl;
}
|
||||
|
||||
// Driver entry point: parse the command line, handle --help/--list/--quiet,
// then run either every registered test or the named/globbed subset.
// Exits with status 1 (after printing a summary) if any test failed.
void run(int argc, const char* argv[])
{
    auto args = parse(argc, argv);
    if(args.count("--help") > 0)
    {
        show_help(args.at("__exe__").front());
        return;
    }
    if(args.count("--list") > 0)
    {
        // Just print the registered test names, one per line.
        for(auto&& tc : get_test_cases())
            out() << tc.first << std::endl;
        return;
    }

    if(args.count("--quiet") > 0)
        quiet = true;

    // Positional arguments select which test cases to run.
    auto cases = args[""];
    if(cases.empty())
    {
        // No selection: run everything in registration order.
        for(auto&& tc : get_test_cases())
            run_test_case(tc.first, tc.second, args);
    }
    else
    {
        // Name -> test lookup for exact matches.
        std::unordered_map<std::string, test_case> m(get_test_cases().begin(),
                                                     get_test_cases().end());

        for(auto&& iname : cases)
        {
            std::vector<std::pair<std::string, test_case>> found_cases;
            // get_case_names may expand one requested name into several
            // patterns (identity mapping by default).
            for(auto&& pattern : get_case_names(iname))
            {
                auto f = m.find(pattern);
                if(f == m.end())
                {
                    // Not an exact name: treat the pattern as a glob.
                    // NOTE(review): this overwrites matches from earlier
                    // patterns of the same iname rather than appending —
                    // presumably only one pattern globs in practice; verify.
                    found_cases = glob_tests(pattern);
                }
                else
                {
                    found_cases.push_back(*f);
                }
            }
            if(found_cases.empty())
            {
                out() << color::fg_red << "[ ERROR ] Test case '" << iname << "' not found."
                      << color::reset << std::endl;
                failed.push_back(iname);
            }
            for(auto&& p : found_cases)
                run_test_case(p.first, p.second, args);
        }
    }
    out() << color::fg_green << "[==========] " << color::fg_yellow << ran << " tests ran"
          << color::reset << std::endl;
    if(not failed.empty())
    {
        out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << failed.size()
              << " tests failed" << color::reset << std::endl;
        for(auto&& name : failed)
            out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << name
                  << color::reset << std::endl;
        // Non-zero exit so CI detects the failure.
        std::exit(1);
    }
}
|
||||
|
||||
// Maps a requested case name to the patterns to run; identity by default
// (run exactly the named case). Tests may install a custom expansion.
std::function<std::vector<std::string>(const std::string&)> get_case_names =
    [](const std::string& name) -> std::vector<std::string> { return {name}; };
// Command-line argument specifications consumed by parse()/show_help().
std::vector<argument> arguments = {};
// Names of test cases (or unmatched selectors) that failed so far.
std::vector<std::string> failed = {};
// Number of test cases executed.
std::size_t ran = 0;
// When true, per-test output is suppressed (set by --quiet).
bool quiet = false;
|
||||
};
|
||||
|
||||
inline void run(int argc, const char* argv[])
|
||||
{
|
||||
driver d{};
|
||||
d.run(argc, argv);
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define TEST_CAPTURE(...) test::capture{}->*__VA_ARGS__
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define CHECK(...) \
|
||||
test::failed( \
|
||||
TEST_CAPTURE(__VA_ARGS__), #__VA_ARGS__, __PRETTY_FUNCTION__, __FILE__, __LINE__, [] {})
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define EXPECT(...) \
|
||||
test::failed(TEST_CAPTURE(__VA_ARGS__), \
|
||||
#__VA_ARGS__, \
|
||||
__PRETTY_FUNCTION__, \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
&test::fail)
|
||||
// NOLINTNEXTLINE
|
||||
#define STATUS(...) EXPECT((__VA_ARGS__) == 0)
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define TEST_CAT(x, ...) TEST_PRIMITIVE_CAT(x, __VA_ARGS__)
|
||||
// NOLINTNEXTLINE
|
||||
#define TEST_PRIMITIVE_CAT(x, ...) x##__VA_ARGS__
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define TEST_CASE_REGISTER(...) \
|
||||
static test::auto_register_test_case TEST_CAT(register_test_case_, __LINE__) = \
|
||||
test::auto_register_test_case(#__VA_ARGS__, &__VA_ARGS__);
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
#define TEST_CASE(...) \
|
||||
void __VA_ARGS__(); \
|
||||
TEST_CASE_REGISTER(__VA_ARGS__) \
|
||||
void __VA_ARGS__()
|
||||
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wglobal-constructors"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
12
codegen/test/rtc/CMakeLists.txt
Normal file
12
codegen/test/rtc/CMakeLists.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
find_package(hip)
|
||||
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
|
||||
add_library(ck_rtc ${RTC_SOURCES})
|
||||
target_include_directories(ck_rtc PUBLIC include)
|
||||
target_link_libraries(ck_rtc PUBLIC hip::host)
|
||||
target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
|
||||
|
||||
option(USE_HIPRTC_FOR_CODEGEN_TESTS "Whether to enable hipRTC for codegen tests." ON)
|
||||
if(USE_HIPRTC_FOR_CODEGEN_TESTS)
|
||||
target_compile_definitions(ck_rtc PUBLIC HIPRTC_FOR_CODEGEN_TESTS)
|
||||
message("CK compiled with USE_HIPRTC_FOR_CODEGEN_TESTS set to ${USE_HIPRTC_FOR_CODEGEN_TESTS}")
|
||||
endif()
|
||||
31
codegen/test/rtc/include/rtc/compile_kernel.hpp
Normal file
31
codegen/test/rtc/include/rtc/compile_kernel.hpp
Normal file
@@ -0,0 +1,31 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
|
||||
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
|
||||
|
||||
#include <rtc/kernel.hpp>
|
||||
#include <rtc/filesystem.hpp>
|
||||
#include <string>
|
||||
|
||||
namespace rtc {
|
||||
|
||||
struct src_file
|
||||
{
|
||||
src_file(std::filesystem::path p, std::string c) : path{std::move(p)}, content{std::move(c)} {}
|
||||
fs::path path;
|
||||
std::string content;
|
||||
};
|
||||
|
||||
// Options for compile_kernel().
struct compile_options
{
    // Extra compiler flags appended to the defaults (-I. -O3 -std=c++17 ...).
    std::string flags = "";
    // Name of the kernel symbol to load from the compiled code object.
    std::string kernel_name = "main";
};
|
||||
|
||||
kernel compile_kernel(const std::vector<src_file>& srcs,
|
||||
compile_options options = compile_options{});
|
||||
|
||||
} // namespace rtc
|
||||
|
||||
#endif
|
||||
60
codegen/test/rtc/include/rtc/filesystem.hpp
Normal file
60
codegen/test/rtc/include/rtc/filesystem.hpp
Normal file
@@ -0,0 +1,60 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
|
||||
#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
// clang-format off
|
||||
#if defined(CPPCHECK)
|
||||
#define RTC_HAS_FILESYSTEM 1
|
||||
#define RTC_HAS_FILESYSTEM_TS 1
|
||||
#elif defined(_WIN32)
|
||||
#if _MSC_VER >= 1920
|
||||
#define RTC_HAS_FILESYSTEM 1
|
||||
#define RTC_HAS_FILESYSTEM_TS 0
|
||||
#elif _MSC_VER >= 1900
|
||||
#define RTC_HAS_FILESYSTEM 0
|
||||
#define RTC_HAS_FILESYSTEM_TS 1
|
||||
#else
|
||||
#define RTC_HAS_FILESYSTEM 0
|
||||
#define RTC_HAS_FILESYSTEM_TS 0
|
||||
#endif
|
||||
#elif defined(__has_include)
|
||||
#if __has_include(<filesystem>) && __cplusplus >= 201703L
|
||||
#define RTC_HAS_FILESYSTEM 1
|
||||
#else
|
||||
#define RTC_HAS_FILESYSTEM 0
|
||||
#endif
|
||||
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
|
||||
#define RTC_HAS_FILESYSTEM_TS 1
|
||||
#else
|
||||
#define RTC_HAS_FILESYSTEM_TS 0
|
||||
#endif
|
||||
#else
|
||||
#define RTC_HAS_FILESYSTEM 0
|
||||
#define RTC_HAS_FILESYSTEM_TS 0
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
#if RTC_HAS_FILESYSTEM
|
||||
#include <filesystem>
|
||||
#elif RTC_HAS_FILESYSTEM_TS
|
||||
#include <experimental/filesystem>
|
||||
#else
|
||||
#error "No filesystem include available"
|
||||
#endif
|
||||
|
||||
namespace rtc {
|
||||
|
||||
#if RTC_HAS_FILESYSTEM
|
||||
namespace fs = ::std::filesystem;
|
||||
#elif RTC_HAS_FILESYSTEM_TS
|
||||
namespace fs = ::std::experimental::filesystem;
|
||||
#endif
|
||||
|
||||
} // namespace rtc
|
||||
|
||||
#endif // GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
|
||||
83
codegen/test/rtc/include/rtc/hip.hpp
Normal file
83
codegen/test/rtc/include/rtc/hip.hpp
Normal file
@@ -0,0 +1,83 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
|
||||
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace rtc {
|
||||
|
||||
// Shared-ownership, fixed-size array of T used to stage data to and from the
// GPU. Copies are shallow: every copy references the same underlying storage.
template <class T>
struct buffer
{
    buffer() : ptr(), n(0) {}
    // Adopt an existing typed allocation of sz elements (shared ownership).
    buffer(std::shared_ptr<T> p, std::size_t sz) : ptr(p), n(sz) {}
    // Adopt an untyped allocation (e.g. from allocate_gpu/read_from_gpu),
    // reinterpreting it as an array of T.
    buffer(std::shared_ptr<void> p, std::size_t sz)
        : ptr(std::reinterpret_pointer_cast<T>(p)), n(sz)
    {
    }
    // Allocate sz elements on the host.
    // Fix: shared_ptr<T>(new T[sz]) would destroy the array with scalar
    // delete (undefined behavior); supply an array deleter instead.
    explicit buffer(std::size_t sz) : ptr(new T[sz], std::default_delete<T[]>{}), n(sz) {}

    T* begin() { return data(); }
    T* end() { return data() + size(); }
    const T* begin() const { return data(); }
    const T* end() const { return data() + size(); }

    T& front() { return data()[0]; }
    T& back() { return data()[size() - 1]; }
    T& operator[](std::size_t i) { return data()[i]; }
    // Bounds-checked access; throws std::runtime_error when out of range.
    T& at(std::size_t i)
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }

    const T& front() const { return data()[0]; }
    const T& back() const { return data()[size() - 1]; }
    const T& operator[](std::size_t i) const { return data()[i]; }
    const T& at(std::size_t i) const
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }
    const T* data() const { return ptr.get(); }
    T* data() { return ptr.get(); }

    std::size_t size() const { return n; }
    std::size_t bytes() const { return size() * sizeof(T); }

    bool empty() const { return size() == 0; }

    private:
    std::shared_ptr<T> ptr; // shared storage; may alias GPU or host memory
    std::size_t n;          // element count
};
|
||||
|
||||
std::string get_device_name();
|
||||
std::string hip_error(int error);
|
||||
|
||||
std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false);
|
||||
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false);
|
||||
std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz);
|
||||
|
||||
template <class T>
|
||||
buffer<T> to_gpu(const buffer<T>& input)
|
||||
{
|
||||
return {write_to_gpu(input.data(), input.bytes()), input.size()};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
buffer<T> from_gpu(const buffer<T>& input)
|
||||
{
|
||||
return {read_from_gpu(input.data(), input.bytes()), input.size()};
|
||||
}
|
||||
|
||||
} // namespace rtc
|
||||
|
||||
#endif
|
||||
65
codegen/test/rtc/include/rtc/kernel.hpp
Normal file
65
codegen/test/rtc/include/rtc/kernel.hpp
Normal file
@@ -0,0 +1,65 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
|
||||
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace rtc {
|
||||
|
||||
// Type-erased, non-owning view of a single kernel argument: size, alignment
// and the address of the caller's object. The referenced object must outlive
// any use of this view (e.g. until pack_args copies its bytes).
struct kernel_argument
{
    // Accept any type except kernel_argument itself (the enable_if prevents
    // this converting constructor from hijacking copy construction).
    template <class T,
              class U = std::remove_reference_t<T>,
              class = std::enable_if_t<not std::is_base_of<kernel_argument, T>{}>>
    kernel_argument(T&& x) : size(sizeof(U)), align(alignof(U)), data(&x) // NOLINT
    {
    }
    std::size_t size;  // sizeof the referenced object
    std::size_t align; // alignof the referenced object
    void* data;        // address of the referenced object (not owned)
};
|
||||
|
||||
std::vector<char> pack_args(const std::vector<kernel_argument>& args);
|
||||
|
||||
struct kernel_impl;
|
||||
|
||||
// Handle to a loaded GPU kernel. Copyable; copies share the underlying
// module/function via the pimpl shared_ptr.
struct kernel
{
    kernel() = default;
    // Load a code object image and resolve the kernel function `name`.
    kernel(const char* image, const std::string& name);
    // Convenience overload for byte containers (std::vector<char> etc.).
    template <class T>
    kernel(const std::vector<T>& image, const std::string& name)
        : kernel(reinterpret_cast<const char*>(image.data()), name)
    {
        static_assert(sizeof(T) == 1, "Only byte types");
    }

    // Launch with packed, type-erased arguments (sizes/alignments preserved).
    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
                const std::vector<kernel_argument>& args) const;

    // Launch with a raw pointer-array argument block.
    void launch(hipStream_t stream,
                std::size_t global,
                std::size_t local,
                std::vector<void*> args) const;

    // Curried form: returns a callable that takes the kernel arguments, e.g.
    // k.launch(stream, g, l)(a, b, c).
    template <class... Ts>
    auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
    {
        return [=](auto&&... xs) {
            launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
        };
    }

    private:
    std::shared_ptr<kernel_impl> impl; // owns the HIP module and function
};
|
||||
} // namespace rtc
|
||||
|
||||
#endif
|
||||
58
codegen/test/rtc/include/rtc/manage_ptr.hpp
Normal file
58
codegen/test/rtc/include/rtc/manage_ptr.hpp
Normal file
@@ -0,0 +1,58 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
|
||||
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
|
||||
|
||||
#include <type_traits>
|
||||
#include <memory>
|
||||
|
||||
namespace rtc {
|
||||
template <class F, F f>
|
||||
struct manage_deleter
|
||||
{
|
||||
template <class T>
|
||||
void operator()(T* x) const
|
||||
{
|
||||
if(x != nullptr)
|
||||
{
|
||||
(void)f(x);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Deleter that does nothing: used to wrap non-owned pointers in smart
// pointers without taking ownership.
struct null_deleter
{
    template <class T>
    void operator()(T*) const
    {
    }
};
|
||||
|
||||
template <class T, class F, F f>
|
||||
using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;
|
||||
|
||||
template <class T>
|
||||
struct element_type
|
||||
{
|
||||
using type = typename T::element_type;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
using remove_ptr = typename std::
|
||||
conditional_t<std::is_pointer<T>{}, std::remove_pointer<T>, element_type<T>>::type;
|
||||
|
||||
template <class T>
|
||||
using shared = std::shared_ptr<remove_ptr<T>>;
|
||||
|
||||
template <class T>
|
||||
shared<T> share(T p)
|
||||
{
|
||||
return shared<T>{std::move(p)};
|
||||
}
|
||||
|
||||
#define RTC_MANAGE_PTR(T, F) rtc::manage_ptr<std::remove_pointer_t<T>, decltype(&F), &F>
|
||||
|
||||
} // namespace rtc
|
||||
|
||||
#endif
|
||||
27
codegen/test/rtc/include/rtc/tmp_dir.hpp
Normal file
27
codegen/test/rtc/include/rtc/tmp_dir.hpp
Normal file
@@ -0,0 +1,27 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
|
||||
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
|
||||
|
||||
#include <string>
|
||||
#include <rtc/filesystem.hpp>
|
||||
|
||||
namespace rtc {
|
||||
|
||||
// RAII temporary directory: created on construction (name optionally
// prefixed), removed in the destructor. Non-copyable since the directory
// has a single owner. Definitions live in tmp_dir.cpp.
struct tmp_dir
{
    fs::path path; // absolute path of the created directory
    tmp_dir(const std::string& prefix = "");

    // Run a shell command with this directory as its working context.
    void execute(const std::string& cmd) const;

    tmp_dir(tmp_dir const&) = delete;
    tmp_dir& operator=(tmp_dir const&) = delete;

    ~tmp_dir();
};
|
||||
|
||||
} // namespace rtc
|
||||
|
||||
#endif
|
||||
302
codegen/test/rtc/src/compile_kernel.cpp
Normal file
302
codegen/test/rtc/src/compile_kernel.cpp
Normal file
@@ -0,0 +1,302 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <rtc/hip.hpp>
|
||||
#include <rtc/compile_kernel.hpp>
|
||||
#ifdef HIPRTC_FOR_CODEGEN_TESTS
|
||||
#include <hip/hiprtc.h>
|
||||
#include <rtc/manage_ptr.hpp>
|
||||
#endif
|
||||
#include <rtc/tmp_dir.hpp>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <deque>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace rtc {
|
||||
|
||||
// True when `value` ends with `suffix` (an empty suffix always matches).
bool EndsWith(const std::string& value, const std::string& suffix)
{
    if(value.size() < suffix.size())
        return false;
    return value.compare(value.size() - suffix.size(), suffix.size(), suffix) == 0;
}
|
||||
|
||||
// Split `s` on `delim`, keeping empty tokens; an empty input yields a single
// empty token and a trailing delimiter yields a trailing empty token.
std::vector<std::string> SplitString(const std::string& s, char delim)
{
    std::vector<std::string> parts;
    std::string::size_type begin = 0;
    for(;;)
    {
        auto pos = s.find(delim, begin);
        if(pos == std::string::npos)
        {
            parts.push_back(s.substr(begin));
            break;
        }
        parts.push_back(s.substr(begin, pos - begin));
        begin = pos + 1;
    }
    return parts;
}
|
||||
|
||||
// Read `nbytes` bytes of `filename` starting at `offset` into a container T
// (T must be constructible as T(count, fill) with contiguous storage, e.g.
// std::string or std::vector<char>). nbytes == 0 means "rest of the file".
// Throws std::runtime_error on a bad offset, empty result, or read failure.
template <class T>
T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    // Open at end (ate) so tellg() immediately reports the file size.
    std::ifstream is(filename, std::ios::binary | std::ios::ate);
    if(nbytes == 0)
    {
        // if there is a non-zero offset and nbytes is not set,
        // calculate size of remaining bytes to read
        nbytes = is.tellg();
        if(offset > nbytes)
            throw std::runtime_error("offset is larger than file size");
        nbytes -= offset;
    }
    if(nbytes < 1)
        throw std::runtime_error("Invalid size for: " + filename);
    is.seekg(offset, std::ios::beg);

    T buffer(nbytes, 0);
    if(not is.read(&buffer[0], nbytes))
        throw std::runtime_error("Error reading file: " + filename);
    return buffer;
}
|
||||
|
||||
// Read a byte range of a file into a vector<char> (whole file by default).
std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    return generic_read_file<std::vector<char>>(filename, offset, nbytes);
}
|
||||
|
||||
// Read an entire file into a std::string.
std::string read_string(const std::string& filename)
{
    return generic_read_file<std::string>(filename);
}
|
||||
|
||||
// Write `size` bytes to `filename`, truncating any existing file.
// Fix: open in binary mode — this function writes compiled code objects, and
// text-mode newline translation (on platforms that perform it) would corrupt
// them byte-for-byte.
void write_buffer(const std::string& filename, const char* buffer, std::size_t size)
{
    std::ofstream os(filename, std::ios::binary);
    os.write(buffer, size);
}
|
||||
// Write the full contents of `buffer` to `filename`.
void write_buffer(const std::string& filename, const std::vector<char>& buffer)
{
    write_buffer(filename, buffer.data(), buffer.size());
}
|
||||
// Write a string view's bytes to `filename` (used for source files).
void write_string(const std::string& filename, const std::string_view& buffer)
{
    write_buffer(filename, buffer.data(), buffer.size());
}
|
||||
|
||||
// Compiler invocation for offline (non-hipRTC) kernel compilation.
// Hard-coded ROCm install path; --cuda-device-only emits device code only.
std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only"; }
|
||||
// TODO: undo after extracting the codeobj
|
||||
// std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip"; }
|
||||
|
||||
// Compile the given sources with the offline clang driver inside a temporary
// directory and load the resulting code object as a kernel. The first .cpp
// file's stem names the output object; header sources are only written to
// disk so -I. resolves them.
kernel clang_compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
    assert(not srcs.empty());
    tmp_dir td{"compile"};
    options.flags += " -I. -O3";
    options.flags += " -std=c++17";
    // Target the architecture of the currently selected device.
    options.flags += " --offload-arch=" + get_device_name();
    std::string out;

    for(const auto& src : srcs)
    {
        // Materialize each in-memory source under the temp dir, creating any
        // intermediate directories its relative path requires.
        fs::path full_path   = td.path / src.path;
        fs::path parent_path = full_path.parent_path();
        fs::create_directories(parent_path);
        write_string(full_path.string(), src.content);
        if(src.path.extension().string() == ".cpp")
        {
            options.flags += " -c " + src.path.filename().string();
            // First translation unit determines the output object name.
            if(out.empty())
                out = src.path.stem().string() + ".o";
        }
    }

    options.flags += " -o " + out;
    td.execute(compiler() + options.flags);

    auto out_path = td.path / out;
    if(not fs::exists(out_path))
        throw std::runtime_error("Output file missing: " + out);

    auto obj = read_buffer(out_path.string());

    // NOTE(review): debug leftover — dumps the code object to obj.o in the
    // current working directory on every compile; consider removing.
    std::ofstream ofh("obj.o", std::ios::binary);
    for(auto i : obj)
        ofh << i;
    ofh.close();
    // int s = std::system(("/usr/bin/cp " + out_path.string() + " codeobj.bin").c_str());
    // assert(s == 0);
    return kernel{obj.data(), options.kernel_name};
}
|
||||
|
||||
#ifdef HIPRTC_FOR_CODEGEN_TESTS
|
||||
|
||||
// Format a hipRTC error code and context message into one string.
std::string hiprtc_error(hiprtcResult err, const std::string& msg)
{
    return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg));
}
|
||||
|
||||
// Throw std::runtime_error if a hipRTC call did not return HIPRTC_SUCCESS.
void hiprtc_check_error(hiprtcResult err, const std::string& msg = "")
{
    if(err != HIPRTC_SUCCESS)
        throw std::runtime_error(hiprtc_error(err, msg));
}
|
||||
|
||||
// String-only mirror of src_file (hipRTC takes C strings, not fs::path).
struct hiprtc_src_file
{
    hiprtc_src_file() = default;
    hiprtc_src_file(const src_file& s) : path(s.path.string()), content(s.content) {}
    std::string path;
    std::string content;
};
|
||||
|
||||
// Adapter so RTC_MANAGE_PTR can destroy a program (the API takes hiprtcProgram*).
void hiprtc_program_destroy(hiprtcProgram prog) { hiprtcDestroyProgram(&prog); }
|
||||
using hiprtc_program_ptr = RTC_MANAGE_PTR(hiprtcProgram, hiprtc_program_destroy);
|
||||
|
||||
// Create a hipRTC program, wrapping the raw handle in a managed pointer
// BEFORE checking the result so a partially-created program is not leaked
// when hiprtcCreateProgram fails.
template <class... Ts>
hiprtc_program_ptr hiprtc_program_create(Ts... xs)
{
    hiprtcProgram prog = nullptr;
    auto result        = hiprtcCreateProgram(&prog, xs...);
    hiprtc_program_ptr p{prog};
    hiprtc_check_error(result, "Create program failed.");
    return p;
}
|
||||
|
||||
// Owns one hipRTC program: a single .cpp translation unit plus in-memory
// headers, with helpers to compile it and extract the code object / log.
struct hiprtc_program
{
    // Parallel storage of strings and their c_str() pointers for the hipRTC
    // C API. A deque is used so push_back never reallocates existing
    // elements — the cached c_str() pointers stay valid.
    struct string_array
    {
        std::deque<std::string> strings{};
        std::vector<const char*> c_strs{};

        string_array() {}
        string_array(const string_array&) = delete;

        std::size_t size() const { return strings.size(); }

        const char** data() { return c_strs.data(); }

        void push_back(std::string s)
        {
            strings.push_back(std::move(s));
            c_strs.push_back(strings.back().c_str());
        }
    };

    hiprtc_program_ptr prog = nullptr; // managed hipRTC program handle
    string_array headers{};            // header contents, parallel to include_names
    string_array include_names{};      // header paths as seen by #include
    std::string cpp_src = "";          // the single translation unit's source
    std::string cpp_name = "";         // its file name

    // Build a program from a single source string.
    hiprtc_program(const std::string& src, const std::string& name = "main.cpp")
        : cpp_src(src), cpp_name(name)
    {
        create_program();
    }

    // Build a program from a set of files: the .cpp becomes the translation
    // unit (assumes exactly one .cpp — a later one would overwrite; TODO
    // confirm callers guarantee this), everything else becomes a header.
    hiprtc_program(std::vector<src_file> srcs)
    {
        for(auto&& src : srcs)
        {
            // Relies on fs::path's implicit conversion to std::string.
            if(EndsWith(src.path, ".cpp"))
            {
                cpp_src = std::move(src.content);
                cpp_name = std::move(src.path);
            }
            else
            {
                headers.push_back(std::move(src.content));
                include_names.push_back(std::move(src.path));
            }
        }
        create_program();
    }

    // Hand the collected sources to hipRTC.
    void create_program()
    {
        assert(not cpp_src.empty());
        assert(not cpp_name.empty());
        assert(headers.size() == include_names.size());
        prog = hiprtc_program_create(cpp_src.c_str(),
                                     cpp_name.c_str(),
                                     headers.size(),
                                     headers.data(),
                                     include_names.data());
    }

    // Compile with the given flags; prints the program log to stderr unless
    // quiet, and throws on compilation failure.
    void compile(const std::vector<std::string>& options, bool quiet = false) const
    {
        std::vector<const char*> c_options;
        std::transform(options.begin(),
                       options.end(),
                       std::back_inserter(c_options),
                       [](const std::string& s) { return s.c_str(); });
        auto result = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
        auto prog_log = log();
        if(not prog_log.empty() and not quiet)
        {
            std::cerr << prog_log << std::endl;
        }
        if(result != HIPRTC_SUCCESS)
            throw std::runtime_error("Compilation failed.");
    }

    // Fetch the compilation log (empty string when there is none).
    std::string log() const
    {
        std::size_t n = 0;
        hiprtc_check_error(hiprtcGetProgramLogSize(prog.get(), &n));
        if(n == 0)
            return {};
        std::string buffer(n, '\0');
        hiprtc_check_error(hiprtcGetProgramLog(prog.get(), buffer.data()));
        assert(buffer.back() != 0);
        return buffer;
    }

    // Fetch the compiled code object bytes.
    std::vector<char> get_code_obj() const
    {
        std::size_t n = 0;
        hiprtc_check_error(hiprtcGetCodeSize(prog.get(), &n));
        std::vector<char> buffer(n);
        hiprtc_check_error(hiprtcGetCode(prog.get(), buffer.data()));
        return buffer;
    }
};
|
||||
|
||||
// Compile the sources with hipRTC and return the resulting code object(s)
// (currently always a single element). options.flags is split on spaces, so
// individual flags must not contain embedded spaces.
std::vector<std::vector<char>> compile_hip_src_with_hiprtc(const std::vector<src_file>& srcs,
                                                           const compile_options& options)
{
    hiprtc_program prog(srcs);
    auto flags = SplitString(options.flags, ' ');
    prog.compile(flags);
    return {prog.get_code_obj()};
}
|
||||
|
||||
// Compile the sources with hipRTC (adding the standard flags and the current
// device's architecture) and load the resulting code object as a kernel.
// Throws std::runtime_error when compilation does not yield exactly one
// code object.
static kernel hiprtc_compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
    options.flags += " -I. -O3";
    options.flags += " -std=c++17";
    options.flags += " -DCK_CODE_GEN_RTC";
    options.flags += " --offload-arch=" + get_device_name();
    auto cos = compile_hip_src_with_hiprtc(srcs, options);
    // Fix: the exception was previously constructed but never thrown, so an
    // empty result would fall through to cos.front() on an empty vector (UB).
    if(cos.size() != 1)
        throw std::runtime_error("No code object");
    auto& obj = cos.front();
    return kernel{obj.data(), options.kernel_name};
}
|
||||
|
||||
#endif
|
||||
|
||||
// Public entry point: dispatch to the hipRTC path when the build enables
// HIPRTC_FOR_CODEGEN_TESTS, otherwise to the offline clang path.
kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
#ifdef HIPRTC_FOR_CODEGEN_TESTS
    return hiprtc_compile_kernel(srcs, options);
#else
    return clang_compile_kernel(srcs, options);
#endif
}
|
||||
|
||||
} // namespace rtc
|
||||
109
codegen/test/rtc/src/hip.cpp
Normal file
109
codegen/test/rtc/src/hip.cpp
Normal file
@@ -0,0 +1,109 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <rtc/hip.hpp>
|
||||
#include <rtc/manage_ptr.hpp>
|
||||
#include <stdexcept>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
namespace rtc {
|
||||
|
||||
using hip_ptr = RTC_MANAGE_PTR(void, hipFree);
|
||||
|
||||
// Translate a hipError_t (passed as int to keep the header HIP-free) to text.
std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
|
||||
|
||||
// Return the id of the currently active HIP device; throws when none is set.
int get_device_id()
{
    int device;
    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
        throw std::runtime_error("No device");
    return device;
}
|
||||
|
||||
// Return the current device's gcnArchName (e.g. "gfx90a:sramecc+:xnack-"),
// used as the --offload-arch value when compiling kernels.
std::string get_device_name()
{
    hipDeviceProp_t props{};
    auto status = hipGetDeviceProperties(&props, get_device_id());
    if(status != hipSuccess)
        throw std::runtime_error("Failed to get device properties");
    return props.gcnArchName;
}
|
||||
|
||||
// True if `ptr` is device memory; a failed attribute query (e.g. an
// unregistered host pointer) is treated as "not device memory".
bool is_device_ptr(const void* ptr)
{
    hipPointerAttribute_t attr;
    auto status = hipPointerGetAttributes(&attr, ptr);
    if(status != hipSuccess)
        return false;
    return attr.type == hipMemoryTypeDevice;
}
|
||||
|
||||
// Block until all outstanding work on the device has completed.
void gpu_sync()
{
    auto status = hipDeviceSynchronize();
    if(status != hipSuccess)
        throw std::runtime_error("hip device synchronization failed: " + hip_error(status));
}
|
||||
|
||||
// Query free device memory in bytes; on failure, warn and fall back to an
// assumed 8 GiB so allocation can still proceed.
std::size_t get_available_gpu_memory()
{
    size_t free;
    size_t total;
    auto status = hipMemGetInfo(&free, &total);
    if(status != hipSuccess)
    {
        std::cerr << "Failed getting available memory: " + hip_error(status) << std::endl;
        // Fallback guess: 8 GiB
        return (8ull * 1024ull * 1024ull * 1024ull);
    }
    return free;
}
|
||||
|
||||
// Allocate sz bytes of device memory (or pinned host memory when host=true),
// returned as a shared_ptr that frees via hipFree. If the device allocation
// fails, retry once with pinned host memory before giving up.
std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host)
{
    if(sz > get_available_gpu_memory())
        throw std::runtime_error("Memory not available to allocate buffer: " + std::to_string(sz));
    void* alloc_ptr = nullptr;
    auto status = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
    if(status != hipSuccess)
    {
        if(host)
            throw std::runtime_error("Gpu allocation failed: " + hip_error(status));
        else
            // Device allocation failed: fall back to pinned host memory.
            return allocate_gpu(sz, true);
    }
    assert(alloc_ptr != nullptr);
    // hip_ptr frees with hipFree; share() converts to shared ownership.
    std::shared_ptr<void> result = share(hip_ptr{alloc_ptr});
    return result;
}
|
||||
|
||||
// Copy sz bytes from host pointer x into a freshly allocated GPU buffer
// (pinned host memory when host=true) and return the shared allocation.
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host)
{
    // Make sure prior device work is finished before touching memory.
    gpu_sync();
    auto result = allocate_gpu(sz, host);
    assert(is_device_ptr(result.get()));
    assert(not is_device_ptr(x));
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
    if(status != hipSuccess)
        throw std::runtime_error("Copy to gpu failed: " + hip_error(status));
    return result;
}
|
||||
|
||||
// Copy sz bytes from device pointer x into newly allocated host memory and
// return it as shared_ptr<void>. Throws if x is not a device pointer.
std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz)
{
    // Ensure kernels producing the data have completed.
    gpu_sync();
    std::shared_ptr<char> result(new char[sz]);
    assert(not is_device_ptr(result.get()));
    if(not is_device_ptr(x))
    {
        throw std::runtime_error(
            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
    }
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyDeviceToHost);
    if(status != hipSuccess)
        throw std::runtime_error("Copy from gpu failed: " + hip_error(status)); // NOLINT
    return std::static_pointer_cast<void>(result);
}
|
||||
|
||||
} // namespace rtc
|
||||
125
codegen/test/rtc/src/kernel.cpp
Normal file
125
codegen/test/rtc/src/kernel.cpp
Normal file
@@ -0,0 +1,125 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <rtc/kernel.hpp>
|
||||
#include <rtc/manage_ptr.hpp>
|
||||
#include <rtc/hip.hpp>
|
||||
#include <stdexcept>
|
||||
#include <cassert>
|
||||
|
||||
// extern declare the function since hip/hip_ext.h header is broken
|
||||
extern hipError_t hipExtModuleLaunchKernel(hipFunction_t, // NOLINT
|
||||
uint32_t,
|
||||
uint32_t,
|
||||
uint32_t,
|
||||
uint32_t,
|
||||
uint32_t,
|
||||
uint32_t,
|
||||
size_t,
|
||||
hipStream_t,
|
||||
void**,
|
||||
void**,
|
||||
hipEvent_t = nullptr,
|
||||
hipEvent_t = nullptr,
|
||||
uint32_t = 0);
|
||||
|
||||
namespace rtc {
|
||||
|
||||
std::vector<char> pack_args(const std::vector<kernel_argument>& args)
|
||||
{
|
||||
std::vector<char> kernargs;
|
||||
for(auto&& arg : args)
|
||||
{
|
||||
std::size_t n = arg.size;
|
||||
const auto* p = static_cast<const char*>(arg.data);
|
||||
// Insert padding
|
||||
std::size_t padding = (arg.align - (kernargs.size() % arg.align)) % arg.align;
|
||||
kernargs.insert(kernargs.end(), padding, 0);
|
||||
kernargs.insert(kernargs.end(), p, p + n);
|
||||
}
|
||||
return kernargs;
|
||||
}
|
||||
|
||||
using hip_module_ptr = RTC_MANAGE_PTR(hipModule_t, hipModuleUnload);
|
||||
|
||||
// Pimpl state for kernel: the loaded module owns the code object; `fun` is a
// non-owning handle into that module and must not outlive it.
struct kernel_impl
{
    hip_module_ptr module = nullptr;
    hipFunction_t fun = nullptr;
};
|
||||
|
||||
// Load a code object image into a HIP module. The raw handle is wrapped in
// the managed pointer before the status check so it is released on failure.
hip_module_ptr load_module(const char* image)
{
    hipModule_t raw_m;
    auto status = hipModuleLoadData(&raw_m, image);
    hip_module_ptr m{raw_m};
    if(status != hipSuccess)
        throw std::runtime_error("Failed to load module: " + hip_error(status));
    return m;
}
|
||||
|
||||
// Load the code object and resolve the kernel function `name` within it;
// throws std::runtime_error when either step fails.
kernel::kernel(const char* image, const std::string& name) : impl(std::make_shared<kernel_impl>())
{
    impl->module = load_module(image);
    auto status = hipModuleGetFunction(&impl->fun, impl->module.get(), name.c_str());
    if(hipSuccess != status)
        throw std::runtime_error("Failed to get function: " + name + ": " + hip_error(status));
}
|
||||
|
||||
// Launch `fun` on `stream` with a 1-D launch configuration: `global`
// total work-items and `local` work-items per group (the remaining grid
// dimensions are fixed to 1). The packed argument buffer `kernargs` of
// `size` bytes is handed to the runtime via the HIP_LAUNCH_PARAM_* extra
// mechanism instead of individual kernel parameters.
void launch_kernel(hipFunction_t fun,
                   hipStream_t stream,
                   std::size_t global,
                   std::size_t local,
                   void* kernargs,
                   std::size_t size)
{
    assert(global > 0);
    assert(local > 0);
    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
                      kernargs,
                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
                      &size,
                      HIP_LAUNCH_PARAM_END};

    // `config` decays to void**; no cast needed.
    const auto err = hipExtModuleLaunchKernel(
        fun, global, 1, 1, local, 1, 1, 0, stream, nullptr, config, nullptr, nullptr);
    if(err != hipSuccess)
        throw std::runtime_error("Failed to launch kernel: " + hip_error(err));
}
|
||||
|
||||
// Launch the kernel passing the raw pointers themselves as the packed
// kernarg buffer (each argument slot is one void*).
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    std::vector<void*> args) const
{
    assert(impl != nullptr);
    launch_kernel(
        impl->fun, stream, global, local, args.data(), args.size() * sizeof(void*));
}
|
||||
|
||||
// Launch the kernel with typed arguments: pack them (with alignment
// padding) into a byte buffer first, then hand that buffer to the runtime.
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    const std::vector<kernel_argument>& args) const
{
    assert(impl != nullptr);
    auto packed = pack_args(args);
    launch_kernel(impl->fun, stream, global, local, packed.data(), packed.size());
}
|
||||
|
||||
} // namespace rtc
|
||||
51
codegen/test/rtc/src/tmp_dir.cpp
Normal file
@@ -0,0 +1,51 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <rtc/tmp_dir.hpp>
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <unistd.h>
|
||||
|
||||
namespace rtc {
|
||||
// Return a string of `length` characters drawn uniformly at random from
// the alphanumeric set [0-9a-zA-Z]. A fresh engine is seeded from
// std::random_device on every call, so consecutive results are independent.
std::string random_string(std::string::size_type length)
{
    // Plain constexpr character array: the original bound a static
    // `const std::string&` to a temporary (legal via lifetime extension,
    // but a fragile idiom that also heap-constructs a string at first use).
    static constexpr char chars[] = "0123456789"
                                    "abcdefghijklmnopqrstuvwxyz"
                                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    constexpr std::string::size_type nchars = sizeof(chars) - 1; // exclude NUL

    std::mt19937 rg{std::random_device{}()};
    std::uniform_int_distribution<std::string::size_type> pick(0, nchars - 1);

    std::string str(length, 0);
    std::generate(str.begin(), str.end(), [&] { return chars[pick(rg)]; });

    return str;
}
|
||||
|
||||
std::string unique_string(const std::string& prefix)
|
||||
{
|
||||
auto pid = getpid();
|
||||
auto tid = std::this_thread::get_id();
|
||||
auto clk = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
std::stringstream ss;
|
||||
ss << std::hex << prefix << "-" << pid << "-" << tid << "-" << clk << "-" << random_string(16);
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Create (and own) a uniquely-named directory under the system temporary
// directory. The leaf name is produced by unique_string(), prefixed with
// "ck-rtc" or "ck-rtc-<prefix>" when a prefix is supplied.
tmp_dir::tmp_dir(const std::string& prefix)
    : path(fs::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
    // create_directories also creates any missing parents of the path.
    fs::create_directories(this->path);
}
|
||||
|
||||
// Run a shell command with this temporary directory as the working
// directory. Best-effort: the command's exit status is discarded, so
// callers cannot observe failure here.
// NOTE(review): `path` and `cmd` are spliced into the shell string
// unescaped; presumably safe because the path is internally generated —
// do not pass untrusted input as `cmd`.
void tmp_dir::execute(const std::string& cmd) const
{
    std::string s = "cd " + path.string() + "; " + cmd;
    std::system(s.c_str());
}
|
||||
|
||||
// Recursively delete the temporary directory and everything inside it.
tmp_dir::~tmp_dir() { fs::remove_all(this->path); }
|
||||
|
||||
} // namespace rtc
|
||||
Reference in New Issue
Block a user