From 4acf502f4ce862bc02084289edb7076a3062a625 Mon Sep 17 00:00:00 2001 From: arai713 <67439843+arai713@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:37:35 -0700 Subject: [PATCH] CK Instance Gen (#1145) * Format * Format * Format * Remove const * Use the right template * Format * Format * add row/col instances * Add missing file * fixed * fixing block to etile error * Format * Updates * Format * fixed rrr layout * generating a sample JSON file: currently contains includes, prologue/epilogue and instances * version where the json is passed into the instances to generate a key * updated run function to just launch kernel * updated run function: only contains kernel object, json file is updated but still needs to be cleaned up, added front-end API to parse JSON into character buffer * adding in testing files * cleaned up comments, still need to work on including header files * removed unneeded files * removed/commented out JSON implementation * added fusion(prologue/epilogue) into instance generation * working on instance selection * added instance selection, need to fix instance validation * removed block2etile map validity check for testing purposes * test running: failing due to incorrect files/input * all grid descs/ptrs completed, but device file not found * Update test and embed modules * Restore older version * added convolution operation, written test, debugging generated code for compilation * attempting to include CK in host directory: _Float16 error * CK header file issues * slight fix * don't crash when hip can't report total memory * dump generated code to a file * changing sizes * creating tensor descriptors using CK methods: set up grid desc manually, also trying to set up an argument pointer - this needs to be fixed * some fixes to call the device code * separating test files for conv and gemm * completed arg ptr, now have linking errors * clang format fix * resolved linker issues in conv test * remove dependency on libutility from ck * resolved num dim error * properly passing arg ptr, errors with passing typenames: redefinition/redeclaration * undo the commenting of device function * hand created kernel code to find rtc issues * dump the full src to file * resolved redeclaration errors, cleaned up errors for Amber's kernel code * debugging purposes: redeclaration error * config files * resolved errors for NumTensor and redeclaration, formatted version.h * resolved most errors in manually added kernel and my own. 
error with calling kernel object: overloaded function type * WIP: close to getting kernel compiled * WIP: fixing rtc errors * fixed sequence errors, formatting, still one error with run fcn * yay: kernel compiles and runs * updated templated/generated version to run and compile * minor fixes * working generated example, resolved memory access error due to padding * adding in reference kernel, validation failing against reference * debugging: printing kernel argsz * reduced error in results * debugged reference kernel and output errors, added to generated version, currently debugging prologue function issues * working validation (using reference convolution) with prologue function for both hard-coded and generated version * WIP: create an alt version that creates Argument on the device * wip: added new duplicate files, fixed fusion templating errors from working example, setting up kernel arguments * wip: making necessary methods device code * added grid descs, working on grid pointers, errors with stl numerics * wip: updating kernel args - issue, replacing some std functions * replaced std::accumulate call with temp hardcoded version * wip: args causing memory issue * Construct Argument object inside the kernel and use it to call convolution device function. Code runs and verification passes * adding object file dump * temporary hardcoding of grid size, can remove device op inst + arg ptr * minor fix for grid size * added modified example where arg ptr is created on the device for generated version as well * removed device op instance and arg ptr from modified examples * moving device op file for testing purposes and to properly build CK * commenting out print-outs * adjust compiler args to produce a valid ELF file * temporary removal of validation * reverting compiler args back for working example * retrieve necessary arguments from generated template parameters in correct format * calculating grid size on host-side, still need to clean up process, pass parameters to host functions properly * scaled up factory functions/wrapper structs to implement host-side launch parameter calculations using CK host side functions - in hard-coded example * temporary change to generate ELF format binary object file * removed unecessary code, added comments * formatting fix * cleaned up code, added new tests, restructured library: move helper into CK * refactored launch parameter calculation to be more concise * renamed files and variables for more clarity/uniformity * more code cleaning, removed debug statements * moved majority of my files into codegen directory, running properly * updated Embed.cmake(string_view) in codegen directory * updated host directory to match Embed.cmake as well * added old tests in * updated instance generation methods to be more concise * removed layout from launch parameter calculation * working test * fixed issue with verification, all instances working * updated verification in other tests * removed duplicate matrix padder file, removed code dumps * removed old hard-coded tests * removed old host directory, all files in codegen directory now * fixed copyright in files * commenting out validation * renamed files * made changes for review: fixed copyright, renamed files for clarity, removed comments, refactored code * updated headers * removing duplicate file for fwd conv to gemm, merging with original file * fix building codegen with clang++ directly * resolving build error from conv_fwd_to_gemm * fix for previous error * renaming tests * created common test file * cleaned 
up code, added comments * renamed device op * fixed typos in comments * removed extra space * code cleanup: resolving Amber's comments * removed wrapper struct for matrix padder, fixed template * cleaned up if statements for better readability --------- Co-authored-by: Paul Co-authored-by: Jing Zhang Co-authored-by: M. Amber Hassaan Co-authored-by: illsilin Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> [ROCm/composable_kernel commit: 3e9711f0cb1c7ffd3826a93dfa6dd65e98715636] --- codegen/CMakeLists.txt | 28 +- codegen/driver/main.cpp | 42 +- .../ck/host/device_gemm_multiple_d.hpp | 2 +- .../host/device_gemm_multiple_d/operation.hpp | 17 +- .../host/device_gemm_multiple_d/problem.hpp | 17 +- .../conv_fwd_op.hpp | 60 ++ .../conv_fwd_problem.hpp | 56 ++ codegen/include/ck/host/headers.hpp | 1 - codegen/include/ck/host/operation/gemm.hpp | 2 +- codegen/include/ck/host/stringutils.hpp | 2 +- codegen/include/ck/host/types.hpp | 18 +- codegen/include/ck/host/utils.hpp | 5 +- codegen/src/device_gemm_multiple_d.cpp | 15 +- ...gemm_multiple_d_operation_xdl_cshuffle.cpp | 57 +- .../device_grouped_conv_fwd_multiple_abd.cpp | 42 + ...wd_multiple_abd_operation_xdl_cshuffle.cpp | 364 ++++++++ codegen/src/headers.cpp | 2 +- codegen/src/types.cpp | 8 + codegen/src/utils.cpp | 2 +- codegen/test/CMakeLists.txt | 14 +- codegen/test/common.hpp | 134 +++ codegen/test/gemm_multiple_d.cpp | 7 +- .../test/grouped_conv_fwd_multiple_d_v1.cpp | 209 +++++ .../test/grouped_conv_fwd_multiple_d_v2.cpp | 209 +++++ .../test/grouped_conv_fwd_multiple_d_v3.cpp | 209 +++++ .../test/grouped_conv_fwd_multiple_d_v4.cpp | 209 +++++ codegen/test/rtc/src/compile_kernel.cpp | 8 + codegen/test/rtc/src/hip.cpp | 6 +- .../ck/tensor_operation/gpu/device/helper.hpp | 359 ++++++++ ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 781 ++++++++++++++++++ .../gpu/device/matrix_padder.hpp | 13 + .../transform_conv_fwd_to_gemm.hpp | 564 +++++++++++++ include/ck/utility/array.hpp | 2 + 33 files changed, 3417 insertions(+), 47 deletions(-) create mode 100644 codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp create mode 100644 codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp create mode 100644 codegen/src/device_grouped_conv_fwd_multiple_abd.cpp create mode 100644 codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp create mode 100644 codegen/test/common.hpp create mode 100644 codegen/test/grouped_conv_fwd_multiple_d_v1.cpp create mode 100644 codegen/test/grouped_conv_fwd_multiple_d_v2.cpp create mode 100644 codegen/test/grouped_conv_fwd_multiple_d_v3.cpp create mode 100644 codegen/test/grouped_conv_fwd_multiple_d_v4.cpp create mode 100644 include/ck/tensor_operation/gpu/device/helper.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt index 72549c9a4e..d8b22fc943 100644 --- a/codegen/CMakeLists.txt +++ b/codegen/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.16) -project(composable_kernel_host) +project(composable_kernel_host LANGUAGES CXX HIP) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -12,24 +12,38 @@ find_package(ROCM) include(ROCMInstallTargets) include(ROCMTest) +add_compile_options(-std=c++17) +find_package(hip) +## HIP +set(CMAKE_HIP_PLATFORM amd) +set(CMAKE_HIP_COMPILER ${CMAKE_CXX_COMPILER}) +set(CMAKE_HIP_EXTENSIONS ON) +message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}") + +# add 
include directories +include_directories(BEFORE + ${PROJECT_BINARY_DIR}/include + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${HIP_INCLUDE_DIRS} + ) + list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake) include(Embed) file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS - ${CK_ROOT}/include/ck/*.hpp) + ${CK_ROOT}/include/ck/*.hpp) message(STATUS "KERNEL_FILES: ${KERNEL_FILES}") message(STATUS "RELATIVE: ${CK_ROOT}/include") add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include) -add_definitions(-std=c++17) - file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp) # TODO: Use object library add_library(ck_host STATIC ${SOURCES}) target_link_libraries(ck_host PRIVATE ck_headers) -set_target_properties(ck_host PROPERTIES - LINKER_LANGUAGE CXX - POSITION_INDEPENDENT_CODE ON) +set_target_properties(ck_host PROPERTIES + LINKER_LANGUAGE CXX + POSITION_INDEPENDENT_CODE ON) target_include_directories(ck_host PUBLIC $ diff --git a/codegen/driver/main.cpp b/codegen/driver/main.cpp index dfd513106b..c7d295de94 100644 --- a/codegen/driver/main.cpp +++ b/codegen/driver/main.cpp @@ -5,24 +5,27 @@ #include #include #include "ck/host/device_gemm_multiple_d/operation.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" #include "ck/host/stringutils.hpp" using ck::host::Transform; struct Emitters { + // retrieve the hard-coded instances provided, template them, and then store them in a map std::unordered_map()>> m; template - void Register(const std::string& name) + void Register(const std::string& name, const std::string& prologue, const std::string& epilogue) { - m[name] = [] { - auto configs = T::CreateOperations(); + m[name] = [&] { + auto configs = T::CreateOperations(prologue, epilogue); return Transform(configs, [](const auto& ops) { return ToTuple(ops); }); }; } + // takes in an operation instance and uses it to substitute the correct values into the template template static std::string ToTuple(const T& ops) { @@ -31,6 +34,7 @@ struct Emitters return "std::tuple<\n" + ck::host::JoinStrings(templates, ",\n") + ">"; } + // Join together all the strings in the map std::string Emit(const std::string& name) { return ck::host::JoinStrings(m.at(name)(), "\n"); } std::vector List() const @@ -43,9 +47,38 @@ int main(int argc, const char* argv[]) { std::string prog = argv[0]; std::vector args(argv + 1, argv + argc); + + // Specify problem type and problem size + ck::host::device_gemm_multiple_d::Problem prob; + prob.M = 1024; + prob.N = 1024; + prob.K = 1024; + + // user provided fusion + std::string prologue = ""; + std::string epilogue = R"( +struct Epilogue +{ + __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +};)"; + + // Load in operations into the Register Emitters e; e.Register( - "DeviceGemmMultipleD_Xdl_CShuffle"); + "DeviceGemmMultipleD_Xdl_CShuffle", prologue, epilogue); if(args.empty() or std::any_of(args.begin(), args.end(), [](auto arg) { return arg == "-h" or arg == "--help"; @@ -64,6 +97,7 @@ int main(int argc, const char* argv[]) return 0; } + // print out all the instances for the operation that was chosen at the command line for(auto name : args) std::cout << e.Emit(name) << std::endl; diff --git 
a/codegen/include/ck/host/device_gemm_multiple_d.hpp b/codegen/include/ck/host/device_gemm_multiple_d.hpp index 88e040db53..02c19c88e7 100644 --- a/codegen/include/ck/host/device_gemm_multiple_d.hpp +++ b/codegen/include/ck/host/device_gemm_multiple_d.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/codegen/include/ck/host/device_gemm_multiple_d/operation.hpp b/codegen/include/ck/host/device_gemm_multiple_d/operation.hpp index f9d39633ac..359da7d8cf 100644 --- a/codegen/include/ck/host/device_gemm_multiple_d/operation.hpp +++ b/codegen/include/ck/host/device_gemm_multiple_d/operation.hpp @@ -14,10 +14,15 @@ namespace ck { namespace host { namespace device_gemm_multiple_d { +// defines all values need for an instance of fwd conv struct Operation_Xdl_CShuffle { - static std::vector> CreateOperations(); - static std::vector CreateOperations(const Problem& prob); + // returns a vector of instances, only given fusion operators: will use default problem spec + static std::vector> + CreateOperations(const std::string& prologue, const std::string& epilogue); + // returns a vector of instances, given a problem spec and fusion operators + static std::vector + CreateOperations(const Problem& prob, const std::string& prologue, const std::string& epilogue); TensorDesc A{}; TensorDesc B{}; DataType acc = DataType::Float; @@ -27,13 +32,21 @@ struct Operation_Xdl_CShuffle std::string a_elem_op = PassThrough; std::string b_elem_op = PassThrough; std::string cde_elem_op = Bilinear; + std::string prologue = ""; + std::string epilogue = ""; std::string gemm_specialization = "ck::tensor_operation::device::GemmSpecialization::Default"; + // tuning parameters operation::TileDesc tile_desc{}; operation::BlockTransferDesc a_block_transfer{}; operation::BlockTransferDesc b_block_transfer{}; operation::CShuffleDesc cshuffle{}; operation::CBlockTransferDesc c_block_transfer{}; + // functions to update fusion operators if provided + void update_prologue(const std::string& prologue); + void update_epilogue(const std::string& epilogue); + /**constexpr**/ bool IsSupported(std::size_t MRaw_, std::size_t NRaw_, std::size_t KRaw_); + // returns a templated instance Solution ToSolution() const; }; diff --git a/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp b/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp index f6dbc2b6e8..f4036328ec 100644 --- a/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp +++ b/codegen/include/ck/host/device_gemm_multiple_d/problem.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -12,11 +12,14 @@ namespace ck { namespace host { namespace device_gemm_multiple_d { +// defines the problem specification for a GEMM operation struct Problem { - std::size_t M = 0; - std::size_t N = 0; - std::size_t K = 0; + // dimensions for GEMM operation + std::size_t M = 0; + std::size_t N = 0; + std::size_t K = 0; + // layouts for tensors bool TransA = false; bool TransB = false; bool TransE = false; @@ -29,9 +32,13 @@ struct Problem std::string BElementOp = PassThrough; std::string CDEElementOp = PassThrough; + // returns the correct device op file for the operation std::string GetIncludeHeader() const; - std::vector GetSolutions(const std::string& arch) const; + // returns a list of instances based on the problem spec and provided fusion operations + std::vector GetSolutions(const std::string& arch, + const std::string& prologue, + const std::string& epilogue) const; }; } // namespace device_gemm_multiple_d diff --git a/codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp b/codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp new file mode 100644 index 0000000000..5ad1dce176 --- /dev/null +++ b/codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include "ck/host/types.hpp" +#include "ck/host/operation/gemm.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" + +namespace ck { +namespace host { +namespace conv { + +// defines the values needed for an instance of forward convolution and functions to return +// (templated) instances +struct Operation_Conv_Fwd_Xdl_Cshuffle +{ + // returns a vector of instances given the fusion operations, uses default values for problem + // spec + static std::vector + CreateOperations(const std::string& prologue, const std::string& epilogue); + // returns a vector of instances, provided with a problem spec and fusion operations + static std::vector CreateOperations( + const Problem_Conv_Fwd& prob, const std::string& prologue, const std::string& epilogue); + std::size_t NumDim; + TensorDesc A{}; + TensorDesc B{}; + DataType acc = DataType::Float; + DataType cs_type = DataType::Half; + std::vector Ds = {}; + TensorDesc E{}; + std::string a_elem_op = PassThrough; + std::string b_elem_op = PassThrough; + std::string cde_elem_op = PassThrough; + std::string prologue = ""; + std::string epilogue = ""; + std::string conv_specialization = + "ck::tensor_operation::device::ConvolutionForwardSpecialization::Default"; + std::string gemm_specialization = + "ck::tensor_operation::device::GemmSpecialization::MNKPadding"; + // tuning parameters + operation::TileDesc tile_desc{}; + operation::BlockTransferDesc a_block_transfer{}; + operation::BlockTransferDesc b_block_transfer{}; + operation::CShuffleDesc cshuffle{}; + operation::CBlockTransferDesc c_block_transfer{}; + + // functions to update fusion operations if they are provided + void update_prologue(const std::string& prologue); + void update_epilogue(const std::string& epilogue); + // returns a templated instance + Solution ToSolution() const; +}; + +} // namespace conv +} // namespace host +} // namespace ck diff --git a/codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp b/codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp new file mode 100644 index 
0000000000..433f9a8fc9 --- /dev/null +++ b/codegen/include/ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "ck/host/types.hpp" + +namespace ck { +namespace host { +namespace conv { + +// defines the problem specification for a forward convolution operation +struct Problem_Conv_Fwd +{ + std::size_t NumDim = 0; + // size of a forward convolution operation + std::size_t G = 0; + std::size_t N = 0; + std::size_t C = 0; + std::size_t Hi = 0; + std::size_t Wi = 0; + std::size_t Ho = 0; + std::size_t Wo = 0; + std::size_t K = 0; + std::size_t Y = 0; + std::size_t X = 0; + Layout ALayout = Layout::NHWGC; + Layout BLayout = Layout::GKYXC; + Layout ELayout = Layout::NHWGK; + std::vector DsLayout = {}; + DataType ADataType = DataType::Half; + DataType BDataType = DataType::Half; + DataType EDataType = DataType::Half; + std::vector DsDataType = {}; + std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough"; + std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough"; + std::string CDEElementOp = "ck::tensor_operation::element_wise::PassThrough"; + + // returns the correct device op file for the operation + std::string GetIncludeHeader() const; + + // returns a list of instances based on the problem spec and provided fusion operations + std::vector GetSolutions(const std::string& arch, + const std::string& prologue, + const std::string& epilogue) const; +}; + +} // namespace conv +} // namespace host +} // namespace ck diff --git a/codegen/include/ck/host/headers.hpp b/codegen/include/ck/host/headers.hpp index 3da05baaaf..54f8d9f731 100644 --- a/codegen/include/ck/host/headers.hpp +++ b/codegen/include/ck/host/headers.hpp @@ -4,7 +4,6 @@ #pragma once #include -#include #include #include #include diff --git a/codegen/include/ck/host/operation/gemm.hpp b/codegen/include/ck/host/operation/gemm.hpp index f587122b05..84ef92f0a0 100644 --- a/codegen/include/ck/host/operation/gemm.hpp +++ b/codegen/include/ck/host/operation/gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/codegen/include/ck/host/stringutils.hpp b/codegen/include/ck/host/stringutils.hpp index 01374b86c8..89c1884d2e 100644 --- a/codegen/include/ck/host/stringutils.hpp +++ b/codegen/include/ck/host/stringutils.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/codegen/include/ck/host/types.hpp b/codegen/include/ck/host/types.hpp index 23488a66d0..812c073678 100644 --- a/codegen/include/ck/host/types.hpp +++ b/codegen/include/ck/host/types.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -12,6 +12,7 @@ namespace ck { namespace host { +// holds the templated instance, substitues values into template from instancess struct Solution { @@ -33,6 +34,7 @@ struct Solution std::unordered_map template_values; }; +// supported data types enum class DataType { Half, @@ -40,22 +42,28 @@ enum class DataType Int8, Int32 }; - std::string ToString(DataType dt); +// supported layouts: gemm and fwd conv enum class Layout { Row, - Column + Column, + GKYXC, + GKCYX, + GNHWK, + GNHWC, + NHWGC, + NHWGK }; - std::string ToString(Layout dl); +Layout ToLayout(bool Trans); // returns the layout for gemm +// supported GEMM types enum class GemmType { Default }; - std::string ToString(GemmType gt); struct TensorDesc diff --git a/codegen/include/ck/host/utils.hpp b/codegen/include/ck/host/utils.hpp index e8785a456f..21926814f1 100644 --- a/codegen/include/ck/host/utils.hpp +++ b/codegen/include/ck/host/utils.hpp @@ -1,10 +1,12 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include #include +#include +#include namespace ck { namespace host { @@ -12,6 +14,5 @@ namespace host { std::size_t integer_divide_ceil(std::size_t x, std::size_t y); const std::unordered_set& get_xdlop_archs(); - } // namespace host } // namespace ck diff --git a/codegen/src/device_gemm_multiple_d.cpp b/codegen/src/device_gemm_multiple_d.cpp index ec25afc0f9..44bc051a8b 100644 --- a/codegen/src/device_gemm_multiple_d.cpp +++ b/codegen/src/device_gemm_multiple_d.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/host/device_gemm_multiple_d/problem.hpp" #include "ck/host/device_gemm_multiple_d/operation.hpp" @@ -11,23 +11,28 @@ namespace ck { namespace host { namespace device_gemm_multiple_d { +// return the relevant device op file based on the operation std::string Problem::GetIncludeHeader() const { return "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"; } -std::vector Problem::GetSolutions(const std::string& arch) const +// returns templated instances when provided with a problem specification +std::vector Problem::GetSolutions(const std::string& arch, + const std::string& prologue, + const std::string& epilogue) const { if(get_xdlop_archs().count(arch) == 0) return {}; - auto ops = ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle::CreateOperations(*this); + auto ops = ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle::CreateOperations( + *this, prologue, epilogue); // obtains vector of instances std::vector result; std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) { - return op.ToSolution(); + return op.ToSolution(); // template instance with correct values }); return result; } } // namespace device_gemm_multiple_d } // namespace host -} // namespace ck \ No newline at end of file +} // namespace ck diff --git a/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp b/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp index 9e397497ee..a2e8eccbf1 100644 --- a/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp +++ b/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp @@ -10,6 +10,7 @@ namespace ck { namespace host { namespace device_gemm_multiple_d { +// calculate appropriate Gemm Specification based on input tensor dimensions static std::string GetGemmSpec(const std::size_t m, const std::size_t n, const std::size_t k, @@ -30,9 +31,40 @@ static std::string GetGemmSpec(const std::size_t m, return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding"; } +// function to update prologue/epilogue with user provided operation +void Operation_Xdl_CShuffle::update_prologue(const std::string& prologue) +{ + if(!prologue.empty()) + { + this->prologue = prologue; + this->cde_elem_op = "CDEElementOp"; + } + else + { + this->prologue = ""; + } +} + +void Operation_Xdl_CShuffle::update_epilogue(const std::string& epilogue) +{ + if(!epilogue.empty()) + { + this->epilogue = epilogue; + this->cde_elem_op = "CDEElementOp"; + } + else + { + this->epilogue = ""; + } +} + +// accounts for all possible combinations of Row/Col major static Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; } -std::vector Operation_Xdl_CShuffle::CreateOperations(const Problem& prob) +// Hard-code tuning parameters in modularized fashion, string them together into a vector of +// instances +std::vector Operation_Xdl_CShuffle::CreateOperations( + const Problem& prob, const std::string& prologue, const std::string& epilogue) { std::vector result; @@ -155,6 +187,7 @@ std::vector Operation_Xdl_CShuffle::CreateOperations(con // clang-format on }; + // choose correct arrangement of tuning parameters based on the layout of each tensor const auto a_block_descriptions = prob.TransA ? 
a_block_descriptions_colmajor : a_block_descriptions_rowmajor; const auto b_block_descriptions = @@ -165,6 +198,7 @@ std::vector Operation_Xdl_CShuffle::CreateOperations(con assert(tile_descriptions.size() == cshuffle_descriptions.size()); assert(tile_descriptions.size() == c_block_descriptions.size()); + // Put all values together into a single operation > store into the result vector for(std::size_t i = 0; i < tile_descriptions.size(); i++) { Operation_Xdl_CShuffle x; @@ -188,12 +222,17 @@ std::vector Operation_Xdl_CShuffle::CreateOperations(con x.tile_desc.m_per_block, x.tile_desc.n_per_block, x.tile_desc.k_per_block); + x.update_prologue(prologue); + x.update_epilogue(epilogue); result.push_back(x); } return result; } -std::vector> Operation_Xdl_CShuffle::CreateOperations() +// set up instances when not provided with a problem specification, use default operation values and +// all possible layout combinations +std::vector> +Operation_Xdl_CShuffle::CreateOperations(const std::string& prologue, const std::string& epilogue) { std::vector problems; for(bool TransA : {true, false}) @@ -204,7 +243,8 @@ std::vector> Operation_Xdl_CShuffle::CreateO prob.TransB = TransB; problems.push_back(prob); } - return Transform(problems, [](const Problem& p) { return CreateOperations(p); }); + return Transform(problems, + [&](const Problem& p) { return CreateOperations(p, prologue, epilogue); }); } static const char* const DeviceGemmMultipleD_Xdl_CShuffleTemplate = @@ -224,9 +264,20 @@ static const char* const DeviceGemmMultipleD_Xdl_CShuffleTemplate = "${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, " "${CDEBlockTransferScalarPerVector_NPerBlock}>"; +// use hardcoded instances from vector of operations to substitute values into instance template Solution Operation_Xdl_CShuffle::ToSolution() const { std::unordered_map values = { + {"name", + std::to_string(this->tile_desc.block_size) + "_" + + std::to_string(this->tile_desc.m_per_block) + "_" + + std::to_string(this->tile_desc.n_per_block) + "_" + + std::to_string(this->tile_desc.k_per_block) + "_" + + std::to_string(this->tile_desc.ak1) + "_" + std::to_string(this->tile_desc.bk1) + "_" + + std::to_string(this->tile_desc.m_per_XDL) + "_" + + std::to_string(this->tile_desc.n_per_XDL) + "_" + + std::to_string(this->tile_desc.m_Xdl_per_wave) + "_" + + std::to_string(this->tile_desc.n_Xdl_per_wave)}, {"LayoutA", ToString(this->A.layout)}, {"LayoutB", ToString(this->B.layout)}, {"LayoutDs", diff --git a/codegen/src/device_grouped_conv_fwd_multiple_abd.cpp b/codegen/src/device_grouped_conv_fwd_multiple_abd.cpp new file mode 100644 index 0000000000..c689e5ec95 --- /dev/null +++ b/codegen/src/device_grouped_conv_fwd_multiple_abd.cpp @@ -0,0 +1,42 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
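+// Host-side entry points for the conv path: GetIncludeHeader() names the modified
+// device-op header and GetSolutions() expands a Problem_Conv_Fwd into templated
+// Solution instances.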
+ +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" +#include "ck/host/utils.hpp" +#include +#include + +namespace ck { +namespace host { +namespace conv { + +// return the relevant device op file based on the operation +// NOTE: this is a modified version of the original CK file that calls the kernel from a device +// function and makes the Argument class accessible on the device +std::string Problem_Conv_Fwd::GetIncludeHeader() const +{ + return "ck/tensor_operation/gpu/device/impl/" + "codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"; +} + +// return vector of forward convolution instances when provided with a problem instance +std::vector Problem_Conv_Fwd::GetSolutions(const std::string& arch, + const std::string& prologue, + const std::string& epilogue) const +{ + if(get_xdlop_archs().count(arch) == 0) + return {}; + auto ops = ck::host::conv::Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations( + *this, prologue, epilogue); + std::vector result; + std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) { + return op.ToSolution(); + }); + return result; +} + +} // namespace conv +} // namespace host +} // namespace ck diff --git a/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp b/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp new file mode 100644 index 0000000000..94161a76d9 --- /dev/null +++ b/codegen/src/device_grouped_conv_fwd_multiple_abd_operation_xdl_cshuffle.cpp @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" +#include +#include "ck/host/stringutils.hpp" +#include "ck/host/utils.hpp" +#include + +namespace ck { +namespace host { +namespace conv { + +// calculate appropriate Gemm Specification based on input tensor dimensions +// NOTE: in CK, MNKPadding is always used for forward convolution +static std::string GetGemmSpec(const std::size_t m, + const std::size_t n, + const std::size_t k, + const std::size_t m_per_block, + const std::size_t n_per_block, + const std::size_t k_per_block) +{ + std::string spec = ""; + if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0) + spec += "M"; + if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0) + spec += "N"; + if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0) + spec += "K"; + if(spec == "") + return "ck::tensor_operation::device::GemmSpecialization::Default"; + + return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding"; +} + +// function to update prologue/epilogue with user provided operation +void Operation_Conv_Fwd_Xdl_Cshuffle::update_prologue(const std::string& prologue) +{ + if(!prologue.empty()) + { + this->prologue = prologue; + this->cde_elem_op = "CDEElementOp"; + } + else + { + this->prologue = ""; + } +} + +void Operation_Conv_Fwd_Xdl_Cshuffle::update_epilogue(const std::string& epilogue) +{ + if(!epilogue.empty()) + { + this->epilogue = epilogue; + this->cde_elem_op = "CDEElementOp"; + } + else + { + this->epilogue = ""; + } +} + +// Hard-code tuning parameters in modularized fashion, string them together into a vector of +// instances +std::vector Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations( + const Problem_Conv_Fwd& prob, const std::string& prologue, const std::string& epilogue) +{ + std::vector result; + + 
std::vector tile_descriptions = { + // clang-format off +// Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| NumGemmK| +// Size| Block| Block| Block| | | XDL| XDL| Per| Per| Prefetch| +// | | | | | | | | Wave| Wave| Stage| +// | | | | | | | | | | | + { 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, 1}, + { 256, 128, 256, 32, 8, 8, 32, 32, 4, 2, 1}, + { 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 1}, + { 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, 1}, + { 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 1}, + { 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 1} + // clang-format on + }; + + std::vector a_block_descriptions = { + // clang-format off +// ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| +// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| +// Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | +// | | | | | | | + { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}, + { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}, + { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1}, + { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1}, + { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}, + { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1} + // clang-format on + }; + + std::vector b_block_descriptions = { + // clang-format off +// BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| +// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| +// Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | +// | | | | | | | + { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}, + { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}, + { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1}, + { S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1}, + { S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1}, + { S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1} + // clang-format on + }; + + std::vector cshuffle_descriptions = { + // clang-format off +// CShuffle| CShuffle| +// MXdlPerWave| NXdlPerWave| +// PerShuffle| PerShuffle| +// | | + { 1, 1}, + { 1, 1}, + { 1, 1}, + { 1, 1}, + { 1, 1}, + { 1, 1} + // clang-format on + }; + + std::vector c_block_descriptions = { + // clang-format off +// CBlockTransferClusterLengths| CBlockTransfer +// _MBlock_MWaveMPerXdl| ScalarPerVector +// _NBlock_NWaveNPerXdl| _NWaveNPerXdl +// | + { S<1, 16, 1, 4>, 1}, + { S<1, 32, 1, 8>, 8}, + { S<1, 32, 1, 8>, 8}, + { S<1, 16, 1, 4>, 1}, + { S<1, 32, 1, 8>, 8}, + { S<1, 16, 1, 8>, 8} + // clang-format on + }; + + assert(tile_descriptions.size() == a_block_descriptions.size()); + assert(tile_descriptions.size() == b_block_descriptions.size()); + assert(tile_descriptions.size() == cshuffle_descriptions.size()); + assert(tile_descriptions.size() == c_block_descriptions.size()); + + // Put all values together into a single operation > store into the result vector + for(std::size_t i = 0; i < tile_descriptions.size(); i++) + { + Operation_Conv_Fwd_Xdl_Cshuffle x; + x.NumDim = prob.NumDim; + x.tile_desc = tile_descriptions[i]; + x.a_block_transfer = a_block_descriptions[i]; + x.b_block_transfer = b_block_descriptions[i]; + x.cshuffle = cshuffle_descriptions[i]; + x.c_block_transfer = c_block_descriptions[i]; + x.A = TensorDesc{prob.ADataType, prob.ALayout}; + x.B = TensorDesc{prob.BDataType, prob.BLayout}; + x.E = TensorDesc{prob.EDataType, prob.ELayout}; + x.Ds = Transform(prob.DsLayout, prob.DsDataType, [](auto lo, auto dt) { + return TensorDesc{dt, lo}; + }); + 
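+        // (Ds above pairs each extra D tensor's layout with its data type; the default
+        // Problem_Conv_Fwd leaves both vectors empty, so no D tensors are generated)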
x.a_elem_op = prob.AElementOp; + x.b_elem_op = prob.BElementOp; + x.cde_elem_op = prob.CDEElementOp; + x.update_prologue(prologue); + x.update_epilogue(epilogue); + result.push_back(x); + } + return result; +} + +// set up instances when not provided with a problem specification, use default operation values +std::vector +Operation_Conv_Fwd_Xdl_Cshuffle::CreateOperations(const std::string& prologue, + const std::string& epilogue) +{ + Problem_Conv_Fwd prob; + return CreateOperations(prob, prologue, epilogue); +} + +static const char* const CopyDevice_ConvTemplate = + R"( +${Prologue} +${Epilogue} + +using CDEElementOp = Epilogue; +using DeviceConv = ck::tensor_operation::device::CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<${NumDim}, ${LayoutA}, ${LayoutB}, ${LayoutDs}, ${LayoutE}, ${ADataType}, ${BDataType}, ${AccDataType}, ${CShuffleDataType}, ${DsDataType}, ${EDataType}, ${AElementwiseOperation}, ${BElementwiseOperation}, ${CDEElementwiseOperation}, ${ConvSpecialization}, ${GemmSpecialization}, ${NumGemmkPrefetchStage}, ${BlockSize}, ${MPerBlock}, ${NPerBlock}, ${KPerBlock}, ${AK1}, ${BK1}, ${MPerXDL}, ${NPerXDL}, ${MXdlPerWave}, ${NXdlPerWave}, ${ABlockTransferThreadClusterLengths_AK0_M_AK1}, ${ABlockTransferThreadClusterArrangeOrder}, ${ABlockTransferSrcAccessOrder}, ${ABlockTransferSrcVectorDim}, ${ABlockTransferSrcScalarPerVector}, ${ABlockTransferDstScalarPerVector_AK1}, ${ABlockLdsExtraM}, ${BBlockTransferThreadClusterLengths_BK0_N_BK1}, ${BBlockTransferThreadClusterArrangeOrder}, ${BBlockTransferSrcAccessOrder}, ${BBlockTransferSrcVectorDim}, ${BBlockTransferSrcScalarPerVector}, ${BBlockTransferDstScalarPerVector_BK1}, ${BBlockLdsExtraN}, ${CShuffleMXdlPerWavePerShuffle}, ${CShuffleNXdlPerWavePerShuffle}, ${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, ${CDEBlockTransferScalarPerVector_NPerBlock}>; + +constexpr ck::index_t NumATensor = ck::tensor_operation::device::GetNumABTensors(); +constexpr ck::index_t NumBTensor = ck::tensor_operation::device::GetNumABTensors(); + +extern "C" __global__ void run_${name}( + const ${ADataType}* in_dev, + const ${BDataType}* wei_dev, + ${EDataType}* __restrict__ out_dev, + ck::Array in_lengths, + ck::Array in_strides, + ck::Array wei_lengths, + ck::Array wei_strides, + ck::Array out_lengths, + ck::Array out_strides, + ck::Array conv_filter_strides, + ck::Array conv_filter_dilations, + ck::Array input_left_pads, + ck::Array input_right_pads, + const ${AElementwiseOperation} a_element_op, + const ${BElementwiseOperation} b_element_op, + const ${CDEElementwiseOperation} cde_element_op +){ + + + auto arg = DeviceConv::Argument(in_dev, + wei_dev, + ck::Array{}, + out_dev, + in_lengths, + in_strides, + wei_lengths, + wei_strides, + ck::Array, 0>{}, + ck::Array, 0>{}, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + ${AElementwiseOperation}{}, + ${BElementwiseOperation}{}, + ${CDEElementwiseOperation}{1.0f, 1.0f}); + + constexpr ck::LoopScheduler LoopSched = ck::make_default_loop_scheduler(); + + // GridwiseGemm + using GridwiseGemm = DeviceConv::GridwiseGemm; + + static constexpr auto I0 = ck::Number<0>{}; + + ck::tensor_operation::device::device_grouped_conv_fwd_multiple_abd_xdl_cshuffle< + GridwiseGemm, + const ${ADataType}*, + const ${BDataType}*, + typename GridwiseGemm::DsGridPointer, + ${EDataType}, + ${AElementwiseOperation}, + ${BElementwiseOperation}, + ${CDEElementwiseOperation}, + DeviceConv::AGridDesc_AK0_M_AK1, + 
DeviceConv::BGridDesc_BK0_N_BK1, + DeviceConv::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceConv::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceConv::Block2ETileMap, + ck::tensor_operation::device::ComputePtrOffsetOfStridedBatch, + ck::integral_constant{}, + false, + false> + ( + arg.p_as_grid_.At(I0), + arg.p_bs_grid_.At(I0), + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_g_n_c_wis_lengths_[0], // Group count + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_etile_map_, + arg.compute_ptr_offset_of_batch_ + ); + +} +)"; + +// use hardcoded instances from vector of operations to substitute values into instance template +Solution Operation_Conv_Fwd_Xdl_Cshuffle::ToSolution() const +{ + std::unordered_map values = { + {"name", + std::to_string(this->tile_desc.block_size) + "_" + + std::to_string(this->tile_desc.m_per_block) + "_" + + std::to_string(this->tile_desc.n_per_block) + "_" + + std::to_string(this->tile_desc.k_per_block) + "_" + + std::to_string(this->tile_desc.ak1) + "_" + std::to_string(this->tile_desc.bk1) + "_" + + std::to_string(this->tile_desc.m_per_XDL) + "_" + + std::to_string(this->tile_desc.n_per_XDL) + "_" + + std::to_string(this->tile_desc.m_Xdl_per_wave) + "_" + + std::to_string(this->tile_desc.n_Xdl_per_wave)}, + {"NumDim", std::to_string(this->NumDim)}, + {"LayoutA", ToString(this->A.layout)}, + {"LayoutB", ToString(this->B.layout)}, + {"LayoutDs", + MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.layout); }))}, + {"LayoutE", ToString(this->E.layout)}, + {"ADataType", ToString(this->A.element)}, + {"BDataType", ToString(this->B.element)}, + {"AccDataType", ToString(this->acc)}, + {"ComputeDataType", ToString(this->A.element)}, + {"CShuffleDataType", ToString(this->cs_type)}, + {"DsDataType", + MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.element); }))}, + {"EDataType", ToString(this->E.element)}, + {"AElementwiseOperation", this->a_elem_op}, + {"BElementwiseOperation", this->b_elem_op}, + {"CDEElementwiseOperation", this->cde_elem_op}, + {"Prologue", this->prologue}, + {"Epilogue", this->epilogue}, + {"ConvSpecialization", this->conv_specialization}, + {"GemmSpecialization", this->gemm_specialization}, + {"NumGemmkPrefetchStage", std::to_string(this->tile_desc.num_gemmk_prefetch_stage)}, + {"BlockSize", std::to_string(this->tile_desc.block_size)}, + {"MPerBlock", std::to_string(this->tile_desc.m_per_block)}, + {"NPerBlock", std::to_string(this->tile_desc.n_per_block)}, + {"KPerBlock", std::to_string(this->tile_desc.k_per_block)}, + {"AK1", std::to_string(this->tile_desc.ak1)}, + {"BK1", std::to_string(this->tile_desc.bk1)}, + {"MPerXDL", std::to_string(this->tile_desc.m_per_XDL)}, + {"NPerXDL", std::to_string(this->tile_desc.n_per_XDL)}, + {"MXdlPerWave", std::to_string(this->tile_desc.m_Xdl_per_wave)}, + {"NXdlPerWave", std::to_string(this->tile_desc.n_Xdl_per_wave)}, + {"ABlockTransferThreadClusterLengths_AK0_M_AK1", + this->a_block_transfer.thread_cluster_length}, + {"ABlockTransferThreadClusterArrangeOrder", + this->a_block_transfer.thread_cluster_arrange_order}, + {"ABlockTransferSrcAccessOrder", this->a_block_transfer.src_access_order}, + {"ABlockTransferSrcVectorDim", std::to_string(this->a_block_transfer.src_vec_dim)}, + {"ABlockTransferSrcScalarPerVector", + 
std::to_string(this->a_block_transfer.src_scalar_per_vector)}, + {"ABlockTransferDstScalarPerVector_AK1", + std::to_string(this->a_block_transfer.dst_scalar_per_vector_k1)}, + {"ABlockLdsExtraM", std::to_string(this->a_block_transfer.lds_add_extra_dim)}, + {"BBlockTransferThreadClusterLengths_BK0_N_BK1", + this->b_block_transfer.thread_cluster_length}, + {"BBlockTransferThreadClusterArrangeOrder", + this->b_block_transfer.thread_cluster_arrange_order}, + {"BBlockTransferSrcAccessOrder", this->b_block_transfer.src_access_order}, + {"BBlockTransferSrcVectorDim", std::to_string(this->b_block_transfer.src_vec_dim)}, + {"BBlockTransferSrcScalarPerVector", + std::to_string(this->b_block_transfer.src_scalar_per_vector)}, + {"BBlockTransferDstScalarPerVector_BK1", + std::to_string(this->b_block_transfer.dst_scalar_per_vector_k1)}, + {"BBlockLdsExtraN", std::to_string(this->b_block_transfer.lds_add_extra_dim)}, + {"CShuffleMXdlPerWavePerShuffle", + std::to_string(this->cshuffle.m_Xdl_per_wave_per_shuffle)}, + {"CShuffleNXdlPerWavePerShuffle", + std::to_string(this->cshuffle.n_Xdl_per_wave_per_shuffle)}, + {"CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock", + this->c_block_transfer.cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl}, + {"CDEBlockTransferScalarPerVector_NPerBlock", + std::to_string(this->c_block_transfer.scalar_per_vector_n_wave_n_per_Xdl)}, + }; + + return Solution{InterpolateString(CopyDevice_ConvTemplate, values), std::move(values)}; +} + +} // namespace conv +} // namespace host +} // namespace ck diff --git a/codegen/src/headers.cpp b/codegen/src/headers.cpp index 6fcb94cdbd..f685aca044 100644 --- a/codegen/src/headers.cpp +++ b/codegen/src/headers.cpp @@ -14,4 +14,4 @@ std::unordered_map GetHeaders() } } // namespace host -} // namespace ck \ No newline at end of file +} // namespace ck diff --git a/codegen/src/types.cpp b/codegen/src/types.cpp index d43df73f33..a8a8b10c04 100644 --- a/codegen/src/types.cpp +++ b/codegen/src/types.cpp @@ -29,12 +29,20 @@ std::string ToString(DataType dt) throw std::runtime_error("Incorrect data type"); } +Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; } + std::string ToString(Layout dl) { switch(dl) { case Layout::Row: return "ck::tensor_layout::gemm::RowMajor"; case Layout::Column: return "ck::tensor_layout::gemm::ColumnMajor"; + case Layout::GKCYX: return "ck::tensor_layout::convolution::GKCYX"; + case Layout::GKYXC: return "ck::tensor_layout::convolution::GKYXC"; + case Layout::GNHWK: return "ck::tensor_layout::convolution::GNHWK"; + case Layout::GNHWC: return "ck::tensor_layout::convolution::GNHWC"; + case Layout::NHWGC: return "ck::tensor_layout::convolution::NHWGC"; + case Layout::NHWGK: return "ck::tensor_layout::convolution::NHWGK"; } throw std::runtime_error("Incorrect layout"); } diff --git a/codegen/src/utils.cpp b/codegen/src/utils.cpp index cd6700c489..19627d4cf6 100644 --- a/codegen/src/utils.cpp +++ b/codegen/src/utils.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/host/utils.hpp" diff --git a/codegen/test/CMakeLists.txt b/codegen/test/CMakeLists.txt index 897cce1c94..f891286019 100644 --- a/codegen/test/CMakeLists.txt +++ b/codegen/test/CMakeLists.txt @@ -1,11 +1,13 @@ - list(APPEND CMAKE_PREFIX_PATH /opt/rocm) add_subdirectory(rtc) - file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp) foreach(TEST_SRC ${TEST_SRCS}) -get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE) -rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC}) -target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host) -target_include_directories(test_host_${BASE_NAME} PUBLIC include()) + set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP) + get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE) + rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC}) + target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host) + # target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a) + target_include_directories(test_host_${BASE_NAME} PUBLIC include()) + target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/include) + target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include) endforeach() diff --git a/codegen/test/common.hpp b/codegen/test/common.hpp new file mode 100644 index 0000000000..99d4c64973 --- /dev/null +++ b/codegen/test/common.hpp @@ -0,0 +1,134 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +std::vector get_headers_for_test() +{ + std::vector result; + auto hs = ck::host::GetHeaders(); + std::transform( + hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file { + return {p.first, p.second}; + }); + return result; +} + +template +std::size_t GetSize(V mLens, V mStrides) +{ + std::size_t space = 1; + for(std::size_t i = 0; i < mLens.Size(); ++i) + { + if(mLens[i] == 0) + continue; + + space += (mLens[i] - 1) * mStrides[i]; + } + return space; +} + +template +rtc::buffer generate_buffer(V mLens, V mStrides, std::size_t seed = 0) +{ + std::size_t space = GetSize(mLens, mStrides); + rtc::buffer result(space); + std::mt19937 gen(seed); + std::uniform_real_distribution dis(-1.0); + std::generate(result.begin(), result.end(), [&] { return dis(gen); }); + // std::fill(result.begin(), result.end(), 1); + return result; +} + +template +bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01) +{ + return std::equal(a.begin(), a.end(), b.begin(), b.end(), [&](double x, double y) { + return fabs(x - y) < atol + rtol * fabs(y); + }); +} + +std::string classify(double x) +{ + switch(std::fpclassify(x)) + { + case FP_INFINITE: return "inf"; + case FP_NAN: return "nan"; + case FP_NORMAL: return "normal"; + case FP_SUBNORMAL: return "subnormal"; + case FP_ZERO: return "zero"; + default: return "unknown"; + } +} + +template +void print_classification(const Buffer& x) +{ + std::unordered_set result; + for(const auto& i : x) + result.insert(classify(i)); + for(const auto& c : result) + std::cout << c << ", "; + std::cout << std::endl; +} + +template +void print_statistics(const Buffer& x) +{ + std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", "; + std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", "; + double num_elements = x.size(); + auto mean = + std::accumulate(x.begin(), x.end(), double{0.0}, std::plus{}) / num_elements; + auto stddev = std::sqrt( + std::accumulate(x.begin(), + x.end(), + double{0.0}, + [&](double r, double v) { return r + 
std::pow((v - mean), 2.0); }) / + num_elements); + std::cout << "Mean: " << mean << ", "; + std::cout << "StdDev: " << stddev << "\n"; +} + +template +void print_preview(const Buffer& x) +{ + if(x.size() <= 10) + { + std::for_each(x.begin(), x.end(), [&](double i) { std::cout << i << ", "; }); + } + else + { + std::for_each(x.begin(), x.begin() + 5, [&](double i) { std::cout << i << ", "; }); + std::cout << "..., "; + std::for_each(x.end() - 5, x.end(), [&](double i) { std::cout << i << ", "; }); + } + std::cout << std::endl; +} + +template +struct check_all +{ + rtc::buffer data{}; + bool operator()(const rtc::buffer& x) + { + if(data.empty()) + { + data = x; + return true; + } + return allclose(data, x); + } +}; + +template +auto report(const Solution& solution, bool pass) +{ + return test::make_predicate(solution.ToTemplateString(), [=] { return pass; }); +} diff --git a/codegen/test/gemm_multiple_d.cpp b/codegen/test/gemm_multiple_d.cpp index 17b659993a..bd7ef463fb 100644 --- a/codegen/test/gemm_multiple_d.cpp +++ b/codegen/test/gemm_multiple_d.cpp @@ -10,6 +10,7 @@ #include #include #include +#include using half = _Float16; // using half = __fp16; @@ -159,7 +160,10 @@ TEST_CASE(test_problem_kernel) auto b = to_gpu(generate_buffer(1024 * 1024, 1)); auto c = to_gpu(generate_buffer(1024 * 1024, 2)); - for(auto solution : prob.GetSolutions("gfx90a")) + std::string epilogue = ""; + std::string prologue = ""; + + for(auto solution : prob.GetSolutions("gfx90a", prologue, epilogue)) { auto src = ck::host::InterpolateString(gemm_compile_check, {{"include", prob.GetIncludeHeader()}, @@ -178,6 +182,7 @@ TEST_CASE(test_problem_kernel) auto grid_size = ck::host::integer_divide_ceil(prob.M, m_per_block) * ck::host::integer_divide_ceil(prob.N, n_per_block); k.launch(nullptr, grid_size * block_size, block_size)(a.data(), b.data(), c.data()); + CHECK(report(solution, check(rtc::from_gpu(c)))); } } diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp new file mode 100644 index 0000000000..3c477692e5 --- /dev/null +++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp @@ -0,0 +1,209 @@ +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" +#include "ck/host/headers.hpp" +#include "ck/host/stringutils.hpp" +#include "ck/host/utils.hpp" +#include "ck/tensor_operation/gpu/device/helper.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include +#include +#include +#include "common.hpp" +#include + +// Need this for verification +/**struct Epilogue +{ + Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +};**/ +const std::string conv_compile_check = R"__ck__( +#include <${include}> + +${template}; + +)__ck__"; + +TEST_CASE(test_problem_kernel) +{ + // set up problem specification + ck::host::conv::Problem_Conv_Fwd prob; + prob.NumDim = 2; + prob.G = 32; + prob.N = 256; + prob.C = 32; + prob.K = 64; + prob.Y = 3; + prob.X = 3; + prob.Hi = 28; + prob.Wi = 28; + prob.Ho = 28; + prob.Wo = 28; + check_all check; + + // user provided fusion operations + std::string 
epilogue = R"( +struct Epilogue +{ + __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +}; +)"; + std::string prologue = ""; + + // length+stride arrays + ck::Array in_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.C), + static_cast(prob.Hi), + static_cast(prob.Wi)}; + ck::Array out_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.K), + static_cast(prob.Ho), + static_cast(prob.Wo)}; + ck::Array wei_lengths{static_cast(prob.G), + static_cast(prob.K), + static_cast(prob.C), + static_cast(prob.Y), + static_cast(prob.X)}; + ck::Array d_lengths = {}; + + ck::Array in_strides{static_cast(prob.C), + static_cast(prob.Hi * prob.Wi * prob.G * prob.C), + 1, + static_cast(prob.Wi * prob.G * prob.C), + static_cast(prob.G * prob.C)}; + ck::Array out_strides{static_cast(prob.K), + static_cast(prob.Ho * prob.Wo * prob.G * prob.K), + 1, + static_cast(prob.Wo * prob.G * prob.K), + static_cast(prob.G * prob.K)}; + ck::Array wei_strides{static_cast(prob.K * prob.Y * prob.X * prob.C), + static_cast(prob.Y * prob.X * prob.C), + 1, + static_cast(prob.X * prob.C), + static_cast(prob.C)}; + ck::Array d_strides = {}; + + ck::Array conv_filter_strides = {2, 2}; + ck::Array conv_filter_dilations = {1, 1}; + ck::Array input_left_pads = {1, 1}; + ck::Array input_right_pads = {1, 1}; + + // move the data onto the device + auto in_dev = + to_gpu(generate_buffer>(in_lengths, in_strides, 0)); + auto wei_dev = + to_gpu(generate_buffer>(wei_lengths, wei_strides, 1)); + auto out_dev = + to_gpu(generate_buffer>(out_lengths, out_strides, 2)); + + // CK Verficiation: Reference Kernel + /**bool pass = true; + Tensor in_host(in_lengths, in_strides); + in_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor wei_host(wei_lengths, wei_strides); + wei_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor out_host(out_lengths, out_strides); + + std::vector conv_filter_strides_ = {2, 2}; + std::vector conv_filter_dilations_ = {1, 1}; + std::vector input_left_pads_ = {1, 1}; + std::vector input_right_pads_ = {1, 1}; + + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd< + 2, + ck::half_t, + ck::half_t, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + Epilogue>(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in_host, + wei_host, + out_host, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + Epilogue{1.0f, 1.0f}); + out_host.SetZero(); + ref_invoker.Run(ref_argument);**/ + + for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue)) + { + // substitute instance values into the template + auto src = ck::host::InterpolateString( + conv_compile_check, + {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}}); + + auto srcs = get_headers_for_test(); + srcs.push_back({"main.cpp", src}); + rtc::compile_options options; + auto name = solution.GetTemplateParameter("name"); + options.kernel_name = "run_" + name; + auto k = rtc::compile_kernel(srcs, options); + 
+ // Grid size calculation + auto block_size = solution.GetTemplateParameter("BlockSize"); + + auto tmp = get_launch_params(solution, out_lengths, out_strides); + + auto grid_size = tmp * in_lengths[1]; + + // launch the kernel with arguments needed for the argument pointer + k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), + wei_dev.data(), + out_dev.data(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + // auto res = rtc::from_gpu(out_dev); + // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + // assert(pass); + + // Simple check: this checks that the output from each instance matches the output from the + // first instance + CHECK(report(solution, check(rtc::from_gpu(out_dev)))); + } +} + +int main(int argc, const char* argv[]) { test::run(argc, argv); } diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp new file mode 100644 index 0000000000..ec9bd2b781 --- /dev/null +++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp @@ -0,0 +1,209 @@ +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" +#include "ck/host/headers.hpp" +#include "ck/host/stringutils.hpp" +#include "ck/host/utils.hpp" +#include "common.hpp" +#include "ck/tensor_operation/gpu/device/helper.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include +#include +#include +#include + +// need this for validation +/**struct Epilogue +{ + Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +};**/ +const std::string conv_compile_check = R"__ck__( +#include <${include}> + +${template}; + +)__ck__"; + +TEST_CASE(test_problem_kernel) +{ + // set up problem specification + ck::host::conv::Problem_Conv_Fwd prob; + prob.NumDim = 2; + prob.G = 32; + prob.N = 256; + prob.C = 32; + prob.K = 64; + prob.Y = 3; + prob.X = 3; + prob.Hi = 28; + prob.Wi = 28; + prob.Ho = 28; + prob.Wo = 28; + check_all check; + + // user provided fusion operations + std::string epilogue = R"( +struct Epilogue +{ + __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +}; +)"; + std::string prologue = ""; + + // length+stride arrays + ck::Array in_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.C), + static_cast(prob.Hi), + static_cast(prob.Wi)}; + ck::Array out_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.K), + static_cast(prob.Ho), + static_cast(prob.Wo)}; + ck::Array wei_lengths{static_cast(prob.G), + static_cast(prob.K), + static_cast(prob.C), + static_cast(prob.Y), + static_cast(prob.X)}; + ck::Array d_lengths = {}; + + ck::Array 
in_strides{static_cast(prob.C), + static_cast(prob.Hi * prob.Wi * prob.G * prob.C), + 1, + static_cast(prob.Wi * prob.G * prob.C), + static_cast(prob.G * prob.C)}; + ck::Array out_strides{static_cast(prob.K), + static_cast(prob.Ho * prob.Wo * prob.G * prob.K), + 1, + static_cast(prob.Wo * prob.G * prob.K), + static_cast(prob.G * prob.K)}; + ck::Array wei_strides{static_cast(prob.K * prob.Y * prob.X * prob.C), + static_cast(prob.Y * prob.X * prob.C), + 1, + static_cast(prob.X * prob.C), + static_cast(prob.C)}; + ck::Array d_strides = {}; + + ck::Array conv_filter_strides = {1, 1}; + ck::Array conv_filter_dilations = {1, 1}; + ck::Array input_left_pads = {0, 0}; + ck::Array input_right_pads = {0, 0}; + + // move the data onto the device + auto in_dev = + to_gpu(generate_buffer>(in_lengths, in_strides, 0)); + auto wei_dev = + to_gpu(generate_buffer>(wei_lengths, wei_strides, 1)); + auto out_dev = + to_gpu(generate_buffer>(out_lengths, out_strides, 2)); + + // CK Verficiation: Reference Kernel + /**bool pass = true; + Tensor in_host(in_lengths, in_strides); + in_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor wei_host(wei_lengths, wei_strides); + wei_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor out_host(out_lengths, out_strides); + + std::vector conv_filter_strides_ = {1, 1}; + std::vector conv_filter_dilations_ = {1, 1}; + std::vector input_left_pads_ = {0, 0}; + std::vector input_right_pads_ = {0, 0}; + + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd< + 2, + ck::half_t, + ck::half_t, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + Epilogue>(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in_host, + wei_host, + out_host, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + Epilogue{1.0f, 1.0f}); + out_host.SetZero(); + ref_invoker.Run(ref_argument);**/ + + for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue)) + { + // substitute instance values into the template + auto src = ck::host::InterpolateString( + conv_compile_check, + {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}}); + + auto srcs = get_headers_for_test(); + srcs.push_back({"main.cpp", src}); + rtc::compile_options options; + auto name = solution.GetTemplateParameter("name"); + options.kernel_name = "run_" + name; + auto k = rtc::compile_kernel(srcs, options); + + // Grid size calculation + auto block_size = solution.GetTemplateParameter("BlockSize"); + + auto tmp = get_launch_params(solution, out_lengths, out_strides); + + auto grid_size = tmp * in_lengths[1]; + + // launch the kernel with arguments needed for the argument pointer + k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), + wei_dev.data(), + out_dev.data(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + // auto res = rtc::from_gpu(out_dev); + // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + // assert(pass); + + // Simple check: this checks that the output from each instance matches the output from the + // first instance + CHECK(report(solution, check(rtc::from_gpu(out_dev)))); + } +} + +int main(int argc, const char* argv[]) { 
test::run(argc, argv); } diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp new file mode 100644 index 0000000000..9850184c5e --- /dev/null +++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp @@ -0,0 +1,209 @@ +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" +#include "ck/host/headers.hpp" +#include "ck/host/stringutils.hpp" +#include "ck/host/utils.hpp" +#include "ck/tensor_operation/gpu/device/helper.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "common.hpp" +#include +#include +#include +#include + +// need this for verification +/**struct Epilogue +{ + Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +};**/ +const std::string conv_compile_check = R"__ck__( +#include <${include}> + +${template}; + +)__ck__"; + +TEST_CASE(test_problem_kernel) +{ + // set up problem specification + ck::host::conv::Problem_Conv_Fwd prob; + prob.NumDim = 2; + prob.G = 32; + prob.N = 256; + prob.C = 32; + prob.K = 64; + prob.Y = 3; + prob.X = 3; + prob.Hi = 28; + prob.Wi = 28; + prob.Ho = 28; + prob.Wo = 28; + check_all check; + + // user provided fusion operations + std::string epilogue = R"( +struct Epilogue +{ + __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +}; +)"; + std::string prologue = ""; + + // length+stride arrays + ck::Array in_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.C), + static_cast(prob.Hi), + static_cast(prob.Wi)}; + ck::Array out_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.K), + static_cast(prob.Ho), + static_cast(prob.Wo)}; + ck::Array wei_lengths{static_cast(prob.G), + static_cast(prob.K), + static_cast(prob.C), + static_cast(prob.Y), + static_cast(prob.X)}; + ck::Array d_lengths = {}; + + ck::Array in_strides{static_cast(prob.C), + static_cast(prob.Hi * prob.Wi * prob.G * prob.C), + 1, + static_cast(prob.Wi * prob.G * prob.C), + static_cast(prob.G * prob.C)}; + ck::Array out_strides{static_cast(prob.K), + static_cast(prob.Ho * prob.Wo * prob.G * prob.K), + 1, + static_cast(prob.Wo * prob.G * prob.K), + static_cast(prob.G * prob.K)}; + ck::Array wei_strides{static_cast(prob.K * prob.Y * prob.X * prob.C), + static_cast(prob.Y * prob.X * prob.C), + 1, + static_cast(prob.X * prob.C), + static_cast(prob.C)}; + ck::Array d_strides = {}; + + ck::Array conv_filter_strides = {2, 2}; + ck::Array conv_filter_dilations = {1, 1}; + ck::Array input_left_pads = {0, 0}; + ck::Array input_right_pads = {0, 0}; + + // move the data onto the device + auto in_dev = + to_gpu(generate_buffer>(in_lengths, in_strides, 0)); + auto wei_dev = + to_gpu(generate_buffer>(wei_lengths, wei_strides, 1)); + auto out_dev = + to_gpu(generate_buffer>(out_lengths, 
out_strides, 2)); + + // CK Verficiation: Reference Kernel + /**bool pass = true; + Tensor in_host(in_lengths, in_strides); + in_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor wei_host(wei_lengths, wei_strides); + wei_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor out_host(out_lengths, out_strides); + + std::vector conv_filter_strides_ = {2, 2}; + std::vector conv_filter_dilations_ = {1, 1}; + std::vector input_left_pads_ = {0, 0}; + std::vector input_right_pads_ = {0, 0}; + + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd< + 2, + ck::half_t, + ck::half_t, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + Epilogue>(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in_host, + wei_host, + out_host, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + Epilogue{1.0f, 1.0f}); + out_host.SetZero(); + ref_invoker.Run(ref_argument);**/ + + for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue)) + { + // substitute instance values into the template + auto src = ck::host::InterpolateString( + conv_compile_check, + {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}}); + + auto srcs = get_headers_for_test(); + srcs.push_back({"main.cpp", src}); + rtc::compile_options options; + auto name = solution.GetTemplateParameter("name"); + options.kernel_name = "run_" + name; + auto k = rtc::compile_kernel(srcs, options); + + // Grid size calculation + auto block_size = solution.GetTemplateParameter("BlockSize"); + + auto tmp = get_launch_params(solution, out_lengths, out_strides); + + auto grid_size = tmp * in_lengths[1]; + + // launch the kernel with arguments needed for the argument pointer + k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), + wei_dev.data(), + out_dev.data(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + // auto res = rtc::from_gpu(out_dev); + // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + // assert(pass); + + // Simple check: this checks that the output from each instance matches the output from the + // first instance + CHECK(report(solution, check(rtc::from_gpu(out_dev)))); + } +} + +int main(int argc, const char* argv[]) { test::run(argc, argv); } diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp new file mode 100644 index 0000000000..907f744db4 --- /dev/null +++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp @@ -0,0 +1,209 @@ +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" +#include "ck/host/headers.hpp" +#include "ck/host/stringutils.hpp" +#include "ck/host/utils.hpp" +#include "ck/tensor_operation/gpu/device/helper.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "common.hpp" +#include +#include +#include +#include + +// need this for verification +/**struct Epilogue +{ + Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, 
const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +};**/ +const std::string conv_compile_check = R"__ck__( +#include <${include}> + +${template}; + +)__ck__"; + +TEST_CASE(test_problem_kernel) +{ + // set up problem specification + ck::host::conv::Problem_Conv_Fwd prob; + prob.NumDim = 2; + prob.G = 32; + prob.N = 256; + prob.C = 32; + prob.K = 64; + prob.Y = 3; + prob.X = 3; + prob.Hi = 28; + prob.Wi = 28; + prob.Ho = 28; + prob.Wo = 28; + check_all check; + + // user provided fusion operations + std::string epilogue = R"( +struct Epilogue +{ + __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +}; +)"; + std::string prologue = ""; + + // length+stride arrays + ck::Array in_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.C), + static_cast(prob.Hi), + static_cast(prob.Wi)}; + ck::Array out_lengths{static_cast(prob.G), + static_cast(prob.N), + static_cast(prob.K), + static_cast(prob.Ho), + static_cast(prob.Wo)}; + ck::Array wei_lengths{static_cast(prob.G), + static_cast(prob.K), + static_cast(prob.C), + static_cast(prob.Y), + static_cast(prob.X)}; + ck::Array d_lengths = {}; + + ck::Array in_strides{static_cast(prob.C), + static_cast(prob.Hi * prob.Wi * prob.G * prob.C), + 1, + static_cast(prob.Wi * prob.G * prob.C), + static_cast(prob.G * prob.C)}; + ck::Array out_strides{static_cast(prob.K), + static_cast(prob.Ho * prob.Wo * prob.G * prob.K), + 1, + static_cast(prob.Wo * prob.G * prob.K), + static_cast(prob.G * prob.K)}; + ck::Array wei_strides{static_cast(prob.K * prob.Y * prob.X * prob.C), + static_cast(prob.Y * prob.X * prob.C), + 1, + static_cast(prob.X * prob.C), + static_cast(prob.C)}; + ck::Array d_strides = {}; + + ck::Array conv_filter_strides = {1, 1}; + ck::Array conv_filter_dilations = {1, 1}; + ck::Array input_left_pads = {1, 1}; + ck::Array input_right_pads = {1, 1}; + + // move the data onto the device + auto in_dev = + to_gpu(generate_buffer>(in_lengths, in_strides, 0)); + auto wei_dev = + to_gpu(generate_buffer>(wei_lengths, wei_strides, 1)); + auto out_dev = + to_gpu(generate_buffer>(out_lengths, out_strides, 2)); + + // CK Verficiation: Reference Kernel + /**bool pass = true; + Tensor in_host(in_lengths, in_strides); + in_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor wei_host(wei_lengths, wei_strides); + wei_host.GenerateTensorValue(GeneratorTensor_1{1}); + Tensor out_host(out_lengths, out_strides); + + std::vector conv_filter_strides_ = {1, 1}; + std::vector conv_filter_dilations_ = {1, 1}; + std::vector input_left_pads_ = {1, 1}; + std::vector input_right_pads_ = {1, 1}; + + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd< + 2, + ck::half_t, + ck::half_t, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + Epilogue>(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in_host, + wei_host, + out_host, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + 
ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + Epilogue{1.0f, 1.0f}); + out_host.SetZero(); + ref_invoker.Run(ref_argument);**/ + + for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue)) + { + // substitute instance values into the template + auto src = ck::host::InterpolateString( + conv_compile_check, + {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}}); + + auto srcs = get_headers_for_test(); + srcs.push_back({"main.cpp", src}); + rtc::compile_options options; + auto name = solution.GetTemplateParameter("name"); + options.kernel_name = "run_" + name; + auto k = rtc::compile_kernel(srcs, options); + + // Grid size calculation + auto block_size = solution.GetTemplateParameter("BlockSize"); + + auto tmp = get_launch_params(solution, out_lengths, out_strides); + + auto grid_size = tmp * in_lengths[1]; + + // launch the kernel with arguments needed for the argument pointer + k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), + wei_dev.data(), + out_dev.data(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + // auto res = rtc::from_gpu(out_dev); + // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + // assert(pass); + + // Simple check: this checks that the output from each instance matches the output from the + // first instance + CHECK(report(solution, check(rtc::from_gpu(out_dev)))); + } +} + +int main(int argc, const char* argv[]) { test::run(argc, argv); } diff --git a/codegen/test/rtc/src/compile_kernel.cpp b/codegen/test/rtc/src/compile_kernel.cpp index 7ea55b9328..d84ebf4de9 100644 --- a/codegen/test/rtc/src/compile_kernel.cpp +++ b/codegen/test/rtc/src/compile_kernel.cpp @@ -56,6 +56,8 @@ void write_string(const std::string& filename, const std::string_view& buffer) } std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only"; } +// TODO: undo after extracting the codeobj +// std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip"; } kernel compile_kernel(const std::vector& srcs, compile_options options) { @@ -89,6 +91,12 @@ kernel compile_kernel(const std::vector& srcs, compile_options options auto obj = read_buffer(out_path.string()); + std::ofstream ofh("obj.o", std::ios::binary); + for(auto i : obj) + ofh << i; + ofh.close(); + // int s = std::system(("/usr/bin/cp " + out_path.string() + " codeobj.bin").c_str()); + // assert(s == 0); return kernel{obj.data(), options.kernel_name}; } diff --git a/codegen/test/rtc/src/hip.cpp b/codegen/test/rtc/src/hip.cpp index 10e38c9adb..747f83e3ba 100644 --- a/codegen/test/rtc/src/hip.cpp +++ b/codegen/test/rtc/src/hip.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace rtc { @@ -49,7 +50,10 @@ std::size_t get_available_gpu_memory() size_t total; auto status = hipMemGetInfo(&free, &total); if(status != hipSuccess) - throw std::runtime_error("Failed getting available memory: " + hip_error(status)); + { + std::cerr << "Failed getting available memory: " + hip_error(status) << std::endl; + return (8ull * 1024ull * 1024ull * 1024ull); + } return free; } diff --git a/include/ck/tensor_operation/gpu/device/helper.hpp b/include/ck/tensor_operation/gpu/device/helper.hpp new file mode 100644 index 0000000000..c52566509f --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/helper.hpp @@ -0,0 +1,359 @@ 
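The helper header added below exists to bridge a runtime/compile-time gap: the generated template parameters arrive as strings (for example "ck::tensor_operation::device::GemmSpecialization::MNKPadding"), while CK's launch-parameter machinery wants enum values and template arguments. Every function in the header is therefore a string-to-enum or value-to-template dispatch chain. Reduced to a toy example (all names here are invented for illustration, not CK API), the pattern is:

#include <stdexcept>
#include <string>

enum class TilePreference { Small, Large };

// string -> enum, mirroring gemm_type()/conv_type() below
TilePreference tile_from_string(const std::string& s)
{
    if(s == "Small") return TilePreference::Small;
    if(s == "Large") return TilePreference::Large;
    throw std::runtime_error("Unknown tile preference: " + s);
}

// enum -> compile-time template argument, mirroring block_2_etile() below
template <int TileSize>
int tiles_needed(int extent) { return (extent + TileSize - 1) / TileSize; }

int dispatch_tiles(TilePreference p, int extent)
{
    switch(p)
    {
    case TilePreference::Small: return tiles_needed<64>(extent);
    case TilePreference::Large: return tiles_needed<128>(extent);
    }
    throw std::runtime_error("unreachable");
}

This is also why block_2_etile in the header enumerates every supported MPerBlock/NPerBlock pair by hand: each pair is a distinct template instantiation, so the runtime value must be matched against a fixed list.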
+#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include +#include + +// functions to return the corresponding structs based on generated template parameters + +using layouts = std::variant; +// return the layout type: currently this is the only type supported in MIOpen +auto layout_type(std::string type) +{ + if(type == "ck::tensor_layout::convolution::NHWGK") + { + return ck::tensor_layout::convolution::NHWGK{}; + } + throw std::runtime_error("Incorrect layout"); +} +// return the right gemm spec based on the generated template parameters +ck::tensor_operation::device::GemmSpecialization gemm_type(std::string type) +{ + if(type == "ck::tensor_operation::device::GemmSpecialization::Default") + { + return ck::tensor_operation::device::GemmSpecialization::Default; + } + if(type == "ck::tensor_operation::device::GemmSpecialization::MNKPadding") + { + return ck::tensor_operation::device::GemmSpecialization::MNKPadding; + } + throw std::runtime_error("Incorrect gemm spec: " + type); +} + +// return the type of convolution +ck::tensor_operation::device::ConvolutionForwardSpecialization conv_type(std::string type) +{ + if(type == "ck::tensor_operation::device::ConvolutionForwardSpecialization::Default") + { + return ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + } + if(type == "ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0") + { + return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + } + if(type == + "ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0") + { + return ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + } + if(type == "ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC") + { + return ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + } + throw std::runtime_error("Incorrect conv spec: " + type); +} + +// Function to call on MatrixPadder via a wrapper struct +// NOTE: CK only uses MNKPadding for forward convolution +template +auto pad(ck::index_t mpb, + ck::index_t npb, + ck::index_t kpb, + ck::tensor_operation::device::GemmSpecialization gemm, + CDesc_MRaw_NRaw conv) +{ + if(gemm == ck::tensor_operation::device::GemmSpecialization::MNKPadding) + { + ck::tensor_operation::device::MatrixPadder< + ck::tensor_operation::device::GemmSpecialization::MNKPadding, + ck::index_t, + ck::index_t, + ck::index_t> + a; + a.MPerTile_ = mpb; + a.NPerTile_ = npb; + a.KPerTile_ = kpb; + auto tmp = grid_desc(a, conv); + return tmp; + } + throw std::runtime_error("Incorrect template parameters, check gemm spec"); +} + +// Functions to call on TransformConvFwdToGemm through wrapper: different functions based on num +// dims +// FIXME: add a way to properly pass in the layout +auto transform_conv(ck::index_t num_dim, + ck::tensor_operation::device::ConvolutionForwardSpecialization spec, + ck::Array out_lengths, + ck::Array out_strides) +{ + if(num_dim == 2 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Default) + { + ck::tensor_operation::TransformConvFwdToGemm< + 2, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default> + conv_fwd; + + auto res = 
ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 2 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + ck::tensor_operation::TransformConvFwdToGemm< + 2, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 2 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + ck::tensor_operation::TransformConvFwdToGemm< + 2, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 2 && spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC) + { + ck::tensor_operation::TransformConvFwdToGemm< + 2, + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + throw std::runtime_error("Incorrect conv spec"); +} + +auto transform_conv_3d(ck::index_t num_dim, + ck::tensor_operation::device::ConvolutionForwardSpecialization spec, + ck::Array out_lengths, + ck::Array out_strides) +{ + if(num_dim == 3 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Default) + { + ck::tensor_operation::TransformConvFwdToGemm< + 3, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 3 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + ck::tensor_operation::TransformConvFwdToGemm< + 3, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 3 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + ck::tensor_operation::TransformConvFwdToGemm< + 3, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 3 && spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC) + { + ck::tensor_operation::TransformConvFwdToGemm< + 3, + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + throw std::runtime_error("Incorrect conv spec"); +} + +auto transform_conv_1d(ck::index_t num_dim, + ck::tensor_operation::device::ConvolutionForwardSpecialization spec, + ck::Array out_lengths, + ck::Array out_strides) +{ + if(num_dim == 1 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Default) + { + ck::tensor_operation::TransformConvFwdToGemm< + 1, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return 
res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 1 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + ck::tensor_operation::TransformConvFwdToGemm< + 1, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 1 && + spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + ck::tensor_operation::TransformConvFwdToGemm< + 1, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + if(num_dim == 1 && spec == ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC) + { + ck::tensor_operation::TransformConvFwdToGemm< + 1, + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC> + conv_fwd; + + auto res = ck::tensor_operation::TransformConv(); + return res.transform_func(out_lengths, out_strides, conv_fwd); + } + throw std::runtime_error("Incorrect dims or conv spec"); +} + +template +auto block_2_etile(ck::index_t m_per_block, ck::index_t n_per_block, CGridDesc_M_N matrix_padder) +{ + if(m_per_block == 32 && n_per_block == 64) + { + auto b2e = ck::BlockToCTileMap_M00_N0_M01Adapt<32, 64, CGridDesc_M_N>(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 32 && n_per_block == 128) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<32, 128, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 64 && n_per_block == 32) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<64, 32, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 64 && n_per_block == 64) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<64, 64, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 64 && n_per_block == 128) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<64, 128, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 128 && n_per_block == 32) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<128, 32, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 128 && n_per_block == 64) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<128, 64, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 128 && n_per_block == 128) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<128, 128, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 128 && n_per_block == 256) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<128, 256, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + if(m_per_block == 256 && n_per_block == 128) + { + ck::BlockToCTileMap_M00_N0_M01Adapt<256, 128, CGridDesc_M_N> b2e(matrix_padder); + return b2e.CalculateGridSize(matrix_padder); + } + throw std::runtime_error("Incorrect template parameters"); +} + +// wrapper functions by dims to get grid size - uses above 3 functions +// TODO: eventually remove the 1d/2d versions as CK will only support 3d convolutions +auto get_launch_params_1d(ck::host::Solution solution, + ck::Array out_lengths, + ck::Array out_strides) +{ + 
auto num_dim     = solution.GetTemplateParameter<ck::index_t>("NumDim");
+    auto m_per_block = solution.GetTemplateParameter<ck::index_t>("MPerBlock");
+    auto n_per_block = solution.GetTemplateParameter<ck::index_t>("NPerBlock");
+    auto k_per_block = solution.GetTemplateParameter<ck::index_t>("KPerBlock");
+    auto GemmType    = solution.GetTemplateParameter<std::string>("GemmSpecialization");
+    auto ConvType    = solution.GetTemplateParameter<std::string>("ConvSpecialization");
+    ck::tensor_operation::device::GemmSpecialization GemmSpec               = gemm_type(GemmType);
+    ck::tensor_operation::device::ConvolutionForwardSpecialization ConvSpec = conv_type(ConvType);
+    auto conv_to_gemm_transformer = transform_conv_1d(num_dim, ConvSpec, out_lengths, out_strides);
+    auto matrix_padder =
+        pad(m_per_block, n_per_block, k_per_block, GemmSpec, conv_to_gemm_transformer);
+    auto b2e = block_2_etile(m_per_block, n_per_block, matrix_padder);
+    return b2e;
+}
+
+auto get_launch_params(ck::host::Solution solution,
+                       ck::Array<ck::index_t, 5> out_lengths,
+                       ck::Array<ck::index_t, 5> out_strides)
+{
+    auto num_dim     = solution.GetTemplateParameter<ck::index_t>("NumDim");
+    auto m_per_block = solution.GetTemplateParameter<ck::index_t>("MPerBlock");
+    auto n_per_block = solution.GetTemplateParameter<ck::index_t>("NPerBlock");
+    auto k_per_block = solution.GetTemplateParameter<ck::index_t>("KPerBlock");
+    auto GemmType    = solution.GetTemplateParameter<std::string>("GemmSpecialization");
+    auto ConvType    = solution.GetTemplateParameter<std::string>("ConvSpecialization");
+    ck::tensor_operation::device::GemmSpecialization GemmSpec               = gemm_type(GemmType);
+    ck::tensor_operation::device::ConvolutionForwardSpecialization ConvSpec = conv_type(ConvType);
+    auto conv_to_gemm_transformer = transform_conv(num_dim, ConvSpec, out_lengths, out_strides);
+    auto matrix_padder =
+        pad(m_per_block, n_per_block, k_per_block, GemmSpec, conv_to_gemm_transformer);
+    auto b2e = block_2_etile(m_per_block, n_per_block, matrix_padder);
+    return b2e;
+}
+
+auto get_launch_params_3d(ck::host::Solution solution,
+                          ck::Array<ck::index_t, 6> out_lengths,
+                          ck::Array<ck::index_t, 6> out_strides)
+{
+    auto num_dim     = solution.GetTemplateParameter<ck::index_t>("NumDim");
+    auto m_per_block = solution.GetTemplateParameter<ck::index_t>("MPerBlock");
+    auto n_per_block = solution.GetTemplateParameter<ck::index_t>("NPerBlock");
+    auto k_per_block = solution.GetTemplateParameter<ck::index_t>("KPerBlock");
+    auto GemmType    = solution.GetTemplateParameter<std::string>("GemmSpecialization");
+    auto ConvType    = solution.GetTemplateParameter<std::string>("ConvSpecialization");
+    ck::tensor_operation::device::GemmSpecialization GemmSpec               = gemm_type(GemmType);
+    ck::tensor_operation::device::ConvolutionForwardSpecialization ConvSpec = conv_type(ConvType);
+    auto conv_to_gemm_transformer = transform_conv_3d(num_dim, ConvSpec, out_lengths, out_strides);
+    auto matrix_padder =
+        pad(m_per_block, n_per_block, k_per_block, GemmSpec, conv_to_gemm_transformer);
+    auto b2e = block_2_etile(m_per_block, n_per_block, matrix_padder);
+    return b2e;
+}
diff --git a/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
new file mode 100644
index 0000000000..7ef4e7f184
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/codegen_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -0,0 +1,781 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
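Before the device implementation that follows, it is worth spelling out the arithmetic the three get_launch_params variants ultimately perform. Under the usual implicit-GEMM lowering of grouped forward convolution (assumed here; the authoritative path is TransformConvFwdToGemm, then MatrixPadder, then BlockToCTileMap above), the output GEMM has M = N * Ho * Wo and N = K, the tile map sizes the grid as the number of output tiles, and the tests then scale that tile count by a batch factor. A self-contained sketch of that arithmetic, with invented names:

#include <cstdint>

std::int64_t integer_divide_ceil(std::int64_t a, std::int64_t b) { return (a + b - 1) / b; }

// Toy model of CalculateGridSize() for an M x N output tiled into
// MPerBlock x NPerBlock work-group tiles, times the batch count.
std::int64_t grid_size(std::int64_t n, std::int64_t ho, std::int64_t wo, std::int64_t k,
                       std::int64_t m_per_block, std::int64_t n_per_block,
                       std::int64_t batch_count)
{
    const std::int64_t gemm_m = n * ho * wo; // output pixels -> GEMM rows
    const std::int64_t gemm_n = k;           // output channels -> GEMM columns
    return integer_divide_ceil(gemm_m, m_per_block) *
           integer_divide_ceil(gemm_n, n_per_block) * batch_count;
}

// For the problem used in the tests above (N=256, Ho=Wo=28, K=64), a 128x64
// tile, and the group count 32 taken as the batch factor:
// grid_size(256, 28, 28, 64, 128, 64, 32) == 1568 * 1 * 32 == 50176 work-groups.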
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for + * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the + * computing of pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). 
+ * + */ +template +__device__ void device_grouped_conv_fwd_multiple_abd_xdl_cshuffle( + AsPointer p_as_grid, + BsPointer p_bs_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, + const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_, + const Block2ETileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ + defined(__gfx94__)) + // offset base pointer for each work-group + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + const auto& ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + if constexpr(isMultiA || isMultiB) + { + AsPointer p_as_grid_grp; + BsPointer p_bs_grid_grp; + + const auto& as_batch_offset = compute_ptr_offset_of_batch.GetAsPtrOffset(g_idx); + + static constexpr index_t NumATensor = AGridDesc_AK0_M_AK1::Size(); + static_for<0, NumATensor, 1>{}( + [&](auto i) { p_as_grid_grp(i) = p_as_grid[i] + as_batch_offset[i]; }); + + const auto& bs_batch_offset = compute_ptr_offset_of_batch.GetBsPtrOffset(g_idx); + + static constexpr index_t NumBTensor = BGridDesc_BK0_N_BK1::Size(); + static_for<0, NumBTensor, 1>{}( + [&](auto i) { p_bs_grid_grp(i) = p_bs_grid[i] + bs_batch_offset[i]; }); + + GridwiseGemm::template Run( + p_as_grid_grp, + p_bs_grid_grp, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_ctile_map); + } + else + { + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + + GridwiseGemm::template Run( + p_as_grid + a_batch_offset, + p_bs_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_ctile_map); + } +#else + ignore = p_as_grid; + ignore = p_bs_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = 
e_grid_desc_mblock_mperblock_nblock_nperblock_; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; +#endif +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_conv_fwd_multiple_abd_xdl_cshuffle( + AsPointer p_as_grid, + BsPointer p_bs_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, + const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_, + const Block2ETileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ + + device_grouped_conv_fwd_multiple_abd_xdl_cshuffle< + GridwiseGemm, + AsPointer, // tuples if multi AB, pointers if no + BsPointer, + DsPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + Block2ETileMap, + ComputePtrOffsetOfBatch, + HasMainKBlockLoop, + isMultiA, + isMultiB>(p_as_grid, + p_bs_grid, + p_ds_grid, + *p_e_grid, + a_element_op, + b_element_op, + cde_element_op, + batch_count, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_ctile_map, + compute_ptr_offset_of_batch); +} + +} // namespace + +template +using is_tuple = decltype(std::declval().IsTuple()); + +// +// @brief Device Convolution operation. 
+// +// Supports: +// @li Forward convolution with up to 3 spatial dimentions +// @li Input tensor in GNWC data format +// @li Weight tensor in GKXC data format +// @li Output tensor in GNWK data format +// +// 1D: +// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] +// 2D: +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +// 3D: +// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] +// +template ::value, + Number<0>, + ADataType>()), // ComputeType is InputType by default (first + // in tuple for MultiAB), unpack if tuple was + // passed + LoopScheduler LoopSched = make_default_loop_scheduler()> +struct CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle + : public DeviceGroupedConvFwdMultipleABD +{ + using DeviceOp = CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle; + + static constexpr bool isMultiA = is_detected::value; + static constexpr bool isMultiB = is_detected::value; + + static constexpr index_t NumATensor = GetNumABTensors(); + static constexpr index_t NumBTensor = GetNumABTensors(); + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto conv_to_gemm_transformer = + TransformConvFwdToGemm{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + template + __host__ __device__ static auto + MakeAGridDescriptor_M_K(const ck::Array& a_g_n_c_wis_lengths, + const ck::Array& a_g_n_c_wis_strides, + const ck::Array& b_g_k_c_xs_lengths, + const ck::Array& b_g_k_c_xs_strides, + const ck::Array& e_g_n_k_wos_lengths, + const ck::Array& e_g_n_k_wos_strides, + const ck::Array& conv_filter_strides, + const ck::Array& conv_filter_dilations, + const ck::Array& input_left_pads, + const ck::Array& input_right_pads) + { + const auto in_gemmmraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeADescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + const auto in_gemmm_gemmk_desc = + matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + + return in_gemmm_gemmk_desc; + } + + template + __host__ __device__ static auto + MakeBGridDescriptor_N_K(const ck::Array& b_g_k_c_xs_lengths, + const ck::Array& b_g_k_c_xs_strides) + { + const auto wei_gemmnraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeBDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides); + + const auto wei_gemmn_gemmk_desc = + matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); + + return wei_gemmn_gemmk_desc; + } + + template + __host__ __device__ static auto + MakeEGridDescriptor_M_N(const ck::Array& e_g_n_k_wos_lengths, + const ck::Array& e_g_n_k_wos_strides) + { + const auto out_gemmmraw_gemmnraw_desc = + conv_to_gemm_transformer.template MakeCDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides); + + const auto out_gemmm_gemmn_desc = + matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc); + + return out_gemmm_gemmn_desc; + } + + // Shape of Ds and E must be aligned. Strides can be different. + // Pass e_g_n_k_wos_lengths for logical broadcast. 
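All three Make*GridDescriptor helpers above funnel their raw conv-as-GEMM descriptors through matrix_padder, and the D-tensor helper below reuses MakeEGridDescriptor_M_N per D tensor. The padding rule itself, stated here as an assumption from the GemmSpecialization semantics rather than lifted from CK source, just rounds each GEMM extent up to a whole number of tiles:

// Assumed padding rule: right-pad each GEMM dimension to the next multiple
// of its per-block tile so every work-group operates on a full tile.
long long padded_extent(long long extent, long long per_block)
{
    return ((extent + per_block - 1) / per_block) * per_block;
}
// e.g. padded_extent(1000, 128) == 1024, adding 24 rows of padding. This is
// consistent with the note in helper.hpp that forward convolution only uses
// the MNKPadding specialization.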
+ __host__ __device__ static auto MakeDsGridDescriptor_M_N( + const ck::Array& e_g_n_k_wos_lengths, + const ck::Array, NumDTensor>& ds_g_n_k_wos_strides) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, + ds_g_n_k_wos_strides[i]); + }, + Number{}); + } + + // desc for problem definition + using AGridDesc_M_K = remove_cvref_t( + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; + using BGridDesc_N_K = remove_cvref_t({}, {}))>; + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = remove_cvref_t({}, {}))>; + + // If we are using multiAB and one of the template datatype parameters is not a tuple, convert + // it to it + using GemmADataType = std::conditional_t, ADataType>; + using GemmBDataType = std::conditional_t, BDataType>; + +#define GridwiseGemmTemplateParameters \ + GemmADataType, GemmBDataType, ComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ + EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ + InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \ + KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, \ + ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ + ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ + ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ + ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ + BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ + BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ + BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ + CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ + CDEBlockTransferScalarPerVector_NPerBlock, LoopSched + // Use appropriate gridwise gemm + using GridwiseGemm = + std::conditional_t, + GridwiseGemmMultipleD_xdl_cshuffle>; + + // If ADataTypes or BDataTypes is tuple, user has to pass ck::Array with pointers. + using APointers = + std::conditional_t&, const void*>; + using BPointers = + std::conditional_t&, const void*>; + // Use Tuple for the both cases for GridPointer to initialize it in Argument constructor (not + // in initializer list what is required for single const pointer). 
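The AGridPointer/BGridPointer aliases below, like the isMultiA/isMultiB flags above, hinge on the is_tuple detection alias declared before this struct (decltype(std::declval<T>().IsTuple()), probing the member that CK's tuple types expose). A self-contained toy version of that detection idiom, standard C++17 written for this note rather than copied from CK:

#include <array>
#include <type_traits>
#include <utility>

template <typename T, typename = void>
struct is_tuple_like : std::false_type {};

// Detected when T exposes a member IsTuple().
template <typename T>
struct is_tuple_like<T, std::void_t<decltype(std::declval<T>().IsTuple())>> : std::true_type {};

struct FakeTuple { void IsTuple() {} };

static_assert(is_tuple_like<FakeTuple>::value, "tuple-like types are detected");
static_assert(!is_tuple_like<float>::value, "plain data types are not");

// The same trick then drives API selection: tuple data types take an array of
// pointers, plain data types a single pointer (cf. APointers/BPointers above).
template <typename ADataType, int NumATensor>
using APointersFor = std::conditional_t<is_tuple_like<ADataType>::value,
                                        const std::array<const void*, NumATensor>&,
                                        const void*>;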
+ using AGridPointer = remove_cvref_t< + decltype(GetAGridPointer < isMultiA || isMultiB, GridwiseGemm, ADataType > ())>; + using BGridPointer = remove_cvref_t< + decltype(GetBGridPointer < isMultiA || isMultiB, GridwiseGemm, BDataType > ())>; + + // desc for blockwise copy + using AGridDesc_AK0_M_AK1 = + remove_cvref_t; + using BGridDesc_BK0_N_BK1 = + remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t< + decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + DsGridDesc_M_N{}))>; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = + remove_cvref_t; + + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + // Argument + struct Argument + { + __device__ __host__ Argument( + APointers p_as, + BPointers p_bs, + const ck::Array& p_ds, + void* p_e, + const ck::Array& a_g_n_c_wis_lengths, + const ck::Array& a_g_n_c_wis_strides, + const ck::Array& b_g_k_c_xs_lengths, + const ck::Array& b_g_k_c_xs_strides, + const ck::Array, NumDTensor>& ds_g_n_k_wos_lengths, + const ck::Array, NumDTensor>& ds_g_n_k_wos_strides, + const ck::Array& e_g_n_k_wos_lengths, + const ck::Array& e_g_n_k_wos_strides, + const ck::Array& conv_filter_strides, + const ck::Array& conv_filter_dilations, + const ck::Array& input_left_pads, + const ck::Array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + : p_as_grid_{}, + p_bs_grid_{}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e)}, + num_group_{a_g_n_c_wis_lengths[0]}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + compute_ptr_offset_of_batch_{}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths}, + a_g_n_c_wis_strides_{a_g_n_c_wis_strides}, + b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, + b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, + ds_g_n_k_wos_lengths_{ds_g_n_k_wos_lengths}, + ds_g_n_k_wos_strides_{ds_g_n_k_wos_strides}, + e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths}, + e_g_n_k_wos_strides_{e_g_n_k_wos_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + // A/B/E Batch Stride + if constexpr(isMultiA || isMultiB) + { + static_for<0, NumATensor, 1>{}([&](auto i) { + // Init compute_ptr_offset_of_batch_ for multiple AB + compute_ptr_offset_of_batch_.BatchStrideA_(i) = a_g_n_c_wis_strides[0]; + + // Use GemmADataType/GemmBDataType to iterate over tuple (even if passed data + // type is not tuple) + using DataType = remove_cvref_t>; + // It is possible that one of the 
AB is a pointer and one is a tuple. + // Then also use multiAB but we have to cast single pointer instead of tuple of + // pointer. + if constexpr(isMultiA) + { + // p_as is tuple + p_as_grid_(i) = static_cast(p_as[i.value]); + } + else + { + // if MultiB and not MultiA then p_as is single pointer + p_as_grid_(i) = static_cast(p_as); + } + }); + static_for<0, NumBTensor, 1>{}([&](auto i) { + // Init compute_ptr_offset_of_batch_ for multiple AB + compute_ptr_offset_of_batch_.BatchStrideB_(i) = b_g_k_c_xs_strides[0]; + + using DataType = remove_cvref_t>; + // It is possible that one of the AB is a pointer and one is a tuple. + // Then also use multiAB but we have to cast single pointer instead of tuple of + // pointer. + if constexpr(isMultiB) + { + // p_bs is tuple + p_bs_grid_(i) = static_cast(p_bs[i.value]); + } + else + { + // if MultiA and not MultiB then p_bs is single pointer + p_bs_grid_(i) = static_cast(p_bs); + } + }); + } + else + { + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + + // p_as and p_bs are pointers + p_as_grid_(I0) = static_cast(p_as); + p_bs_grid_(I0) = static_cast(p_bs); + } + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds[i]); + + // D batch stride + compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0]; + + // D desc + ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N( + e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]); + }); + compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; + + // populate desc for Ds/E + if constexpr(isMultiA || isMultiB) + { + const auto as_grid_desc_ak0_m_ak1 = + generate_tuple([&](auto) { return a_grid_desc_m_k_; }, Number{}); + const auto bs_grid_desc_bk0_n_bk1 = + generate_tuple([&](auto) { return b_grid_desc_n_k_; }, Number{}); + + if(GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1, + bs_grid_desc_bk0_n_bk1, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + } + } + else + { + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + } + } + } + + // private: + // pointers (tuple if multi AB, pointer if no) + AGridPointer p_as_grid_; + BGridPointer p_bs_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + index_t num_group_; + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + 
ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // for computing batch offset + ComputePtrOffsetOfStridedBatch + compute_ptr_offset_of_batch_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // for checking IsSupportedArgument() + ck::Array a_g_n_c_wis_lengths_; + ck::Array a_g_n_c_wis_strides_; + ck::Array b_g_k_c_xs_lengths_; + ck::Array b_g_k_c_xs_strides_; + ck::Array, NumDTensor> ds_g_n_k_wos_lengths_; + ck::Array, NumDTensor> ds_g_n_k_wos_strides_; + ck::Array e_g_n_k_wos_lengths_; + ck::Array e_g_n_k_wos_strides_; + ck::Array conv_filter_strides_; + ck::Array conv_filter_dilations_; + ck::Array input_left_pads_; + ck::Array input_right_pads_; + }; + + static __device__ __host__ auto MakeArgument( + APointers p_as, + BPointers p_bs, + const ck::Array& p_ds, + void* p_e, + const ck::Array& a_g_n_c_wis_lengths, + const ck::Array& a_g_n_c_wis_strides, + const ck::Array& b_g_k_c_xs_lengths, + const ck::Array& b_g_k_c_xs_strides, + const ck::Array, NumDTensor>& ds_g_n_k_wos_lengths, + const ck::Array, NumDTensor>& ds_g_n_k_wos_strides, + const ck::Array& e_g_n_k_wos_lengths, + const ck::Array& e_g_n_k_wos_strides, + const ck::Array& conv_filter_strides, + const ck::Array& conv_filter_dilations, + const ck::Array& input_left_pads, + const ck::Array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + { + return Argument{p_as, + p_bs, + p_ds, + p_e, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op}; + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp index c66d2fc516..0294153147 100644 --- a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp +++ b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp @@ -180,6 +180,19 @@ struct MatrixPadder : public GemmPadder +auto grid_desc(MatrixPadder matrix_padder, + CDesc_MRaw_NRaw conv_desc) +{ + auto res = matrix_padder.PadCDescriptor_M_N(conv_desc); + return res; +} // M/N/KPerTileType could be index_t or Number<> template +__host__ __device__ auto mult_accumulate_n(ForwardIterator first, Size count, T init) +{ + for(ForwardIterator x = first; x != first + count; x++) + { + init *= *x; + } + return init; +} + template struct TransformConvFwdToGemm { @@ -607,6 +618,559 @@ struct TransformConvFwdToGemm return out_gemmm_gemmn_desc; } + + // Overloaded functions for hipRTC purposes + template || + is_same_v || + is_same_v), + bool>::type = false> + __host__ __device__ static auto + MakeADescriptor_M_K(const ck::Array& a_g_n_c_wis_lengths, + const ck::Array& a_g_n_c_wis_strides, + const ck::Array& b_g_k_c_xs_lengths, + const ck::Array& /* b_g_k_c_xs_strides */, + const ck::Array& c_g_n_k_wos_lengths, + const ck::Array& /* c_g_n_k_wos_strides */, + const ck::Array& conv_filter_strides, + const ck::Array& conv_filter_dilations, + const ck::Array& 
@@ -607,6 +618,559 @@ struct TransformConvFwdToGemm
         return out_gemmm_gemmn_desc;
     }
+
+    // Overloaded functions for hipRTC purposes
+    template <typename ALayout,
+              typename std::enable_if<NDimSpatial == 1 &&
+                                          (is_same_v<ALayout, tensor_layout::convolution::G_NW_C> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::NWGC> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::GNWC>),
+                                      bool>::type = false>
+    __host__ __device__ static auto
+    MakeADescriptor_M_K(const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
+                        const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
+                        const ck::Array<index_t, NDimSpatial>& conv_filter_strides,
+                        const ck::Array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const ck::Array<index_t, NDimSpatial>& input_left_pads,
+                        const ck::Array<index_t, NDimSpatial>& input_right_pads)
+    {
+        const index_t N = a_g_n_c_wis_lengths[1];
+        const index_t C = a_g_n_c_wis_lengths[2];
+
+        const index_t Wi = a_g_n_c_wis_lengths[3];
+
+        const index_t Wo = c_g_n_k_wos_lengths[3];
+
+        const index_t ConvStrideW = conv_filter_strides[0];
+
+        if constexpr(ConvForwardSpecialization ==
+                     device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
+        {
+            const index_t NHoWo =
+                N * ck::accumulate_n<index_t>(
+                        c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
+
+            // This is different
+            const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];
+            const auto CStride     = I1;
+
+            const auto in_gemmm_gemmk_desc =
+                make_naive_tensor_descriptor(make_tuple(NHoWo, C), make_tuple(WiStride, CStride));
+
+            return in_gemmm_gemmk_desc;
+        }
+        else if constexpr(ConvForwardSpecialization ==
+                          device::ConvolutionForwardSpecialization::Filter1x1Pad0)
+        {
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t WiStride = a_g_n_c_wis_strides[3];
+            const auto CStride     = I1;
+
+            const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride));
+
+            const auto in_n_wo_c_desc = transform_tensor_descriptor(
+                in_n_wi_c_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto in_gemmm_gemmk_desc = transform_tensor_descriptor(
+                in_n_wo_c_desc,
+                make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)),
+                make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return in_gemmm_gemmk_desc;
+        }
+        else
+        {
+            const index_t X             = b_g_k_c_xs_lengths[3];
+            const index_t ConvDilationW = conv_filter_dilations[0];
+            const index_t InLeftPadW    = input_left_pads[0];
+            const index_t InRightPadW   = input_right_pads[0];
+
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t WiStride = a_g_n_c_wis_strides[3];
+            const auto CStride     = I1;
+
+            const auto in_n_wi_c_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride));
+
+            const auto in_n_wip_c_desc = transform_tensor_descriptor(
+                in_n_wi_c_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto in_n_x_wo_c_desc = transform_tensor_descriptor(
+                in_n_wip_c_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+            const auto in_gemmm_gemmk_desc =
+                transform_tensor_descriptor(in_n_x_wo_c_desc,
+                                            make_tuple(make_merge_transform(make_tuple(N, Wo)),
+                                                       make_merge_transform(make_tuple(X, C))),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return in_gemmm_gemmk_desc;
+        }
+    }
+
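The pad/embed/merge chain in the general branch above is only self-consistent when the output length obeys the usual convolution relation Wo = (Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1. A standalone check with hypothetical sizes (none taken from the patch):

    #include <cassert>

    int main()
    {
        // Hypothetical 1D convolution shapes.
        const int Wi = 28, X = 3, ConvStrideW = 2, ConvDilationW = 1;
        const int InLeftPadW = 1, InRightPadW = 1;
        const int Wo =
            (Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1;
        assert(Wo == 14);
        // The (X, Wo) embed with strides (dilation, stride) must stay inside the
        // padded width, which is what makes the transforms above valid.
        assert(ConvDilationW * (X - 1) + ConvStrideW * (Wo - 1) + 1 <=
               Wi + InLeftPadW + InRightPadW);
        return 0;
    }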
+    template <typename ALayout,
+              typename std::enable_if<NDimSpatial == 2 &&
+                                          (is_same_v<ALayout, tensor_layout::convolution::G_NHW_C> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::NHWGC> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::GNHWC>),
+                                      bool>::type = false>
+    __host__ __device__ static auto
+    MakeADescriptor_M_K(const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
+                        const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
+                        const ck::Array<index_t, NDimSpatial>& conv_filter_strides,
+                        const ck::Array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const ck::Array<index_t, NDimSpatial>& input_left_pads,
+                        const ck::Array<index_t, NDimSpatial>& input_right_pads)
+    {
+        const index_t N = a_g_n_c_wis_lengths[1];
+        const index_t C = a_g_n_c_wis_lengths[2];
+
+        const index_t Hi = a_g_n_c_wis_lengths[3];
+        const index_t Wi = a_g_n_c_wis_lengths[4];
+
+        const index_t Ho = c_g_n_k_wos_lengths[3];
+        const index_t Wo = c_g_n_k_wos_lengths[4];
+
+        const index_t ConvStrideH = conv_filter_strides[0];
+        const index_t ConvStrideW = conv_filter_strides[1];
+
+        if constexpr(ConvForwardSpecialization ==
+                     device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
+        {
+            const index_t NHoWo =
+                N * ck::accumulate_n<index_t>(
+                        c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
+
+            // This is different
+            const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];
+            const auto CStride     = I1;
+
+            const auto in_gemmm_gemmk_desc =
+                make_naive_tensor_descriptor(make_tuple(NHoWo, C), make_tuple(WiStride, CStride));
+
+            return in_gemmm_gemmk_desc;
+        }
+        else if constexpr(ConvForwardSpecialization ==
+                          device::ConvolutionForwardSpecialization::Filter1x1Pad0)
+        {
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t HiStride = a_g_n_c_wis_strides[3];
+            const index_t WiStride = a_g_n_c_wis_strides[4];
+            const auto CStride     = I1;
+
+            const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));
+
+            const auto in_n_ho_wo_c_desc = transform_tensor_descriptor(
+                in_n_hi_wi_c_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
+                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_gemmm_gemmk_desc =
+                transform_tensor_descriptor(in_n_ho_wo_c_desc,
+                                            make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
+                                                       make_pass_through_transform(C)),
+                                            make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return in_gemmm_gemmk_desc;
+        }
+        else
+        {
+            const index_t Y = b_g_k_c_xs_lengths[3];
+            const index_t X = b_g_k_c_xs_lengths[4];
+
+            const index_t ConvDilationH = conv_filter_dilations[0];
+            const index_t ConvDilationW = conv_filter_dilations[1];
+
+            const index_t InLeftPadH = input_left_pads[0];
+            const index_t InLeftPadW = input_left_pads[1];
+
+            const index_t InRightPadH = input_right_pads[0];
+            const index_t InRightPadW = input_right_pads[1];
+
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t HiStride = a_g_n_c_wis_strides[3];
+            const index_t WiStride = a_g_n_c_wis_strides[4];
+            const auto CStride     = I1;
+
+            const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));
+
+            const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
+                in_n_hi_wi_c_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor(
+                in_n_hip_wip_c_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+
+            const auto in_gemmm_gemmk_desc =
+                transform_tensor_descriptor(in_n_y_ho_x_wo_c_desc,
+                                            make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
+                                                       make_merge_transform(make_tuple(Y, X, C))),
+                                            make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return in_gemmm_gemmk_desc;
+        }
+    }
+
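Under Filter1x1Stride1Pad0 the chain collapses entirely: a 1x1 window with unit stride and no padding reads exactly one input point per output point, so the input can be viewed directly as an (N * Ho * Wo, C) GEMM matrix, as the specialized branch above does. A standalone sketch of the M extent with hypothetical 2D extents:

    #include <cassert>
    #include <functional>
    #include <numeric>

    int main()
    {
        // Hypothetical extents: N = 4 and Ho = Wo = 28 (not from the patch).
        const int N             = 4;
        const int out_spatial[] = {28, 28};
        // NHoWo = N * product(output spatial lengths), as in the specialization above.
        const int NHoWo =
            N * std::accumulate(out_spatial, out_spatial + 2, 1, std::multiplies<>{});
        assert(NHoWo == 3136);
        return 0;
    }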
+    template <typename ALayout,
+              typename std::enable_if<NDimSpatial == 3 &&
+                                          (is_same_v<ALayout, tensor_layout::convolution::G_NDHW_C> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::NDHWGC> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::GNDHWC>),
+                                      bool>::type = false>
+    __host__ __device__ static auto
+    MakeADescriptor_M_K(const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
+                        const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
+                        const ck::Array<index_t, NDimSpatial>& conv_filter_strides,
+                        const ck::Array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const ck::Array<index_t, NDimSpatial>& input_left_pads,
+                        const ck::Array<index_t, NDimSpatial>& input_right_pads)
+    {
+        const index_t N = a_g_n_c_wis_lengths[1];
+        const index_t C = a_g_n_c_wis_lengths[2];
+
+        const index_t Di = a_g_n_c_wis_lengths[3];
+        const index_t Hi = a_g_n_c_wis_lengths[4];
+        const index_t Wi = a_g_n_c_wis_lengths[5];
+
+        const index_t Do = c_g_n_k_wos_lengths[3];
+        const index_t Ho = c_g_n_k_wos_lengths[4];
+        const index_t Wo = c_g_n_k_wos_lengths[5];
+
+        const index_t ConvStrideD = conv_filter_strides[0];
+        const index_t ConvStrideH = conv_filter_strides[1];
+        const index_t ConvStrideW = conv_filter_strides[2];
+
+        if constexpr(ConvForwardSpecialization ==
+                     device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
+        {
+            const index_t NDoHoWo =
+                N * ck::accumulate_n<index_t>(
+                        c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
+
+            // This is different
+            const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];
+            const auto CStride     = I1;
+
+            const auto in_gemmm_gemmk_desc =
+                make_naive_tensor_descriptor(make_tuple(NDoHoWo, C), make_tuple(WiStride, CStride));
+
+            return in_gemmm_gemmk_desc;
+        }
+        else if constexpr(ConvForwardSpecialization ==
+                          device::ConvolutionForwardSpecialization::Filter1x1Pad0)
+        {
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t DiStride = a_g_n_c_wis_strides[3];
+            const index_t HiStride = a_g_n_c_wis_strides[4];
+            const index_t WiStride = a_g_n_c_wis_strides[5];
+            const auto CStride     = I1;
+
+            const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Di, Hi, Wi, C),
+                make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
+
+            const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor(
+                in_n_di_hi_wi_c_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
+                           make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
+                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
+                           make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+
+            const auto in_gemmm_gemmk_desc = transform_tensor_descriptor(
+                in_n_do_ho_wo_c_desc,
+                make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return in_gemmm_gemmk_desc;
+        }
+        else
+        {
+            const index_t Z = b_g_k_c_xs_lengths[3];
+            const index_t Y = b_g_k_c_xs_lengths[4];
+            const index_t X = b_g_k_c_xs_lengths[5];
+
+            const index_t ConvDilationD = conv_filter_dilations[0];
+            const index_t ConvDilationH = conv_filter_dilations[1];
+            const index_t ConvDilationW = conv_filter_dilations[2];
+
+            const index_t InLeftPadD = input_left_pads[0];
+            const index_t InLeftPadH = input_left_pads[1];
+            const index_t InLeftPadW = input_left_pads[2];
+
+            const index_t InRightPadD = input_right_pads[0];
+            const index_t InRightPadH = input_right_pads[1];
+            const index_t InRightPadW = input_right_pads[2];
+
+            // This is different
+            const index_t NStride  = a_g_n_c_wis_strides[1];
+            const index_t DiStride = a_g_n_c_wis_strides[3];
+            const index_t HiStride = a_g_n_c_wis_strides[4];
+            const index_t WiStride = a_g_n_c_wis_strides[5];
+            const auto CStride     = I1;
+
+            const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor(
+                make_tuple(N, Di, Hi, Wi, C),
+                make_tuple(NStride, DiStride, HiStride, WiStride, CStride));
+
+            const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
+                in_n_di_hi_wi_c_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Di, InLeftPadD, InRightPadD),
+                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+
+            const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor(
+                in_n_hip_wip_c_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
+                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(C)),
+                make_tuple(
+                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+                make_tuple(Sequence<0>{},
+                           Sequence<1, 2>{},
+                           Sequence<3, 4>{},
+                           Sequence<5, 6>{},
+                           Sequence<7>{}));
+
+            const auto in_gemmm_gemmk_desc = transform_tensor_descriptor(
+                in_n_z_do_y_ho_x_wo_c_desc,
+                make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
+                           make_merge_transform(make_tuple(Z, Y, X, C))),
+                make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return in_gemmm_gemmk_desc;
+        }
+    }
+
+    template <typename BLayout,
+              typename std::enable_if<is_same_v<BLayout, tensor_layout::convolution::GKXC> ||
+                                          is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
+                                          is_same_v<BLayout, tensor_layout::convolution::GKZYXC>,
+                                      bool>::type = false>
+    __host__ __device__ static auto
+    MakeBDescriptor_N_K(const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */)
+    {
+        const index_t K = b_g_k_c_xs_lengths[1];
+        const index_t C = b_g_k_c_xs_lengths[2];
+
+        const index_t YX = mult_accumulate_n(b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1);
+
+        const auto wei_gemmn_gemmk_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(K, YX * C));
+
+        return wei_gemmn_gemmk_desc;
+    }
+
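The overload above covers the packed K-major weight layouts, where make_naive_tensor_descriptor_packed implies row-major strides for the (K, YX * C) view. A standalone sketch of those implied strides with hypothetical extents:

    #include <cassert>

    int main()
    {
        // Hypothetical weight extents: K filters over a 3x3 window of C channels.
        const int K = 64, C = 16, Y = 3, X = 3;
        const int YX = Y * X;
        // Packed (K, YX * C) means the row stride is YX * C and the column stride is 1.
        const int row_stride = YX * C;
        const int col_stride = 1;
        assert(row_stride == 144 && col_stride == 1);
        (void)K;
        return 0;
    }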
+    template <
+        typename BLayout,
+        typename std::enable_if<is_same_v<BLayout, tensor_layout::convolution::G_K_X_C> ||
+                                    is_same_v<BLayout, tensor_layout::convolution::G_K_YX_C> ||
+                                    is_same_v<BLayout, tensor_layout::convolution::G_K_ZYX_C> ||
+                                    is_same_v<BLayout, tensor_layout::convolution::KXGC> ||
+                                    is_same_v<BLayout, tensor_layout::convolution::KYXGC> ||
+                                    is_same_v<BLayout, tensor_layout::convolution::KZYXGC>,
+                                bool>::type = false>
+    __host__ __device__ static auto
+    MakeBDescriptor_N_K(const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides)
+    {
+        const index_t K = b_g_k_c_xs_lengths[1];
+        const index_t C = b_g_k_c_xs_lengths[2];
+
+        const index_t YX = mult_accumulate_n(b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1);
+
+        const index_t KStride = b_g_k_c_xs_strides[1];
+        const index_t XStride = b_g_k_c_xs_strides[2 + NDimSpatial];
+        const auto CStride    = I1;
+
+        const auto wei_k_yx_c_desc = make_naive_tensor_descriptor(
+            make_tuple(K, YX, C), make_tuple(KStride, XStride, CStride));
+
+        const auto wei_gemmn_gemmk_desc = transform_tensor_descriptor(
+            wei_k_yx_c_desc,
+            make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(YX, C))),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        return wei_gemmn_gemmk_desc;
+    }
+
+    template <typename CLayout,
+              typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::GNWK> ||
+                                          is_same_v<CLayout, tensor_layout::convolution::GNHWK> ||
+                                          is_same_v<CLayout, tensor_layout::convolution::GNDHWK>,
+                                      bool>::type = false>
+    __host__ __device__ static auto
+    MakeCDescriptor_M_N(const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
+    {
+        const index_t N = c_g_n_k_wos_lengths[1];
+        const index_t K = c_g_n_k_wos_lengths[2];
+
+        const index_t NHoWo =
+            N * mult_accumulate_n(c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1);
+
+        const auto out_gemmm_gemmn_desc = make_naive_tensor_descriptor_packed(make_tuple(NHoWo, K));
+
+        return out_gemmm_gemmn_desc;
+    }
+
+    template <
+        typename CLayout,
+        typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_NW_K> ||
+                                    is_same_v<CLayout, tensor_layout::convolution::G_NHW_K> ||
+                                    is_same_v<CLayout, tensor_layout::convolution::G_NDHW_K> ||
+                                    is_same_v<CLayout, tensor_layout::convolution::NWGK> ||
+                                    is_same_v<CLayout, tensor_layout::convolution::NHWGK> ||
+                                    is_same_v<CLayout, tensor_layout::convolution::NDHWGK>,
+                                bool>::type = false>
+    __host__ __device__ static auto
+    MakeCDescriptor_M_N(const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
+    {
+        const index_t N = c_g_n_k_wos_lengths[1];
+        const index_t K = c_g_n_k_wos_lengths[2];
+
+        const auto KStride     = I1;
+        const index_t WoStride = c_g_n_k_wos_strides[NDimSpatial + 2];
+
+        const index_t NHoWo =
+            N * mult_accumulate_n(c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1);
+
+        const auto out_gemmm_gemmn_desc =
+            make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(WoStride, KStride));
+
+        return out_gemmm_gemmn_desc;
+    }
+
+    // for output bias
+    template <typename CLayout,
+              typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_K>,
+                                      bool>::type = false>
+    __host__ __device__ static auto
+    MakeCDescriptor_M_N(const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const ck::Array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
+    {
+        const index_t N       = c_g_n_k_wos_lengths[1];
+        const index_t K       = c_g_n_k_wos_lengths[2];
+        const index_t KStride = c_g_n_k_wos_strides[2];
+
+        const index_t NHoWo =
+            N * mult_accumulate_n(c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1);
+
+        const auto out_gemmm_gemmn_desc =
+            make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride));
+
+        return out_gemmm_gemmn_desc;
+    }
+};
+
+// wrapper class to call member functions on the TransformConvFwdToGemm struct at runtime
+// TODO: figure out a way to properly pass in layout as an argument
+struct TransformConv
+{
+    TransformConv() {}
+
+    template <index_t NDimSpatial,
+              device::ConvolutionForwardSpecialization ConvForwardSpecialization>
+    auto
+    transform_func(ck::Array<index_t, NDimSpatial + 3> out_lengths,
+                   ck::Array<index_t, NDimSpatial + 3> out_strides,
+                   TransformConvFwdToGemm<NDimSpatial, ConvForwardSpecialization> conv_fwd_to_gemm)
+    {
+        // Each branch returns a different descriptor type, so the dispatch on the
+        // compile-time NDimSpatial must be if constexpr.
+        if constexpr(NDimSpatial == 2)
+        {
+            return conv_fwd_to_gemm
+                .template MakeCDescriptor_M_N<tensor_layout::convolution::NHWGK>(out_lengths,
+                                                                                 out_strides);
+        }
+        else if constexpr(NDimSpatial == 3)
+        {
+            return conv_fwd_to_gemm
+                .template MakeCDescriptor_M_N<tensor_layout::convolution::NDHWGK>(out_lengths,
+                                                                                  out_strides);
+        }
+        else if constexpr(NDimSpatial == 1)
+        {
+            return conv_fwd_to_gemm.template MakeCDescriptor_M_N<tensor_layout::convolution::NWGK>(
+                out_lengths, out_strides);
+        }
+    }
+};
 } // namespace tensor_operation
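Because transform_func branches on the compile-time NDimSpatial and each branch returns a distinct descriptor type, the dispatch has to be if constexpr: a plain runtime if would force all three returns to share one type and would not compile. A minimal standalone illustration of the pattern, with hypothetical stand-in types:

    #include <type_traits>

    struct Desc1D { int m, k; };
    struct Desc2D { int m, k; };

    // The discarded branch is never instantiated, so return types may differ.
    template <int NDimSpatial>
    auto make_desc()
    {
        if constexpr(NDimSpatial == 1)
        {
            return Desc1D{};
        }
        else
        {
            return Desc2D{};
        }
    }

    static_assert(std::is_same_v<decltype(make_desc<1>()), Desc1D>);
    static_assert(std::is_same_v<decltype(make_desc<2>()), Desc2D>);

    int main() { return 0; }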
diff --git a/include/ck/utility/array.hpp b/include/ck/utility/array.hpp
index f63ce5e5a0..5366c56a9d 100644
--- a/include/ck/utility/array.hpp
+++ b/include/ck/utility/array.hpp
@@ -36,6 +36,8 @@ struct Array
         return *this;
     }
 
+    __host__ __device__ constexpr const TData* begin() const { return &mData[0]; }
+    __host__ __device__ constexpr const TData* end() const { return &mData[NSize]; }
 };
 
 // empty Array
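The new const begin()/end() are what let the hipRTC descriptor builders above walk a ck::Array directly, e.g. ck::accumulate_n(lengths.begin() + 3, ...) and mult_accumulate_n(lengths.begin() + 3, ...). A host-only sketch of the same computation, with std::array standing in for ck::Array and hypothetical extents:

    #include <array>
    #include <cassert>
    #include <functional>
    #include <numeric>

    int main()
    {
        // Stand-in for ck::Array<index_t, NDimSpatial + 3> holding G, N, K, Ho, Wo.
        const std::array<int, 5> out_lengths = {1, 4, 64, 28, 28};
        // Product of the spatial lengths, as in the NHoWo computations above.
        const int HoWo =
            std::accumulate(out_lengths.begin() + 3, out_lengths.end(), 1, std::multiplies<>{});
        assert(HoWo == 784);
        return 0;
    }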