From dd6fd8bb62309cf19a7075bb25ff0d8633dcb581 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 25 May 2023 15:54:15 -0500 Subject: [PATCH] Move functions to cpp file --- library/src/jit_library/CMakeLists.txt | 20 +- .../jit_library/include/ck/host/common.hpp | 25 ++ .../ck/host/device_gemm_multiple_d.hpp | 91 ++++++++ .../include/device_gemm_multiple_d.hpp | 217 ------------------ library/src/jit_library/src/common.cpp | 19 ++ .../src/device_gemm_multiple_d.cpp | 142 ++++++++++++ 6 files changed, 284 insertions(+), 230 deletions(-) create mode 100644 library/src/jit_library/include/ck/host/common.hpp create mode 100644 library/src/jit_library/include/ck/host/device_gemm_multiple_d.hpp delete mode 100644 library/src/jit_library/include/device_gemm_multiple_d.hpp create mode 100644 library/src/jit_library/src/common.cpp create mode 100644 library/src/jit_library/src/device_gemm_multiple_d.cpp diff --git a/library/src/jit_library/CMakeLists.txt b/library/src/jit_library/CMakeLists.txt index 539b50aa8f..ee9163f708 100644 --- a/library/src/jit_library/CMakeLists.txt +++ b/library/src/jit_library/CMakeLists.txt @@ -10,17 +10,18 @@ execute_process( ) -set(JIT_LIB_SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/include/device_gemm_multiple_d.hpp +add_library(jit_library STATIC + src/device_gemm_multiple_d.cpp + src/common.cpp ) - -add_library(jit_library STATIC ${JIT_LIB_SOURCE}) add_library(composable_kernel::jit_library ALIAS jit_library) set_target_properties(jit_library PROPERTIES LINKER_LANGUAGE CXX) -target_include_directories(jit_library PUBLIC +target_include_directories(jit_library PRIVATE $ + $ + $ ) target_link_libraries(jit_library PRIVATE ck_headers) @@ -30,14 +31,7 @@ rocm_install( EXPORT jit_libraryTargets ) -set(INCLUDE_DIRS - ${PROJECT_SOURCE_DIR}/include/ck/ - ${PROJECT_SOURCE_DIR}/library/src/jit_library/include - ${PROJECT_SOURCE_DIR}/library/src/jit_library/solution_instances - ${CMAKE_CURRENT_BINARY_DIR}/embed/ck_headers/include -) - -rocm_install(DIRECTORY ${INCLUDE_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck) +rocm_install(DIRECTORY include DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) rocm_install( EXPORT jit_libraryTargets diff --git a/library/src/jit_library/include/ck/host/common.hpp b/library/src/jit_library/include/ck/host/common.hpp new file mode 100644 index 0000000000..ebddbee0fa --- /dev/null +++ b/library/src/jit_library/include/ck/host/common.hpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +namespace ck { +namespace host { + +enum class DataType { + Half, + Float, + Int8, + Int32 +}; + +std::unordered_map> GetHeaders(); + +std::size_t integer_divide_ceil(std::size_t x, std::size_t y); + +} // namespace host +} // namespace ck diff --git a/library/src/jit_library/include/ck/host/device_gemm_multiple_d.hpp b/library/src/jit_library/include/ck/host/device_gemm_multiple_d.hpp new file mode 100644 index 0000000000..241c5b3e96 --- /dev/null +++ b/library/src/jit_library/include/ck/host/device_gemm_multiple_d.hpp @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace ck { +namespace host { +namespace device_gemm_multiple_d { + + +struct Solution +{ + std::string template_str; + std::size_t block_size; + std::size_t grid_size; +}; + +struct Problem +{ + std::size_t M = 0; + std::size_t N = 0; + std::size_t K = 0; + bool TransA = false; + bool TransB = false; + bool TransE = false; + std::vector DsLayout = {}; + std::string ADataType = "ck::half_t"; + std::string BDataType = "ck::half_t"; + std::string EDataType = "ck::half_t"; + std::vector DsDataType = {}; + std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough"; + std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough"; + std::string CDEElementOp = "ck::Tuple<>"; + + static const std::size_t ds_layout_idx = 3; + static const std::size_t ds_data_type_idx = 9; + static const std::size_t e_data_type_idx = 10; + static const std::size_t a_elementwise_op_idx = 11; + static const std::size_t b_elementwise_op_idx = 12; + static const std::size_t ds_elementwise_op_idx = 13; + static const std::size_t gemm_spec_idx = 14; + static const std::size_t block_size_idx = 16; + static const std::size_t m_per_block_idx = 17; + static const std::size_t n_per_block_idx = 18; + static const std::size_t k_per_block_idx = 19; + +private: + std::vector GetInstances(const std::string& arch) const; + + std::string MakeLayoutTuple(const std::vector& layouts) const; + + std::string MakeTypeTuple(const std::vector& types) const; + + Solution MakeSolution(std::size_t idx, const std::string& arch) const +; + +public: + auto GetHeaders() const + { + return ck_headers(); + } + + std::string GetIncludeHeader() const + { + return instance::gemm_add_add_fastgelu_instances{}.get_include_header(); + } + + std::vector GetSolutions(const std::string& arch) const + { + std::vector solutions; + const auto num_instances = GetInstances(arch).size(); + for (auto i = 0; i < num_instances; ++i) + { + solutions.push_back(MakeSolution(i, arch)); + } + + return solutions; + } +}; + +} // namespace device_gemm_multiple_d +} // namespace host +} // namespace ck diff --git a/library/src/jit_library/include/device_gemm_multiple_d.hpp b/library/src/jit_library/include/device_gemm_multiple_d.hpp deleted file mode 100644 index 821821f1f5..0000000000 --- a/library/src/jit_library/include/device_gemm_multiple_d.hpp +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "ck/solution_instances/gemm_add_add_fastgelu_instances.hpp" -#include "ck/ck.hpp" -#include "ck/utility/math.hpp" -#include "ck_headers.hpp" - - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_multiple_d { - - -struct Solution -{ - std::string template_str; - index_t block_size; - index_t grid_size; -}; - -std::string GetGemmSpec(const index_t m, - const index_t n, - const index_t k, - const index_t m_per_block, - const index_t n_per_block, - const index_t k_per_block) -{ - std::string spec = ""; - if(math::integer_divide_ceil(m, m_per_block) * m_per_block - m != 0) - spec += "M"; - if(math::integer_divide_ceil(n, n_per_block) * n_per_block - n != 0) - spec += "N"; - if(math::integer_divide_ceil(k, k_per_block) * k_per_block - k != 0) - spec += "K"; - if(spec == "") - return "ck::tensor_operation::device::GemmSpecialization::Default"; - - return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding"; -} - -index_t GetGridSize(const index_t m, - const index_t n, - const index_t m_per_block, - const index_t n_per_block) -{ - return math::integer_divide_ceil(m, m_per_block) * - math::integer_divide_ceil(n, n_per_block); -} - -const std::unordered_set& get_xdlop_archs() -{ - static std::unordered_set supported_archs{"gfx90a"}; - return supported_archs; -} - -struct Problem -{ - index_t M = 0; - index_t N = 0; - index_t K = 0; - bool TransA = false; - bool TransB = false; - bool TransE = false; - std::vector DsLayout = {}; - std::string ADataType = "ck::half_t"; - std::string BDataType = "ck::half_t"; - std::string EDataType = "ck::half_t"; - std::vector DsDataType = {}; - std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough"; - std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough"; - std::string CDEElementOp = "ck::Tuple<>"; - - static const index_t ds_layout_idx = 3; - static const index_t ds_data_type_idx = 9; - static const index_t e_data_type_idx = 10; - static const index_t a_elementwise_op_idx = 11; - static const index_t b_elementwise_op_idx = 12; - static const index_t ds_elementwise_op_idx = 13; - static const index_t gemm_spec_idx = 14; - static const index_t block_size_idx = 16; - static const index_t m_per_block_idx = 17; - static const index_t n_per_block_idx = 18; - static const index_t k_per_block_idx = 19; - -private: - auto GetInstances(const std::string& arch) const - { - std::vector instances; - const bool quantize = ADataType == "int8_t" and BDataType == "int8_t"; - if (get_xdlop_archs().find(arch) != get_xdlop_archs().end()) - { - instance::gemm_add_add_fastgelu_instances all_instances{}; - if(TransA and TransB) - instances = all_instances.get_col_col_instances(quantize); - else if(TransA and not TransB) - instances = all_instances.get_col_row_instances(quantize); - else if(not TransA and not TransB) - instances = all_instances.get_row_row_instances(quantize); - else - instances = all_instances.get_row_col_instances(quantize); - } - return instances; - } - - auto MakeLayoutTuple(const std::vector& layouts) const - { - std::string layout_tuple = "ck::Tuple<"; - auto it = layouts.begin(); - while(it != layouts.end()) - { - layout_tuple += *it ? "ck::tensor_layout::gemm::ColumnMajor" : "ck::tensor_layout::gemm::RowMajor"; - it = std::next(it); - if (it != layouts.end()) - layout_tuple += ", "; - } - - return layout_tuple + ">"; - } - - auto MakeTypeTuple(const std::vector& types) const - { - std::string type_tuple = "ck::Tuple<"; - auto it = types.begin(); - while(it != types.end()) - { - type_tuple += *it; - it = std::next(it); - if (it != types.end()) - type_tuple += ", "; - } - return type_tuple + ">"; - } - - auto MakeSolution(index_t idx, const std::string& arch) const - { - auto template_str = GetInstances(arch).at(idx); - std::istringstream iss(template_str); - std::vector params(std::istream_iterator{iss}, - std::istream_iterator()); - - if (ADataType == "int8_t" and BDataType == "int8_t") - { - // Change CBlockTransfer ScalarPerVector if Ds contains other types - if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "ck::half_t"; })) - { - params[params.size() - 3] = "8"; - } - if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "float"; })) - { - params[params.size() - 3] = "4"; - } - } - - params[a_elementwise_op_idx] = AElementOp; - params[b_elementwise_op_idx] = BElementOp; - params[ds_layout_idx] = MakeLayoutTuple(DsLayout); - params[ds_data_type_idx] = MakeTypeTuple(DsDataType); - params[ds_elementwise_op_idx] = CDEElementOp; - params[e_data_type_idx] = EDataType; - auto block_size_str = params[block_size_idx]; - auto m_per_block_str = params[m_per_block_idx]; - auto n_per_block_str = params[n_per_block_idx]; - auto k_per_block_str = params[k_per_block_idx]; - const auto block_size = std::stoi(block_size_str); - const auto m_per_block = std::stoi(m_per_block_str); - const auto n_per_block = std::stoi(n_per_block_str); - const auto k_per_block = std::stoi(k_per_block_str); - const auto grid_size = GetGridSize(M, N, m_per_block, n_per_block); - params[gemm_spec_idx] = GetGemmSpec(M, N, K, m_per_block, n_per_block, k_per_block); - - std::string str = std::accumulate(params.begin() + 1, params.end(), std::string{}, - [](const std::string& a, const std::string& b) { - return a.empty() ? b : a + ", " + b; - }); - str = params.front() + "< " + str + ">"; - - return Solution{str, block_size, grid_size}; - } - -public: - auto GetHeaders() const - { - return ck_headers(); - } - - auto GetIncludeHeader() const - { - return instance::gemm_add_add_fastgelu_instances{}.get_include_header(); - } - - auto GetSolutions(const std::string& arch) const - { - std::vector solutions; - const auto num_instances = GetInstances(arch).size(); - for (auto i = 0; i < num_instances; ++i) - { - solutions.push_back(MakeSolution(i, arch)); - } - - return solutions; - } -}; - -} // namespace device_gemm_multiple_d -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/jit_library/src/common.cpp b/library/src/jit_library/src/common.cpp new file mode 100644 index 0000000000..92ffa5876b --- /dev/null +++ b/library/src/jit_library/src/common.cpp @@ -0,0 +1,19 @@ + +#include "ck/host/common.hpp" +#include "ck_headers.hpp" + +namespace ck { +namespace host { + +std::unordered_map> GetHeaders() +{ + return ck_headers(); +} + +std::size_t integer_divide_ceil(std::size_t x, std::size_t y) +{ + return (x + y - std::size_t{1}) / y; +} + +} // namespace host +} // namespace ck diff --git a/library/src/jit_library/src/device_gemm_multiple_d.cpp b/library/src/jit_library/src/device_gemm_multiple_d.cpp new file mode 100644 index 0000000000..e4f8c07f3a --- /dev/null +++ b/library/src/jit_library/src/device_gemm_multiple_d.cpp @@ -0,0 +1,142 @@ +#include "ck/host/device_gemm_multiple_d.hpp" +#include "ck/host/common.hpp" +#include "ck/solution_instances/gemm_add_add_fastgelu_instances.hpp" +#include + +namespace ck { +namespace host { +namespace device_gemm_multiple_d { + +std::string GetGemmSpec(const std::size_t m, + const std::size_t n, + const std::size_t k, + const std::size_t m_per_block, + const std::size_t n_per_block, + const std::size_t k_per_block) +{ + std::string spec = ""; + if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0) + spec += "M"; + if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0) + spec += "N"; + if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0) + spec += "K"; + if(spec == "") + return "ck::tensor_operation::device::GemmSpecialization::Default"; + + return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding"; +} + +std::size_t GetGridSize(const std::size_t m, + const std::size_t n, + const std::size_t m_per_block, + const std::size_t n_per_block) +{ + return integer_divide_ceil(m, m_per_block) * + integer_divide_ceil(n, n_per_block); +} + +const std::unordered_set& get_xdlop_archs() +{ + static std::unordered_set supported_archs{"gfx90a"}; + return supported_archs; +} + +std::vector Problem::GetInstances(const std::string& arch) const +{ + std::vector instances; + const bool quantize = ADataType == "int8_t" and BDataType == "int8_t"; + if (get_xdlop_archs().find(arch) != get_xdlop_archs().end()) + { + instance::gemm_add_add_fastgelu_instances all_instances{}; + if(TransA and TransB) + instances = all_instances.get_col_col_instances(quantize); + else if(TransA and not TransB) + instances = all_instances.get_col_row_instances(quantize); + else if(not TransA and not TransB) + instances = all_instances.get_row_row_instances(quantize); + else + instances = all_instances.get_row_col_instances(quantize); + } + return instances; +} + +std::string Problem::MakeLayoutTuple(const std::vector& layouts) const +{ + std::string layout_tuple = "ck::Tuple<"; + auto it = layouts.begin(); + while(it != layouts.end()) + { + layout_tuple += *it ? "ck::tensor_layout::gemm::ColumnMajor" : "ck::tensor_layout::gemm::RowMajor"; + it = std::next(it); + if (it != layouts.end()) + layout_tuple += ", "; + } + + return layout_tuple + ">"; +} + +std::string Problem::MakeTypeTuple(const std::vector& types) const +{ + std::string type_tuple = "ck::Tuple<"; + auto it = types.begin(); + while(it != types.end()) + { + type_tuple += *it; + it = std::next(it); + if (it != types.end()) + type_tuple += ", "; + } + return type_tuple + ">"; +} + +Solution Problem::MakeSolution(std::size_t idx, const std::string& arch) const +{ + auto template_str = GetInstances(arch).at(idx); + std::istringstream iss(template_str); + std::vector params(std::istream_iterator{iss}, + std::istream_iterator()); + + if (ADataType == "int8_t" and BDataType == "int8_t") + { + // Change CBlockTransfer ScalarPerVector if Ds contains other types + if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "ck::half_t"; })) + { + params[params.size() - 3] = "8"; + } + if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "float"; })) + { + params[params.size() - 3] = "4"; + } + } + + params[a_elementwise_op_idx] = AElementOp; + params[b_elementwise_op_idx] = BElementOp; + params[ds_layout_idx] = MakeLayoutTuple(DsLayout); + params[ds_data_type_idx] = MakeTypeTuple(DsDataType); + params[ds_elementwise_op_idx] = CDEElementOp; + params[e_data_type_idx] = EDataType; + auto block_size_str = params[block_size_idx]; + auto m_per_block_str = params[m_per_block_idx]; + auto n_per_block_str = params[n_per_block_idx]; + auto k_per_block_str = params[k_per_block_idx]; + const auto block_size = std::stoi(block_size_str); + const auto m_per_block = std::stoi(m_per_block_str); + const auto n_per_block = std::stoi(n_per_block_str); + const auto k_per_block = std::stoi(k_per_block_str); + const auto grid_size = GetGridSize(M, N, m_per_block, n_per_block); + params[gemm_spec_idx] = GetGemmSpec(M, N, K, m_per_block, n_per_block, k_per_block); + + std::string str = std::accumulate(params.begin() + 1, params.end(), std::string{}, + [](const std::string& a, const std::string& b) { + return a.empty() ? b : a + ", " + b; + }); + str = params.front() + "< " + str + ">"; + + return Solution{str, block_size, grid_size}; +} + + +} // namespace device_gemm_multiple_d +} // namespace host +} // namespace ck