mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 03:37:38 +00:00
Move functions to cpp file
This commit is contained in:
@@ -10,17 +10,18 @@ execute_process(
|
||||
)
|
||||
|
||||
|
||||
set(JIT_LIB_SOURCE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/include/device_gemm_multiple_d.hpp
|
||||
add_library(jit_library STATIC
|
||||
src/device_gemm_multiple_d.cpp
|
||||
src/common.cpp
|
||||
)
|
||||
|
||||
add_library(jit_library STATIC ${JIT_LIB_SOURCE})
|
||||
add_library(composable_kernel::jit_library ALIAS jit_library)
|
||||
|
||||
set_target_properties(jit_library PROPERTIES LINKER_LANGUAGE CXX)
|
||||
|
||||
target_include_directories(jit_library PUBLIC
|
||||
target_include_directories(jit_library PRIVATE
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
||||
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/library/src/jit_library/solution_instances>
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/embed/ck_headers/include>
|
||||
)
|
||||
|
||||
target_link_libraries(jit_library PRIVATE ck_headers)
|
||||
@@ -30,14 +31,7 @@ rocm_install(
|
||||
EXPORT jit_libraryTargets
|
||||
)
|
||||
|
||||
set(INCLUDE_DIRS
|
||||
${PROJECT_SOURCE_DIR}/include/ck/
|
||||
${PROJECT_SOURCE_DIR}/library/src/jit_library/include
|
||||
${PROJECT_SOURCE_DIR}/library/src/jit_library/solution_instances
|
||||
${CMAKE_CURRENT_BINARY_DIR}/embed/ck_headers/include
|
||||
)
|
||||
|
||||
rocm_install(DIRECTORY ${INCLUDE_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
|
||||
rocm_install(DIRECTORY include DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
rocm_install(
|
||||
EXPORT jit_libraryTargets
|
||||
|
||||
25
library/src/jit_library/include/ck/host/common.hpp
Normal file
25
library/src/jit_library/include/ck/host/common.hpp
Normal file
@@ -0,0 +1,25 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
enum class DataType {
|
||||
Half,
|
||||
Float,
|
||||
Int8,
|
||||
Int32
|
||||
};
|
||||
|
||||
std::unordered_map<std::string, std::pair<const char*,const char*>> GetHeaders();
|
||||
|
||||
std::size_t integer_divide_ceil(std::size_t x, std::size_t y);
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,91 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
|
||||
struct Solution
|
||||
{
|
||||
std::string template_str;
|
||||
std::size_t block_size;
|
||||
std::size_t grid_size;
|
||||
};
|
||||
|
||||
struct Problem
|
||||
{
|
||||
std::size_t M = 0;
|
||||
std::size_t N = 0;
|
||||
std::size_t K = 0;
|
||||
bool TransA = false;
|
||||
bool TransB = false;
|
||||
bool TransE = false;
|
||||
std::vector<bool> DsLayout = {};
|
||||
std::string ADataType = "ck::half_t";
|
||||
std::string BDataType = "ck::half_t";
|
||||
std::string EDataType = "ck::half_t";
|
||||
std::vector<std::string> DsDataType = {};
|
||||
std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string CDEElementOp = "ck::Tuple<>";
|
||||
|
||||
static const std::size_t ds_layout_idx = 3;
|
||||
static const std::size_t ds_data_type_idx = 9;
|
||||
static const std::size_t e_data_type_idx = 10;
|
||||
static const std::size_t a_elementwise_op_idx = 11;
|
||||
static const std::size_t b_elementwise_op_idx = 12;
|
||||
static const std::size_t ds_elementwise_op_idx = 13;
|
||||
static const std::size_t gemm_spec_idx = 14;
|
||||
static const std::size_t block_size_idx = 16;
|
||||
static const std::size_t m_per_block_idx = 17;
|
||||
static const std::size_t n_per_block_idx = 18;
|
||||
static const std::size_t k_per_block_idx = 19;
|
||||
|
||||
private:
|
||||
std::vector<std::string> GetInstances(const std::string& arch) const;
|
||||
|
||||
std::string MakeLayoutTuple(const std::vector<bool>& layouts) const;
|
||||
|
||||
std::string MakeTypeTuple(const std::vector<std::string>& types) const;
|
||||
|
||||
Solution MakeSolution(std::size_t idx, const std::string& arch) const
|
||||
;
|
||||
|
||||
public:
|
||||
auto GetHeaders() const
|
||||
{
|
||||
return ck_headers();
|
||||
}
|
||||
|
||||
std::string GetIncludeHeader() const
|
||||
{
|
||||
return instance::gemm_add_add_fastgelu_instances{}.get_include_header();
|
||||
}
|
||||
|
||||
std::vector<Solution> GetSolutions(const std::string& arch) const
|
||||
{
|
||||
std::vector<Solution> solutions;
|
||||
const auto num_instances = GetInstances(arch).size();
|
||||
for (auto i = 0; i < num_instances; ++i)
|
||||
{
|
||||
solutions.push_back(MakeSolution(i, arch));
|
||||
}
|
||||
|
||||
return solutions;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
@@ -1,217 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include "ck/solution_instances/gemm_add_add_fastgelu_instances.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/math.hpp"
|
||||
#include "ck_headers.hpp"
|
||||
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
|
||||
struct Solution
|
||||
{
|
||||
std::string template_str;
|
||||
index_t block_size;
|
||||
index_t grid_size;
|
||||
};
|
||||
|
||||
std::string GetGemmSpec(const index_t m,
|
||||
const index_t n,
|
||||
const index_t k,
|
||||
const index_t m_per_block,
|
||||
const index_t n_per_block,
|
||||
const index_t k_per_block)
|
||||
{
|
||||
std::string spec = "";
|
||||
if(math::integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
|
||||
spec += "M";
|
||||
if(math::integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
|
||||
spec += "N";
|
||||
if(math::integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
|
||||
spec += "K";
|
||||
if(spec == "")
|
||||
return "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
|
||||
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
|
||||
}
|
||||
|
||||
index_t GetGridSize(const index_t m,
|
||||
const index_t n,
|
||||
const index_t m_per_block,
|
||||
const index_t n_per_block)
|
||||
{
|
||||
return math::integer_divide_ceil(m, m_per_block) *
|
||||
math::integer_divide_ceil(n, n_per_block);
|
||||
}
|
||||
|
||||
const std::unordered_set<std::string>& get_xdlop_archs()
|
||||
{
|
||||
static std::unordered_set<std::string> supported_archs{"gfx90a"};
|
||||
return supported_archs;
|
||||
}
|
||||
|
||||
struct Problem
|
||||
{
|
||||
index_t M = 0;
|
||||
index_t N = 0;
|
||||
index_t K = 0;
|
||||
bool TransA = false;
|
||||
bool TransB = false;
|
||||
bool TransE = false;
|
||||
std::vector<bool> DsLayout = {};
|
||||
std::string ADataType = "ck::half_t";
|
||||
std::string BDataType = "ck::half_t";
|
||||
std::string EDataType = "ck::half_t";
|
||||
std::vector<std::string> DsDataType = {};
|
||||
std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string CDEElementOp = "ck::Tuple<>";
|
||||
|
||||
static const index_t ds_layout_idx = 3;
|
||||
static const index_t ds_data_type_idx = 9;
|
||||
static const index_t e_data_type_idx = 10;
|
||||
static const index_t a_elementwise_op_idx = 11;
|
||||
static const index_t b_elementwise_op_idx = 12;
|
||||
static const index_t ds_elementwise_op_idx = 13;
|
||||
static const index_t gemm_spec_idx = 14;
|
||||
static const index_t block_size_idx = 16;
|
||||
static const index_t m_per_block_idx = 17;
|
||||
static const index_t n_per_block_idx = 18;
|
||||
static const index_t k_per_block_idx = 19;
|
||||
|
||||
private:
|
||||
auto GetInstances(const std::string& arch) const
|
||||
{
|
||||
std::vector<std::string> instances;
|
||||
const bool quantize = ADataType == "int8_t" and BDataType == "int8_t";
|
||||
if (get_xdlop_archs().find(arch) != get_xdlop_archs().end())
|
||||
{
|
||||
instance::gemm_add_add_fastgelu_instances all_instances{};
|
||||
if(TransA and TransB)
|
||||
instances = all_instances.get_col_col_instances(quantize);
|
||||
else if(TransA and not TransB)
|
||||
instances = all_instances.get_col_row_instances(quantize);
|
||||
else if(not TransA and not TransB)
|
||||
instances = all_instances.get_row_row_instances(quantize);
|
||||
else
|
||||
instances = all_instances.get_row_col_instances(quantize);
|
||||
}
|
||||
return instances;
|
||||
}
|
||||
|
||||
auto MakeLayoutTuple(const std::vector<bool>& layouts) const
|
||||
{
|
||||
std::string layout_tuple = "ck::Tuple<";
|
||||
auto it = layouts.begin();
|
||||
while(it != layouts.end())
|
||||
{
|
||||
layout_tuple += *it ? "ck::tensor_layout::gemm::ColumnMajor" : "ck::tensor_layout::gemm::RowMajor";
|
||||
it = std::next(it);
|
||||
if (it != layouts.end())
|
||||
layout_tuple += ", ";
|
||||
}
|
||||
|
||||
return layout_tuple + ">";
|
||||
}
|
||||
|
||||
auto MakeTypeTuple(const std::vector<std::string>& types) const
|
||||
{
|
||||
std::string type_tuple = "ck::Tuple<";
|
||||
auto it = types.begin();
|
||||
while(it != types.end())
|
||||
{
|
||||
type_tuple += *it;
|
||||
it = std::next(it);
|
||||
if (it != types.end())
|
||||
type_tuple += ", ";
|
||||
}
|
||||
return type_tuple + ">";
|
||||
}
|
||||
|
||||
auto MakeSolution(index_t idx, const std::string& arch) const
|
||||
{
|
||||
auto template_str = GetInstances(arch).at(idx);
|
||||
std::istringstream iss(template_str);
|
||||
std::vector<std::string> params(std::istream_iterator<std::string>{iss},
|
||||
std::istream_iterator<std::string>());
|
||||
|
||||
if (ADataType == "int8_t" and BDataType == "int8_t")
|
||||
{
|
||||
// Change CBlockTransfer ScalarPerVector if Ds contains other types
|
||||
if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "ck::half_t"; }))
|
||||
{
|
||||
params[params.size() - 3] = "8";
|
||||
}
|
||||
if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "float"; }))
|
||||
{
|
||||
params[params.size() - 3] = "4";
|
||||
}
|
||||
}
|
||||
|
||||
params[a_elementwise_op_idx] = AElementOp;
|
||||
params[b_elementwise_op_idx] = BElementOp;
|
||||
params[ds_layout_idx] = MakeLayoutTuple(DsLayout);
|
||||
params[ds_data_type_idx] = MakeTypeTuple(DsDataType);
|
||||
params[ds_elementwise_op_idx] = CDEElementOp;
|
||||
params[e_data_type_idx] = EDataType;
|
||||
auto block_size_str = params[block_size_idx];
|
||||
auto m_per_block_str = params[m_per_block_idx];
|
||||
auto n_per_block_str = params[n_per_block_idx];
|
||||
auto k_per_block_str = params[k_per_block_idx];
|
||||
const auto block_size = std::stoi(block_size_str);
|
||||
const auto m_per_block = std::stoi(m_per_block_str);
|
||||
const auto n_per_block = std::stoi(n_per_block_str);
|
||||
const auto k_per_block = std::stoi(k_per_block_str);
|
||||
const auto grid_size = GetGridSize(M, N, m_per_block, n_per_block);
|
||||
params[gemm_spec_idx] = GetGemmSpec(M, N, K, m_per_block, n_per_block, k_per_block);
|
||||
|
||||
std::string str = std::accumulate(params.begin() + 1, params.end(), std::string{},
|
||||
[](const std::string& a, const std::string& b) {
|
||||
return a.empty() ? b : a + ", " + b;
|
||||
});
|
||||
str = params.front() + "< " + str + ">";
|
||||
|
||||
return Solution{str, block_size, grid_size};
|
||||
}
|
||||
|
||||
public:
|
||||
auto GetHeaders() const
|
||||
{
|
||||
return ck_headers();
|
||||
}
|
||||
|
||||
auto GetIncludeHeader() const
|
||||
{
|
||||
return instance::gemm_add_add_fastgelu_instances{}.get_include_header();
|
||||
}
|
||||
|
||||
auto GetSolutions(const std::string& arch) const
|
||||
{
|
||||
std::vector<Solution> solutions;
|
||||
const auto num_instances = GetInstances(arch).size();
|
||||
for (auto i = 0; i < num_instances; ++i)
|
||||
{
|
||||
solutions.push_back(MakeSolution(i, arch));
|
||||
}
|
||||
|
||||
return solutions;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
19
library/src/jit_library/src/common.cpp
Normal file
19
library/src/jit_library/src/common.cpp
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
#include "ck/host/common.hpp"
|
||||
#include "ck_headers.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
std::unordered_map<std::string, std::pair<const char*,const char*>> GetHeaders()
|
||||
{
|
||||
return ck_headers();
|
||||
}
|
||||
|
||||
std::size_t integer_divide_ceil(std::size_t x, std::size_t y)
|
||||
{
|
||||
return (x + y - std::size_t{1}) / y;
|
||||
}
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
142
library/src/jit_library/src/device_gemm_multiple_d.cpp
Normal file
142
library/src/jit_library/src/device_gemm_multiple_d.cpp
Normal file
@@ -0,0 +1,142 @@
|
||||
#include "ck/host/device_gemm_multiple_d.hpp"
|
||||
#include "ck/host/common.hpp"
|
||||
#include "ck/solution_instances/gemm_add_add_fastgelu_instances.hpp"
|
||||
#include <unordered_set>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
std::string GetGemmSpec(const std::size_t m,
|
||||
const std::size_t n,
|
||||
const std::size_t k,
|
||||
const std::size_t m_per_block,
|
||||
const std::size_t n_per_block,
|
||||
const std::size_t k_per_block)
|
||||
{
|
||||
std::string spec = "";
|
||||
if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
|
||||
spec += "M";
|
||||
if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
|
||||
spec += "N";
|
||||
if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
|
||||
spec += "K";
|
||||
if(spec == "")
|
||||
return "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
|
||||
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
|
||||
}
|
||||
|
||||
std::size_t GetGridSize(const std::size_t m,
|
||||
const std::size_t n,
|
||||
const std::size_t m_per_block,
|
||||
const std::size_t n_per_block)
|
||||
{
|
||||
return integer_divide_ceil(m, m_per_block) *
|
||||
integer_divide_ceil(n, n_per_block);
|
||||
}
|
||||
|
||||
const std::unordered_set<std::string>& get_xdlop_archs()
|
||||
{
|
||||
static std::unordered_set<std::string> supported_archs{"gfx90a"};
|
||||
return supported_archs;
|
||||
}
|
||||
|
||||
std::vector<std::string> Problem::GetInstances(const std::string& arch) const
|
||||
{
|
||||
std::vector<std::string> instances;
|
||||
const bool quantize = ADataType == "int8_t" and BDataType == "int8_t";
|
||||
if (get_xdlop_archs().find(arch) != get_xdlop_archs().end())
|
||||
{
|
||||
instance::gemm_add_add_fastgelu_instances all_instances{};
|
||||
if(TransA and TransB)
|
||||
instances = all_instances.get_col_col_instances(quantize);
|
||||
else if(TransA and not TransB)
|
||||
instances = all_instances.get_col_row_instances(quantize);
|
||||
else if(not TransA and not TransB)
|
||||
instances = all_instances.get_row_row_instances(quantize);
|
||||
else
|
||||
instances = all_instances.get_row_col_instances(quantize);
|
||||
}
|
||||
return instances;
|
||||
}
|
||||
|
||||
std::string Problem::MakeLayoutTuple(const std::vector<bool>& layouts) const
|
||||
{
|
||||
std::string layout_tuple = "ck::Tuple<";
|
||||
auto it = layouts.begin();
|
||||
while(it != layouts.end())
|
||||
{
|
||||
layout_tuple += *it ? "ck::tensor_layout::gemm::ColumnMajor" : "ck::tensor_layout::gemm::RowMajor";
|
||||
it = std::next(it);
|
||||
if (it != layouts.end())
|
||||
layout_tuple += ", ";
|
||||
}
|
||||
|
||||
return layout_tuple + ">";
|
||||
}
|
||||
|
||||
std::string Problem::MakeTypeTuple(const std::vector<std::string>& types) const
|
||||
{
|
||||
std::string type_tuple = "ck::Tuple<";
|
||||
auto it = types.begin();
|
||||
while(it != types.end())
|
||||
{
|
||||
type_tuple += *it;
|
||||
it = std::next(it);
|
||||
if (it != types.end())
|
||||
type_tuple += ", ";
|
||||
}
|
||||
return type_tuple + ">";
|
||||
}
|
||||
|
||||
Solution Problem::MakeSolution(std::size_t idx, const std::string& arch) const
|
||||
{
|
||||
auto template_str = GetInstances(arch).at(idx);
|
||||
std::istringstream iss(template_str);
|
||||
std::vector<std::string> params(std::istream_iterator<std::string>{iss},
|
||||
std::istream_iterator<std::string>());
|
||||
|
||||
if (ADataType == "int8_t" and BDataType == "int8_t")
|
||||
{
|
||||
// Change CBlockTransfer ScalarPerVector if Ds contains other types
|
||||
if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "ck::half_t"; }))
|
||||
{
|
||||
params[params.size() - 3] = "8";
|
||||
}
|
||||
if (std::any_of(DsDataType.begin(), DsDataType.end(), [](auto t) { return t == "float"; }))
|
||||
{
|
||||
params[params.size() - 3] = "4";
|
||||
}
|
||||
}
|
||||
|
||||
params[a_elementwise_op_idx] = AElementOp;
|
||||
params[b_elementwise_op_idx] = BElementOp;
|
||||
params[ds_layout_idx] = MakeLayoutTuple(DsLayout);
|
||||
params[ds_data_type_idx] = MakeTypeTuple(DsDataType);
|
||||
params[ds_elementwise_op_idx] = CDEElementOp;
|
||||
params[e_data_type_idx] = EDataType;
|
||||
auto block_size_str = params[block_size_idx];
|
||||
auto m_per_block_str = params[m_per_block_idx];
|
||||
auto n_per_block_str = params[n_per_block_idx];
|
||||
auto k_per_block_str = params[k_per_block_idx];
|
||||
const auto block_size = std::stoi(block_size_str);
|
||||
const auto m_per_block = std::stoi(m_per_block_str);
|
||||
const auto n_per_block = std::stoi(n_per_block_str);
|
||||
const auto k_per_block = std::stoi(k_per_block_str);
|
||||
const auto grid_size = GetGridSize(M, N, m_per_block, n_per_block);
|
||||
params[gemm_spec_idx] = GetGemmSpec(M, N, K, m_per_block, n_per_block, k_per_block);
|
||||
|
||||
std::string str = std::accumulate(params.begin() + 1, params.end(), std::string{},
|
||||
[](const std::string& a, const std::string& b) {
|
||||
return a.empty() ? b : a + ", " + b;
|
||||
});
|
||||
str = params.front() + "< " + str + ">";
|
||||
|
||||
return Solution{str, block_size, grid_size};
|
||||
}
|
||||
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user