diff --git a/README.md b/README.md index 5f9f95859b..aa1100dd13 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ cmake \ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3" \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_INSTALL_PREFIX=${PATH_TO_CK_INSTALL_DIRECTORY} \ .. ``` @@ -47,6 +48,12 @@ Instructions for running each individual examples are under ```example/``` ``` Instructions for running ckProfiler are under ```profiler/``` +## Install CK +```bash +make install +``` + +## Using CK as pre-built kernel library ## Caveat ### Kernel Timing and Verification diff --git a/client_example/01_gemm/CMakeLists.txt b/client_example/01_gemm/CMakeLists.txt new file mode 100644 index 0000000000..9e741192f9 --- /dev/null +++ b/client_example/01_gemm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_gemm gemm.cpp) +target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations) diff --git a/client_example/01_gemm/gemm.cpp b/client_example/01_gemm/gemm.cpp new file mode 100644 index 0000000000..9b7b7a6603 --- /dev/null +++ b/client_example/01_gemm/gemm.cpp @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+using ADataType = F16;
+using BDataType = F16;
+using CDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+// Owns one HIP device allocation: hipMalloc in the constructor, hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // GEMM shape and strides; these defaults are used when no arguments are given
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    // argv[0] plus M, N, K, StrideA, StrideB, StrideC: six arguments means argc == 7
+    if(argc == 7)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideC = std::stoi(argv[6]);
+    }
+    else if(argc != 1)
+    {
+        std::cout << "arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n";
+        return 0;
+    }
+
+    // element count spanned by an nRow x nCol matrix whose leading dimension is `stride`
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
+                                                              BLayout,
+                                                              CLayout,
+                                                              ADataType,
+                                                              BDataType,
+                                                              CDataType,
+                                                              AElementOp,
+                                                              BElementOp,
+                                                              CElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        StrideC,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        c_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+            // ave_time is printed as ms below: flop / 1e9 / ms -> TFlops, bytes / 1e6 / ms -> GB/s
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance; best_op_id is still -1 if no instance supported the problem
+    if(found)
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+
+        std::cout << "Run the best instance without timing: " << best_op_name << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        StrideC,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        c_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp index bdd6e05029..dbf2e634f0 100644 --- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp +++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp @@ -10,7 +10,7 @@ #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp" using F16 = ck::half_t; using F32 = float; @@ -25,18 +25,17 @@ using AElementOp = PassThrough; using BElementOp = PassThrough; using CDEElementOp = AddAddFastGelu; -using ADataType = F16; -using BDataType = F16; -using AccDataType = F32; -using D0DataType = F16; -using D1DataType = F16; -using EDataType = F16; +using ADataType = F16; +using BDataType = F16; 
+using D0DataType = F16; +using D1DataType = F16; +using EDataType = F16; -using ALayout = Row; -using BLayout = Col; -using D0Layout = Row; -using D1Layout = Row; -using ELayout = Row; +using ALayout = Row; +using BLayout = Col; +using DDELayout = Row; +using DDELayout = Row; +using DELayout = Row; struct SimpleDeviceMem { @@ -106,24 +105,27 @@ int main(int argc, char* argv[]) SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) * - f_matrix_space_size(M, N, StrideD0, D0Layout{})); + f_matrix_space_size(M, N, StrideD0, DDELayout{})); SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) * - f_matrix_space_size(M, N, StrideD1, D1Layout{})); - SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{})); + f_matrix_space_size(M, N, StrideD1, DDELayout{})); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_matrix_space_size(M, N, StrideE, DELayout{})); - // add device op instances - const auto op_ptrs = ck::tensor_operation::device::device_gemm_instance:: - get_device_gemm_add_add_fastgelu_instances(); + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + DDELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddAddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); std::cout << "found " << op_ptrs.size() << " instances" << std::endl; @@ -231,6 +233,8 @@ int main(int argc, char* argv[]) { invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); } + + std::cout << "Done" << std::endl; } return 0; diff --git 
a/client_example/03_gemm_layernorm/CMakeLists.txt b/client_example/03_gemm_layernorm/CMakeLists.txt index 8eeaffc008..3742e70844 100644 --- a/client_example/03_gemm_layernorm/CMakeLists.txt +++ b/client_example/03_gemm_layernorm/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp) -target_link_libraries(gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations) +add_executable(client_gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp) +target_link_libraries(client_gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations) diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp index bc47a3929a..8f14293728 100644 --- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp +++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp @@ -160,16 +160,17 @@ int main() ck::index_t StrideC = 1024; ck::index_t StrideD0 = 1024; - const auto gemm_reduce_ptrs = ck::tensor_operation::device::device_gemm_instance:: - get_device_gemm_add_add_mean_squaremean_instances(); + const auto gemm_reduce_ptrs = + ck::tensor_operation::device::instance::get_device_gemm_add_add_mean_squaremean_instances< + ADataType, + BDataType, + CDataType, + ALayout, + BLayout, + CLayout>(); const auto normalize_ptrs = - ck::tensor_operation::device::get_device_normalize_from_mean_meansquare_instances< + ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances< CDataType, ReduceDataType, ReduceDataType, @@ -267,4 +268,4 @@ int main() << std::endl; } } -} \ No newline at end of file +} diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt index a8a566703b..41acd47dc3 100644 --- a/client_example/CMakeLists.txt +++ b/client_example/CMakeLists.txt @@ -6,5 +6,6 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations) find_package(hip REQUIRED PATHS /opt/rocm) 
message(STATUS "Build with HIP ${hip_VERSION}") +add_subdirectory(01_gemm) add_subdirectory(02_gemm_add_add_fastgelu) add_subdirectory(03_gemm_layernorm) diff --git a/client_example/README.md b/client_example/README.md index dc6b9c48fc..64a7130d53 100644 --- a/client_example/README.md +++ b/client_example/README.md @@ -1,17 +1,6 @@ ## Client application links to CK library, and therefore CK library needs to be installed before building client applications. -## Docker script -```bash -docker run \ --it \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm5.1-tf2.6-dev \ -/bin/bash -``` ## Build ```bash @@ -22,7 +11,7 @@ cd client_example/build ```bash cmake \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}" \ .. ``` diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp index 4fc953b3a6..57ba31549e 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp @@ -12,7 +12,13 @@ namespace ck { namespace tensor_operation { namespace device { -template struct DeviceBatchedGemm : public BaseOperator @@ -34,11 +40,24 @@ struct DeviceBatchedGemm : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceBatchedGemmPtr = std::unique_ptr< - DeviceBatchedGemm>; +using DeviceBatchedGemmPtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp index c63dfd2c53..881bc976fb 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp @@ -113,7 +113,7 
@@ __global__ void ignore = c_element_op; ignore = compute_ptr_offset_of_batch; ignore = block_2_ctile_map; -#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +#endif } template -struct DeviceBatchedGemmXdl - : public DeviceBatchedGemm +struct DeviceBatchedGemmXdl : public DeviceBatchedGemm { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm.hpp index 2b9e367579..231f611c46 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm.hpp @@ -17,33 +17,52 @@ struct GemmShape ck::index_t StrideA, StrideB, StrideC; }; -template struct DeviceGemm : public BaseOperator { - virtual std::unique_ptr MakeArgumentPointer(const void* p_a, - const void* p_b, - void* p_c, - ck::index_t M, - ck::index_t N, - ck::index_t K, - ck::index_t StrideA, - ck::index_t StrideB, - ck::index_t StrideC, - AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op, - ck::index_t KBatch = 1) = 0; + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceGemmPtr = std::unique_ptr< - DeviceGemm>; +using DeviceGemmPtr = std::unique_ptr>; template && is_same_v, bool> = false> -struct DeviceGemmDl - : public DeviceGemm +struct DeviceGemmDl : public DeviceGemm + { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -534,8 +542,7 @@ struct DeviceGemmDl index_t StrideC, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - 
CElementwiseOperation c_element_op, - index_t /* KBatch */ = 1) override + CElementwiseOperation c_element_op) override { return std::make_unique(static_cast(p_a), static_cast(p_b), diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp index bbd4c3461d..2f5248e76c 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp @@ -16,12 +16,20 @@ namespace device { // output : E[M, N] // C = a_op(A) * b_op(B) // E = cde_op(C, D0, D1, ...) -template struct DeviceGemmMultipleD : public BaseOperator { + static constexpr index_t NumDTensor = DsDataType::Size(); + virtual std::unique_ptr MakeArgumentPointer(const void* p_a, const void* p_b, @@ -41,14 +49,26 @@ struct DeviceGemmMultipleD : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceGemmMultipleDPtr = std::unique_ptr +using DeviceGemmMultipleDPtr = std::unique_ptr>; + CDEElementwiseOperation>>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp index 13446056fa..4e8381a3fd 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp @@ -96,7 +96,7 @@ namespace device { // E = cde_op(C, D0, D1, ...) 
template -struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD @@ -360,12 +366,12 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD::value) + if constexpr(is_same::value) { return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), make_tuple(StrideE, I1)); } - else if constexpr(is_same::value) + else if constexpr(is_same::value) { return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), make_tuple(I1, StrideE)); diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp index 9bbc19eb49..fcc088ca43 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp @@ -2,13 +2,16 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. #pragma once + #include + #include "device_base.hpp" namespace ck { namespace tensor_operation { namespace device { +// FIXME: DeviceGemmReduce type need to well define the problem template struct DeviceGemmReduce : public BaseOperator { diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp index 5950d8f8dd..c701bff57f 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp @@ -2,6 +2,7 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once + #include #include @@ -11,7 +12,13 @@ namespace ck { namespace tensor_operation { namespace device { -template struct DeviceGemmSplitK : public BaseOperator @@ -33,11 +40,24 @@ struct DeviceGemmSplitK : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceGemmSplitKPtr = std::unique_ptr< - DeviceGemmSplitK>; +using DeviceGemmSplitKPtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp index b323bb8fef..98028e1f28 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp @@ -57,8 +57,15 @@ template -struct DeviceGemmXdl - : public DeviceGemm +struct DeviceGemmXdl : public DeviceGemm { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -487,8 +494,7 @@ struct DeviceGemmXdl index_t StrideC, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op, - index_t /* KBatch */ = 1) override + CElementwiseOperation c_element_op) override { return std::make_unique(static_cast(p_a), static_cast(p_b), diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp index 851d965f9b..9c8b189add 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp @@ -65,8 +65,15 @@ template -struct DeviceGemm_Xdl_CShuffle - : public DeviceGemm +struct DeviceGemm_Xdl_CShuffle : public DeviceGemm { using DeviceOp = DeviceGemm_Xdl_CShuffle; @@ -622,8 +629,7 @@ struct DeviceGemm_Xdl_CShuffle index_t StrideC, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op, - index_t /* KBatch */ = 1) 
override + CElementwiseOperation c_element_op) override { return std::make_unique(static_cast(p_a), static_cast(p_b), diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp index 9d24a4932d..306a73dff1 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp @@ -56,8 +56,15 @@ template -struct DeviceGemmXdlSplitK - : public DeviceGemmSplitK +struct DeviceGemmXdlSplitK : public DeviceGemmSplitK { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp index f484de324a..52bdacf7db 100644 --- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp @@ -58,8 +58,15 @@ template -struct DeviceGemmXdlSplitKCShuffle - : public DeviceGemmSplitK +struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp similarity index 72% rename from library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp rename to library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp index 60343a17b8..20df1b3616 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp @@ -4,14 +4,17 @@ #pragma once #include +#include + #include "ck/utility/functional2.hpp" namespace ck { namespace 
tensor_operation { namespace device { +namespace instance { -template -void add_device_operation_instances(std::vector>& op_instances, +template +void add_device_operation_instances(std::vector>& op_instances, const NewOpInstances& new_op_instances) { ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) { @@ -19,10 +22,14 @@ void add_device_operation_instances(std::vector>& op using NewOpInstance = remove_cvref_t; + static_assert(std::is_base_of_v, + "wrong! NewOpInstance should be derived from BaseOp"); + op_instances.push_back(std::make_unique(new_op_instance)); }); } +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp new file mode 100644 index 0000000000..d453bb0c79 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// aliasing, for commonly used type +using F64 = double; +using F32 = float; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; + +using F16_F16 = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +template +struct DeviceOperationInstanceFactory; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp new file mode 100644 index 0000000000..0655fd92e4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances( + std::vector>>& instances); + +void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances( + std::vector>>& instances); + +void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances( + std::vector>>& instances); + +void 
add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceBatchedGemm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(op_ptrs); + } + else if 
constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_batched_gemm_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_batched_gemm_instance.hpp deleted file mode 100644 index 6379ac26cd..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_batched_gemm_instance.hpp +++ /dev/null @@ -1,203 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_batched_gemm_instance { - -using DeviceBatchedGemmNoOpPtr = ck::tensor_operation::device::DeviceBatchedGemmPtr< - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough>; - -void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances( - std::vector&); -void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances( - std::vector&); 
-void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances( - std::vector&); - -template -auto get_device_batched_gemm_instances() -{ - std::vector op_ptrs; - - if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(op_ptrs); - } - } - else if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - 
ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs); - } - } - else if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(op_ptrs); - } - } - else if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - 
ck::tensor_operation::device::device_batched_gemm_instance:: - add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(op_ptrs); - } - } - - return op_ptrs; -} - -} // namespace device_batched_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp index a668f67c49..a9cc8b79dd 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp @@ -10,11 +10,12 @@ #include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { +namespace instance { using Normalize = ck::tensor_operation::element_wise::Normalize; using DeviceNormalizeFromMeanMeanSquarePtr = @@ -37,13 +38,14 @@ auto get_device_normalize_from_mean_meansquare_instances() is_same::value && is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device:: + ck::tensor_operation::device::instance:: add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs); } return op_ptrs; } +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp deleted file mode 100644 index 6aa33e4d20..0000000000 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { - -using DeviceGemmAddAddFastGeluPtr = ck::tensor_operation::device::DeviceGemmMultipleDPtr< - 2, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::AddAddFastGelu>; - -void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( - std::vector&); -void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( - std::vector&); -void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( - std::vector&); -void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( - std::vector&); - -template -auto get_device_gemm_add_add_fastgelu_instances() -{ - std::vector op_ptrs; - - if constexpr(is_same_v && is_same_v && - is_same_v) - { - if constexpr(is_same_v && - is_same_v && - is_same_v) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( - op_ptrs); - } - else if constexpr(is_same_v && - is_same_v && - is_same_v) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( - op_ptrs); - } - else if constexpr(is_same_v && - is_same_v && - is_same_v) - { - 
ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( - op_ptrs); - } - else if constexpr(is_same_v && - is_same_v && - is_same_v) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( - op_ptrs); - } - } - - return op_ptrs; -} - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_instance.hpp deleted file mode 100644 index 665b63c942..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_instance.hpp +++ /dev/null @@ -1,286 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(std::vector&); - -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( - std::vector&); -void 
add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( - std::vector&); - -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(std::vector&); - -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(std::vector&); - -void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( - std::vector&); - -void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(std::vector&); - -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(std::vector&); - -void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(std::vector&); -void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(std::vector&); -void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(std::vector&); -void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(std::vector&); - -void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(std::vector&); -void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(std::vector&); -void 
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(std::vector&); -void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(std::vector&); - -void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(std::vector&); -void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(std::vector&); -void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(std::vector&); -void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(std::vector&); - -template -auto get_device_gemm_instances() -{ - std::vector op_ptrs; - - if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - 
ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(op_ptrs); - } - } - else if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - 
is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs); - } - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs); - } - } - else if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - 
add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs); - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(op_ptrs); - } - } - - return op_ptrs; -} - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp index 32eeaaa1fd..682f546759 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp @@ -10,12 +10,12 @@ #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using DeviceGemmAddAddMeanSquareMeanPtr = ck::tensor_operation::device::DeviceGemmReducePtr<1, 2>; @@ -45,7 +45,7 @@ auto get_device_gemm_add_add_mean_squaremean_instances() is_same::value && is_same::value) { - 
ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( op_ptrs); } @@ -53,7 +53,7 @@ auto get_device_gemm_add_add_mean_squaremean_instances() is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( op_ptrs); } @@ -61,7 +61,7 @@ auto get_device_gemm_add_add_mean_squaremean_instances() is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( op_ptrs); } @@ -69,7 +69,7 @@ auto get_device_gemm_add_add_mean_squaremean_instances() is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( op_ptrs); } @@ -78,7 +78,7 @@ auto get_device_gemm_add_add_mean_squaremean_instances() return op_ptrs; } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_splitk_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_splitk_instance.hpp deleted file mode 100644 index c1fa54ad2a..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_splitk_instance.hpp +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { - -using DeviceGemmSplitKNoOpPtr = ck::tensor_operation::device::DeviceGemmSplitKPtr< - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough>; - -void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances( - std::vector&); - -void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances( - std::vector&); - -template -auto get_device_gemm_splitk_instances() -{ - std::vector op_ptrs; - - if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - 
ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs); - } - } - else if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(op_ptrs); - } - else if constexpr(is_same::value && - is_same::value && - is_same::value) - { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs); - } - } - - return op_ptrs; -} - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp new file mode 100644 index 0000000000..55ca8f4294 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + + instances); + +void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( + 
std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances( + std::vector>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGemm> +{ + using DeviceOp = DeviceGemm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + } + else if 
constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && 
+ is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp new file mode 100644 index 0000000000..55e4dbe106 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( + std::vector>>&); + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( + std::vector>>&); + +// GEMM + Add + Add + FastGelu +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGemmMultipleD; + + static auto GetInstances() + { + std::vector> 
op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v> && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( + op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp new file mode 100644 index 0000000000..8986a79344 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGemmSplitK> +{ + using DeviceOp = DeviceGemmSplitK; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + 
add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index 43a7033f72..5fd8c95f84 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { using reduce_configuration_1_instances_blockwise = std::tuple< // clang-format off @@ -174,7 +174,7 @@ void add_device_reduce_instance_blockwise( Rank, \ NumReduceDim) -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp index 7fb427a9b3..8d1fed046a 
100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -53,7 +53,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp index db9ed38f95..ae7f13ce97 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -40,7 +40,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp index 1aee1aa549..c26e136593 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -28,7 +28,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp index 5bf0ef6a81..30064d588d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -52,7 +52,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 
1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp index b9dc1d669d..c9f6a1a5ff 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -28,7 +28,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp index 4b757fda29..c598e64cde 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt 
| Rank | NumReduceDim @@ -52,7 +52,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp index cf8343d704..cd15949929 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp index 5ec8656e6c..bf62f92ad8 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { 
-namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -40,7 +40,7 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp index 105e12aa5d..9fc409a08e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { template struct ReductionConfiguration_1 @@ -34,7 +34,7 @@ struct ReductionConfiguration_2 #define QUICK_REDUCE_TEST 1 -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index c5a8fc0f4a..a74e92ecab 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -11,7 +11,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { 
+namespace instance { using reduce_configuration_1_instances_multiblock_atomic_add = std::tuple< // clang-format off @@ -193,7 +193,7 @@ void add_device_reduce_instance_multiblock_atomic_add( Rank, \ NumReduceDim) -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp index 43ebd93fea..3efc585068 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp index a47e6a1bda..804cba12cc 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp index f20752c500..32eb843a1c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp index c5a30654fe..9f2a892475 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp index 11957046b8..bd20069992 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ 
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index 487c1d4137..6b84b25d0e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { #ifdef QUICK_REDUCE_TEST using reduce_configuration_2_instances_threadwise = std::tuple< @@ -151,7 +151,7 @@ void add_device_reduce_instance_threadwise( Rank, \ NumReduceDim) -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp index 2c6139a095..5f7f5c7af5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | 
Rank | NumReduceDim @@ -53,7 +53,7 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp index f61983344e..3c21b408cc 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -40,7 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp index effdb1945b..cd116986d9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp @@ -10,7 +10,7 @@ namespace ck { namespace 
tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -28,7 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp index e293c79d49..a764735fa9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -52,7 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp index 75894702b8..7d47c79f84 100644 --- 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -28,7 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp index add0b28cb8..faced808a2 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -52,7 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp index 307be917ef..111ba7a0cf 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp index bc4ff97b31..c771f057d6 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp @@ -10,7 +10,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -40,7 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 
2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/utility/conv_util.hpp b/library/include/ck/library/utility/conv_util.hpp index 0d4f8f8796..e57bde8add 100644 --- a/library/include/ck/library/utility/conv_util.hpp +++ b/library/include/ck/library/utility/conv_util.hpp @@ -31,15 +31,15 @@ namespace device { using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr; -namespace device_conv1d_fwd_instance { +namespace instance { void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector&); void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector&); void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector&); void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector&); -} // namespace device_conv1d_fwd_instance -namespace device_conv2d_fwd_instance { +} // namespace instance +namespace instance { void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector&); void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector&); @@ -48,15 +48,15 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector&); void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector&); -} // namespace device_conv2d_fwd_instance -namespace device_conv3d_fwd_instance { +} // namespace instance +namespace instance { void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector&); void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector&); void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector&); void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector&); -} // namespace device_conv3d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation @@ -295,17 +295,17 @@ struct 
ConvolutionFwdInstances std::vector conv_ptrs; if constexpr(NumDimSpatial == 1) { - ck::tensor_operation::device::device_conv1d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs); } else if constexpr(NumDimSpatial == 2) { - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); } else if constexpr(NumDimSpatial == 3) { - ck::tensor_operation::device::device_conv3d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs); } return conv_ptrs; @@ -322,20 +322,20 @@ struct ConvolutionFwdInstances std::vector conv_ptrs; if constexpr(NumDimSpatial == 1) { - ck::tensor_operation::device::device_conv1d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs); return conv_ptrs; } else if constexpr(NumDimSpatial == 2) { - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); } else if constexpr(NumDimSpatial == 3) { - ck::tensor_operation::device::device_conv3d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs); } return conv_ptrs; @@ -352,17 +352,17 @@ struct ConvolutionFwdInstances std::vector conv_ptrs; if constexpr(NumDimSpatial == 1) { - ck::tensor_operation::device::device_conv1d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs); } else if constexpr(NumDimSpatial == 2) { - 
ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); } else if constexpr(NumDimSpatial == 3) { - ck::tensor_operation::device::device_conv3d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs); } return conv_ptrs; @@ -379,17 +379,17 @@ struct ConvolutionFwdInstances std::vector conv_ptrs; if constexpr(NumDimSpatial == 1) { - ck::tensor_operation::device::device_conv1d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs); } else if constexpr(NumDimSpatial == 2) { - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); } else if constexpr(NumDimSpatial == 3) { - ck::tensor_operation::device::device_conv3d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs); } return conv_ptrs; diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp index 6a262b7929..1cc92524c6 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -28,29 +29,31 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; // Compilation parameters for a[k, m] * b[k, n] = c[m, n] using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances = std::tuple< // clang-format off - //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| - //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| - //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| - //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, 
S<0, 2, 1>, 1, 4, 8, true, 7, 1>, - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, - DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + //##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| 
Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 
2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> // clang-format on >; void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp index 15549d8444..c35a8d6d66 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace 
instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -44,13 +44,15 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp index ad9c8eff40..1bbedebeb8 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -48,13 +48,15 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances{}); } -} // namespace 
device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp index a5afc76586..2ceaa20b80 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -49,13 +49,15 @@ using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp index 666c64e016..3696285726 100644 --- 
a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -44,13 +44,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp index ad97d3530e..f79d304187 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include 
"ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -44,13 +44,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp index 593903c718..8290e7565c 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -53,13 +53,15 @@ using 
device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp index 0220919f8e..f3345eba81 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -49,13 +49,15 @@ using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck 
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp index 74e36e9dd2..8b671dfdb4 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -44,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp index 5873433e2d..646450e722 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -44,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp index 14b994e1f6..1696d29713 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -44,13 +44,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp index 2c656e7ebb..3dbd63707d 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -49,13 +49,15 @@ using device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances( - std::vector>& instances) + 
std::vector>>& + instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp index feef3b48ce..0691f4f865 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -59,13 +59,21 @@ using device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp index df24ae135d..efd49bf12d 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -59,13 +59,21 @@ using device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp index fb769fc1bb..9c3d6609ca 100644 --- 
a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -59,13 +59,21 @@ using device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp index 389f4225ef..330d139607 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include 
"ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_batched_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -51,13 +51,21 @@ using device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances = std::tuple< >; void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances(instances, device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances{}); } -} // namespace device_batched_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp index e101cc41bb..f5449b117c 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -74,7 +74,7 @@ void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp index cdd022b036..06eda85570 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -74,7 +74,7 @@ void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances{}); } -} // namespace device_gemm_instance +} // 
namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp index f500455095..9214e0b1d9 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -74,7 +74,7 @@ void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp index 3db783ce58..7e4f6226b1 100644 --- 
a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -71,7 +71,7 @@ void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp index 2f8af13531..d4c65ff54b 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv1d_fwd_instance { +namespace instance { using F32 = float; using BF16 = bhalf_t; @@ -109,7 +109,7 @@ void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances( device_conv1d_fwd_xdl_nwc_kxc_nwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv1d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp index a1cf61ff91..166d25ba48 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv1d_fwd_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -109,7 +109,7 @@ void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances( device_conv1d_fwd_xdl_nwc_kxc_nwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv1d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp 
b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp index b086e57ae0..2cb296e472 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv1d_fwd_instance { +namespace instance { using F32 = float; @@ -112,7 +112,7 @@ void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances( device_conv1d_fwd_xdl_nwc_kxc_nwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv1d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp index d6ccab5cd0..2364c5ea32 100644 --- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp @@ -8,12 +8,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv1d_fwd_instance { +namespace instance { using F32 = float; @@ -111,7 +111,7 @@ void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances( device_conv1d_fwd_xdl_nwc_kxc_nwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv1d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 74909537d6..3b716d641c 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -82,7 +82,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index 70cca34b16..5978ffcd10 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -84,7 +84,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index e758d49a07..42e80be1a0 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F32 = float; @@ -81,7 +81,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 5d6e0fb640..ff15c0238b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using DataType = int8_t; using AccType = int32_t; @@ -82,7 +82,7 @@ void 
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index f02b9bc528..ea9fb8c6a8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_weight_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -52,7 +52,7 @@ void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances( device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances{}); } -} // namespace device_conv2d_bwd_weight_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 
318de32e99..744f2f91e8 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_weight_instance { +namespace instance { using F32 = float; @@ -51,7 +51,7 @@ void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances( device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances{}); } -} // namespace device_conv2d_bwd_weight_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp index 968d6331dd..7766a12eb9 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" 
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -143,7 +143,7 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 19ad28dd33..efb4bd875f 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -109,7 +109,7 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index b3797c879e..5c0110aa51 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -108,7 +108,7 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index eac47a5b69..3e4c8debc9 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include 
"ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using F32 = float; @@ -107,7 +107,7 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index ba7b607940..cd1bf085fb 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using F32 = float; @@ -108,7 +108,7 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 8318934e7b..75351654ba 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -112,7 +112,7 @@ void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index 09fdb4e4c3..c274e7e49d 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include 
"ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -111,7 +111,7 @@ void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 32856e898c..22cb766415 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using F32 = float; @@ -110,7 +110,7 @@ void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv2d_fwd_instance 
+} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 47478524e9..076faf7f3b 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_instance { +namespace instance { using F32 = float; @@ -111,7 +111,7 @@ void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp index 483e6e3d78..ca0f9c81b1 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_bias_activation_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -148,7 +148,7 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances( instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances{}); } -} // namespace device_conv2d_fwd_bias_activation_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp index cf5f4aadf4..91aa918287 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" 
-#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_bias_activation_add_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -148,7 +148,7 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instan device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances{}); } -} // namespace device_conv2d_fwd_bias_activation_add_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp index ed9856a082..e55a3d2b5b 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv3d_fwd_instance { +namespace instance { using F32 = float; using BF16 = bhalf_t; @@ -109,7 +109,7 @@ void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( instances, device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv3d_fwd_instance +} // namespace instance 
} // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp index 68e03b57a8..01c6cc6b37 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv3d_fwd_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -109,7 +109,7 @@ void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances( instances, device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv3d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp index b7dc6d1990..f881958c91 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv3d_fwd_instance { +namespace instance { using F32 = float; @@ -108,7 +108,7 @@ void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances( instances, device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv3d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp index ab12fa8cdf..d7c0a30874 100644 --- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv3d_fwd_instance { +namespace instance { using F32 = float; @@ -111,7 +111,7 @@ void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances( instances, 
device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv3d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp index 732f739789..a449a9053f 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using BF16 = bhalf_t; using F32 = float; @@ -83,7 +83,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp index 1f5b0c9d2e..fb97674032 100644 --- 
a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -85,7 +85,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp index e6a52e6351..e8f2a45b71 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F32 = float; @@ -82,7 +82,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp index 3acf3a44be..6aad1f029f 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using DataType = int8_t; using AccType = int32_t; @@ -85,7 +85,7 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances( instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp index 8553ec9558..010291cb47 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using BF16 = bhalf_t; using F32 = float; @@ -83,7 +83,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp index ba38143bdb..e7e147177a 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -83,7 +83,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp index 39aa4b2586..357ddabd10 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F32 = float; @@ -82,7 +82,7 @@ void 
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp index 3657c25c17..3eadb0bdc9 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using DataType = int8_t; using AccType = int32_t; @@ -83,7 +83,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp index 
9d3e628b56..6b5f71ff78 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using BF16 = bhalf_t; using F32 = float; @@ -83,7 +83,7 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_bf16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp index 5653866d3f..214aea289b 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include 
"ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -83,7 +83,7 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f16_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp index 16f47ca272..c3e8b5e8c7 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using F32 = float; @@ -82,7 +82,7 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f32_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace 
device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp index b5307661a1..9142b8049b 100644 --- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using DataType = int8_t; using AccType = int32_t; @@ -83,7 +83,7 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances( instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_int8_instances{}); } -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp index ecb94d4c9a..12f7901c16 100644 --- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp @@ -7,11 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include 
"ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -44,6 +45,7 @@ void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{}); } +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp index 60cfe30cba..1e77625448 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -39,12 +39,14 @@ using device_gemm_dl_f16_f16_f16_km_kn_mn_instances = std::tuple< >; void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace 
device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp index a786378669..b281d5e9c2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -39,12 +39,14 @@ using device_gemm_dl_f16_f16_f16_km_nk_mn_instances = std::tuple< >; void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp index 8583b94517..d543801ecd 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -39,12 +39,14 @@ using device_gemm_dl_f16_f16_f16_mk_kn_mn_instances = std::tuple< >; void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp index 41a5444ecc..568e3f1be5 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -40,12 +40,14 @@ using device_gemm_dl_f16_f16_f16_mk_nk_mn_instances = >; void 
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp index 26602de885..21f825b099 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -39,12 +39,14 @@ using device_gemm_dl_f32_f32_f32_km_kn_mn_instances = std::tuple< >; void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp index b085a0cc94..3c59d1c84a 100644 --- 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -40,12 +40,14 @@ using device_gemm_dl_f32_f32_f32_km_nk_mn_instances = >; void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp index 46f50257f7..e48c5ef501 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace 
tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -40,12 +40,14 @@ using device_gemm_dl_f32_f32_f32_mk_kn_mn_instances = >; void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp index ec62efaa16..d0cb4fde92 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -40,12 +40,14 @@ using device_gemm_dl_f32_f32_f32_mk_nk_mn_instances = >; void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp index 1f728cdc41..6ddb623874 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -36,12 +36,14 @@ using device_gemm_dl_i8_i8_i8_km_kn_mn_instances = std::tuple< >; void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp index 7a1b3011f7..f59332293a 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include 
"ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -36,12 +36,14 @@ using device_gemm_dl_i8_i8_i8_km_nk_mn_instances = std::tuple< >; void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp index a8af057322..df6aa3ab20 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -36,12 +36,14 @@ using 
device_gemm_dl_i8_i8_i8_mk_kn_mn_instances = std::tuple< >; void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp index cafa4ff3ea..8c20689a26 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; @@ -36,12 +36,14 @@ using device_gemm_dl_i8_i8_i8_mk_nk_mn_instances = std::tuple< >; void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp index 3d63f880f6..5cb92831cd 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,13 +51,15 @@ using device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances = std::tu >; void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances( instances, device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp index 4e8fb4700f..a7e6dd5726 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include 
"ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp index 6323940dcb..78806b691c 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using BF16 = 
ck::bhalf_t; using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp index f16b2ded78..4ad378f790 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace 
tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp index 8fc725292a..84cadc73fc 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using BF16 = ck::bhalf_t; using F32 = float; @@ -51,13 +51,15 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp index c9999a3d15..48535efb18 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp index 218106054f..184f393fd6 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { 
namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp index 9fb2081838..988bc00bfe 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace 
device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp index 91b508f73d..61043b2018 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,13 +51,15 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp index 9473cb5003..f099e7975b 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -53,13 +53,15 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp index 49b566b2d7..c2908c508a 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation 
{ namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -53,13 +53,15 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp index 9ddf33e0c0..3d3f07f59a 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -53,13 +53,15 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace 
tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp index 8cba352e68..f1ac7ba904 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -50,13 +50,15 @@ using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp index d9190115ad..7aa930f66e 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp @@ -7,12 +7,12 
@@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances = >; void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp index 04e6286025..b7753db873 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -54,13 +54,15 @@ using 
device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances = >; void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp index 7bfadc24d1..9bba0362a1 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -54,13 +54,15 @@ using device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances = >; void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp index 5f80a97318..39c5fe5b9b 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -51,13 +51,15 @@ using device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances = >; void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp index ea568523c4..161ec4eca0 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include 
"ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -47,12 +47,14 @@ using device_gemm_xdl_f16_f16_f16_km_kn_mn_instances = >; void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp index 7c915a4dea..8ce029482c 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -47,12 +47,14 @@ using device_gemm_xdl_f16_f16_f16_km_nk_mn_instances = >; void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, 
device_gemm_xdl_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp index 424f255784..2f66e8dac5 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -56,12 +56,14 @@ using device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances = >; void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp index bdc8312d44..1807faa495 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -66,14 +66,16 @@ using device_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances = >; void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances{}); add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp index 6560c4b7ce..f4d7516c9f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" 
namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -47,12 +47,14 @@ using device_gemm_xdl_f32_f32_f32_km_kn_mn_instances = >; void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp index e9f050f63c..cac64fb924 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -47,12 +47,14 @@ using device_gemm_xdl_f32_f32_f32_km_nk_mn_instances = >; void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp index ab3e99ea30..19ae11f7f3 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -47,12 +47,14 @@ using device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances = >; void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp index edfcb56b1b..74ace438bc 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include 
"ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -52,12 +52,14 @@ using device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances = >; void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp index 278b928e40..e692463b34 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F64 = double; @@ -43,12 +43,14 @@ using device_gemm_xdl_f64_f64_f64_km_kn_mn_instances = >; void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, 
device_gemm_xdl_f64_f64_f64_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp index 1c4468f9d2..c0a9fc3cca 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F64 = double; @@ -43,12 +43,14 @@ using device_gemm_xdl_f64_f64_f64_km_nk_mn_instances = >; void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp index e6a6eb8209..64d65440e2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F64 = double; @@ -43,12 +43,14 @@ using device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances = >; void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp index 96e3f982f0..41fa131cd1 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F64 = double; @@ -48,12 
+48,14 @@ using device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances = >; void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp index 372e25a45e..1dc47dfa02 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp @@ -6,12 +6,13 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -57,13 +58,22 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances >; void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances( instances, device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // 
namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp index 29ba57c4d3..dc21da7031 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp @@ -6,12 +6,13 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -57,13 +58,22 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances >; void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances( instances, device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp index fb77a0289e..0cf02c1e0f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp @@ -6,12 +6,13 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -57,13 +58,22 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances >; void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances( instances, device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp index cf894ebec5..9a753dd0ee 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp @@ -6,12 +6,13 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -54,13 +55,22 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances >; void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& instances) { add_device_operation_instances( instances, device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instance.cpp index 20eb5ae599..66a2462529 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include 
"ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,7 +51,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instance.cpp index b7f02e211a..52d4fc0fb2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,7 +51,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace 
tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instance.cpp index 1ee5bdbcde..69bcbf02f4 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,7 +51,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp index 320053a023..37aeabd993 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp @@ 
-7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -56,7 +56,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instance.cpp index 9d52cf000f..399b835fac 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -50,7 +50,7 @@ void 
add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instance.cpp index f78cc76363..4289044d5b 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -50,7 +50,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instance.cpp index a018fc6a0a..985a8d6f57 100644 --- 
a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -50,7 +50,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instance.cpp index 846abd587d..ae7d411556 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; @@ -55,7 +55,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp index 3423737311..fbc91507f4 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp @@ -9,12 +9,13 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -76,7 +77,7 @@ void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances{}); } -} // namespace 
device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp index 2351438e6f..6841b562ec 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -76,7 +76,7 @@ void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp index 28e90c3c6a..19f8dfebe4 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -76,7 +76,7 @@ void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp index c5e4411a38..b02c45e312 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -73,7 +73,7 @@ void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instance.cpp index d2ef687a88..05a1471eab 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,7 +51,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp index b966e38cfe..f6aea825b4 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,7 +51,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instance.cpp index 4dad097cd8..1d6b8ee8e0 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,7 +51,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instance.cpp index a25f29688f..1c68962c46 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instance.cpp 
@@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -56,7 +56,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instance.cpp index c452d312e5..12ee8b4a21 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { 
using F16 = ck::half_t; using F32 = float; @@ -53,7 +53,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp index 832ccb70f2..d7cb6522ad 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -53,7 +53,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instance.cpp index 45cd5b0c8a..c487b06665 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -53,7 +53,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instance.cpp index 2ed436c73a..25eca45be2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -58,7 +58,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instances( instances, device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp index 5036253904..8bf756c36d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" 
namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -74,7 +74,7 @@ void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances( instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp index d859bd4505..6c9d0fe2de 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -74,7 +74,7 @@ void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances( instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp index 7d42a71721..210709154e 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -74,7 +74,7 @@ void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances( instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp index daf18b62bf..de707afa26 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp @@ -9,12 +9,12 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -71,7 +71,7 @@ void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances( instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp index 311b8c088e..7a1b4a0461 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { 
+namespace instance { using F16 = ck::half_t; using F32 = float; @@ -46,13 +47,15 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp index 657135e295..30d3034541 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -46,13 +47,15 @@ using device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace 
tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp index 10229534a9..3ea117169b 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -46,13 +47,15 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp index 31bf3233cd..3de7c71f5f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -50,50 +51,16 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< // clang-format on >; -// using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple< -// // clang-format off -// //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| -// B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| -// ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| -// ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| -// BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| -// CBlockTransferClusterLengths| CBlockTransfer| -// //#########################| Type| Type| Type| Type| | | | -// Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | -// XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| -// SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| -// SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| -// _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| -// //#########################| | | | | | | | -// Operation| Operation| Operation| | | | | | | | -// | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| -// PerVector_K1| | Lengths_K0_N_K1| 
ArrangeOrder| | | -// PerVector| PerVector_K1| | PerShuffle| PerShuffle| -// _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| -// //#########################| | | | | | | | | | -// | | | | | | | | | | | | -// | | | | | | | | | | | | -// | | | | | -// DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, -// PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 144, 4, 8, 16, -// 16, 2, 9, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, -// true, S<1, 4, 16, 4>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 2, 2, -// true, 1, 9, S<1, 2, 1, 72>, 2> -// // clang-format on -// >; - void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{}); - - // FIXME - IsSupportedArgument() is false, need to check validity - // add_device_operation_instances( - // instances, device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp index f3a26d6de8..d2ed833434 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include 
"ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -46,13 +47,15 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp index 381fc1ced5..c6e4a1f17f 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -46,13 +47,15 @@ using device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, 
device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp index 47b3f2ebd0..d5cdc637e8 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,13 +52,15 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances = std::tuple< >; void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp index d532fe1e77..81c73d6367 100644 --- 
a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp @@ -7,12 +7,13 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -51,13 +52,15 @@ using device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances = std::tuple< >; void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( - std::vector>& instances) + std::vector>>& + instances) { add_device_operation_instances(instances, device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances{}); } -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp index 35737b6845..f90bc26b0a 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" -#include 
"ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_grouped_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -52,7 +52,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances( device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances{}); } -} // namespace device_grouped_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp index c8d77576d1..0c8a0141b6 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_grouped_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -52,7 +52,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances( device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances{}); } -} // namespace device_grouped_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp index 1842fc713d..5c49c89407 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_grouped_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -61,7 +61,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances( device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances{}); } -} // namespace device_grouped_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp index 0672cc6c9e..288c909bf9 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -7,12 +7,12 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" 
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_grouped_gemm_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -72,7 +72,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( instances, device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); } -} // namespace device_grouped_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp index c5019c690d..8465baa17c 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp @@ -2,14 +2,15 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/utility/data_type.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + namespace ck { namespace tensor_operation { namespace device { -namespace device_normalization_instance { +namespace instance { using F16 = ck::half_t; using F32 = float; @@ -43,7 +44,7 @@ void add_device_softmax_f16_f16_rank4_instances(std::vector{}); } -} // namespace device_normalization_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp index 985f17012e..73ecf747b2 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp @@ -2,14 +2,14 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
#include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/utility/data_type.hpp" namespace ck { namespace tensor_operation { namespace device { -namespace device_normalization_instance { +namespace instance { using F32 = float; @@ -42,7 +42,7 @@ void add_device_softmax_f32_f32_rank4_instances(std::vector{}); } -} // namespace device_normalization_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp index 4b846b159b..c97efbc901 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -49,7 +49,7 @@ ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp index d507452202..5e73b3d8b9 100644 --- 
a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -36,7 +36,7 @@ ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp index 9c73bf8486..93d3e27016 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp index db5e6cf5f5..38800ddde5 100644 --- 
a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -48,7 +48,7 @@ ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp index 85b85d0493..b821aeee0a 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp index 0d2be03e46..074d0cfdf7 
100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -48,7 +48,7 @@ ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp index 2e284cad0c..e803fb842d 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -20,7 +20,7 @@ ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp index 2cc2756b7e..4bf4139d28 100644 
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -36,7 +36,7 @@ ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp index 406c907391..a571655cdc 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp 
b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp index 5acc536834..9ad9a630bd 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp index 18c1973c86..4ee70702c0 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace 
tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp index 8fde2dd5be..8c5fa80e81 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1); ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp index 80a6c29447..d2b81c486d 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -20,7 +20,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); 
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp index f2192e7451..8d678e784a 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -49,7 +49,7 @@ ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp index b0e3f2bfab..010560586a 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -36,7 +36,7 @@ ADD_THREADWISE_INST_BY_ID(half_t, 
half_t, half_t, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp index ef82ed26fe..55c53dfd58 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp index fb8c9705bb..367cf9a65d 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -48,7 +48,7 @@ 
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp index 0d33ea290b..18fd08448c 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -24,7 +24,7 @@ ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp index ac7b3b9020..3d02f3cbe3 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -48,7 
+48,7 @@ ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp index 36f350fd39..fcf072a086 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -21,7 +21,7 @@ ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); // clang-format on // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp index 4f934c8cd7..85d7ce8b4c 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp @@ -6,7 +6,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim @@ -36,7 +36,7 @@ 
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); // clang-format on -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp index 21bb1d86a9..a7618e64d9 100644 --- a/profiler/include/profile_batched_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_impl.hpp @@ -10,7 +10,7 @@ #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/device_batched_gemm_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/host_tensor/device_memory.hpp" @@ -116,19 +116,21 @@ bool profile_batched_gemm_impl(int do_verification, b_device_buf.ToDevice(b_g_k_n.mData.data()); c_device_buf.ToDevice(c_g_m_n_device_result.mData.data()); - // add device op instances - const auto op_ptrs = ck::tensor_operation::device::device_batched_gemm_instance:: - get_device_batched_gemm_instances(); + using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm; - if(op_ptrs.size() <= 0) - { - throw std::runtime_error("wrong! 
no device GEMM instance found"); - } + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; std::string best_op_name; float best_ave_time = 0; diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp index 42ad355d84..b7dc979577 100644 --- a/profiler/include/profile_batched_gemm_reduce_impl.hpp +++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp @@ -19,7 +19,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; using F16 = ck::half_t; @@ -44,7 +44,7 @@ void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( std::vector&); -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -208,8 +208,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, b_device_buf.ToDevice(b_g_k_n.mData.data()); // add device GEMM instances - std::vector - gemm_ptrs; + std::vector gemm_ptrs; if constexpr(is_same::value && is_same::value && is_same::value) @@ -218,7 +217,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances( gemm_ptrs); } @@ -226,7 +225,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances( 
gemm_ptrs); } @@ -234,7 +233,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances( gemm_ptrs); } @@ -242,7 +241,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( gemm_ptrs); } diff --git a/profiler/include/profile_conv_bwd_weight_impl.hpp b/profiler/include/profile_conv_bwd_weight_impl.hpp index 9432b09c9a..9820d978fd 100644 --- a/profiler/include/profile_conv_bwd_weight_impl.hpp +++ b/profiler/include/profile_conv_bwd_weight_impl.hpp @@ -18,7 +18,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_weight_instance { +namespace instance { using DeviceConvBwdWeightNoOpPtr = DeviceConvBwdWeightPtr&); -} // namespace device_conv2d_bwd_weight_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -165,14 +165,14 @@ bool profile_conv_bwd_weight_impl(int do_verification, ck::is_same_v, float> && ck::is_same_v, float>) { - ck::tensor_operation::device::device_conv2d_bwd_weight_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); } else if constexpr(ck::is_same_v, ck::half_t> && ck::is_same_v, ck::half_t> && ck::is_same_v, ck::half_t>) { - ck::tensor_operation::device::device_conv2d_bwd_weight_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); } diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp 
b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp index 47f187d843..69bfe50a70 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp @@ -17,7 +17,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_bias_activation_add_instance { +namespace instance { using DeviceConvFwdBiasReluAddPtr = DeviceConvFwdBiasActivationAddPtr&); -} // namespace device_conv2d_fwd_bias_activation_add_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -179,7 +179,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ck::is_same_v, ck::half_t> && ck::is_same_v, ck::half_t>) { - ck::tensor_operation::device::device_conv2d_fwd_bias_activation_add_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances(op_ptrs); } diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp index 29b9fbded6..166173ca89 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp @@ -17,7 +17,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_fwd_bias_activation_instance { +namespace instance { using DeviceConvFwdBiasReluPtr = DeviceConvFwdBiasActivationPtr&); -} // namespace device_conv2d_fwd_bias_activation_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -169,7 +169,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ck::is_same_v, ck::half_t> && ck::is_same_v, ck::half_t>) { - ck::tensor_operation::device::device_conv2d_fwd_bias_activation_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(op_ptrs); } diff --git 
a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index ce3642ac51..676e619b49 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -22,7 +22,7 @@ using INT8 = int8_t; namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using DeviceConvBwdDataNoOpPtr = DeviceConvBwdDataPtr&); void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances( std::vector&); -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck namespace ck { namespace profiler { -using DeviceConvBwdDataNoOpPtr = - ck::tensor_operation::device::device_conv2d_bwd_data_instance::DeviceConvBwdDataNoOpPtr; +using DeviceConvBwdDataNoOpPtr = ck::tensor_operation::device::instance::DeviceConvBwdDataNoOpPtr; template HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector& dims, @@ -144,15 +143,15 @@ void get_device_conv_bwd_data_op_ptr( switch(num_dim_spatial) { case 1: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs); break; case 2: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); break; case 3: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs); break; default: break; @@ -165,15 +164,15 @@ void get_device_conv_bwd_data_op_ptr( switch(num_dim_spatial) { case 1: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: 
add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs); break; case 2: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); break; case 3: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs); break; default: break; @@ -186,15 +185,15 @@ void get_device_conv_bwd_data_op_ptr( switch(num_dim_spatial) { case 1: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs); break; case 2: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); break; case 3: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs); break; default: break; @@ -207,15 +206,15 @@ void get_device_conv_bwd_data_op_ptr( switch(num_dim_spatial) { case 1: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs); break; case 2: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); break; case 3: - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs); break; default: break; diff --git a/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp b/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp 
index a39d55acae..849b6f3ea2 100644 --- a/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp +++ b/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp @@ -10,13 +10,12 @@ #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/host_tensor/host_conv.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" namespace ck { @@ -30,9 +29,7 @@ template + typename DELayout> // assume Ds and E have same layout bool profile_gemm_add_add_fastgelu_impl(int do_verification, int init_method, bool /*do_log*/, @@ -62,10 +59,10 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); - Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{})); - Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{})); - Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); - Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, DELayout{})); + Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, DELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{})); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; @@ -100,19 +97,21 @@ bool 
profile_gemm_add_add_fastgelu_impl(int do_verification, const auto b_element_op = BElementOp{}; const auto cde_element_op = CDEElementOp{}; - // add device op instances - const auto op_ptrs = ck::tensor_operation::device::device_gemm_instance:: - get_device_gemm_add_add_fastgelu_instances(); + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + DELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddAddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); std::cout << "found " << op_ptrs.size() << " instances" << std::endl; diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp index db19c8a4b8..b9920ccc9e 100644 --- a/profiler/include/profile_gemm_bias_2d_impl.hpp +++ b/profiler/include/profile_gemm_bias_2d_impl.hpp @@ -17,7 +17,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using DeviceGemmAlphaBetaPtr = ck::tensor_operation::device::DeviceGemmBiasPtr< ck::tensor_operation::element_wise::PassThrough, @@ -48,7 +48,7 @@ void add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instances( void add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instances( std::vector&); -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -159,8 +159,7 @@ void profile_gemm_bias_2d_impl(int do_verification, c_device_buf.ToDevice(c_m_n_device_result.mData.data()); // add device GEMM instances - std::vector - gemm_ptrs; + std::vector gemm_ptrs; if constexpr(is_same::value && is_same::value && is_same::value) @@ -169,28 +168,28 @@ void profile_gemm_bias_2d_impl(int 
do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_km_nk_mn_instances(gemm_ptrs); } } @@ -201,28 +200,28 @@ void profile_gemm_bias_2d_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_mk_nk_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: 
add_device_gemm_xdl_c_shuffle_bias_2d_f32_f32_f32_km_nk_mn_instances(gemm_ptrs); } } diff --git a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp b/profiler/include/profile_gemm_bias_add_reduce_impl.hpp index aeb5934d27..34317c59a7 100644 --- a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp +++ b/profiler/include/profile_gemm_bias_add_reduce_impl.hpp @@ -19,7 +19,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; using F16 = ck::half_t; @@ -45,7 +45,7 @@ void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( std::vector&); -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -236,8 +236,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, d0_device_buf.ToDevice(d0_m_n.mData.data()); // add device GEMM instances - std::vector - gemm_ptrs; + std::vector gemm_ptrs; if constexpr(is_same::value && is_same::value && is_same::value) @@ -246,7 +245,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( gemm_ptrs); } @@ -254,7 +253,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( gemm_ptrs); } @@ -262,7 +261,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, is_same::value && is_same::value) { - 
ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( gemm_ptrs); } @@ -270,7 +269,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( gemm_ptrs); } diff --git a/profiler/include/profile_gemm_bias_relu_add_impl.hpp b/profiler/include/profile_gemm_bias_relu_add_impl.hpp index 4015bec01c..0b4183305f 100644 --- a/profiler/include/profile_gemm_bias_relu_add_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_add_impl.hpp @@ -18,7 +18,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using DeviceGemmBiasReluAddPtr = ck::tensor_operation::device::DeviceGemmBiasActivationAddPtr< ck::tensor_operation::element_wise::PassThrough, @@ -34,7 +34,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instances( void add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instances( std::vector&); -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -158,8 +158,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification, c1_m_n_device_buf.ToDevice(c1_m_n.mData.data()); // add device GEMM instances - std::vector - gemm_ptrs; + std::vector gemm_ptrs; if constexpr(is_same::value && is_same::value && is_same::value) @@ -168,7 +167,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_kn_mn_instances( gemm_ptrs); 
} @@ -176,7 +175,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_mk_nk_mn_instances( gemm_ptrs); } @@ -184,7 +183,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_kn_mn_instances( gemm_ptrs); } @@ -192,7 +191,7 @@ void profile_gemm_bias_relu_add_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instances( gemm_ptrs); } diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp index 7cb280e131..cc51ebcc47 100644 --- a/profiler/include/profile_gemm_bias_relu_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_impl.hpp @@ -18,7 +18,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using DeviceGemmBiasReluPtr = ck::tensor_operation::device::DeviceGemmBiasActivationPtr< ck::tensor_operation::element_wise::PassThrough, @@ -34,7 +34,7 @@ void add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instances( void add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instances( std::vector&); -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -144,8 +144,7 @@ void profile_gemm_bias_relu_impl(int do_verification, c0_n_device_buf.ToDevice(c0_n.mData.data()); // add device GEMM instances - std::vector - gemm_ptrs; + std::vector gemm_ptrs; if constexpr(is_same::value && 
is_same::value && is_same::value) @@ -154,28 +153,28 @@ void profile_gemm_bias_relu_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instances(gemm_ptrs); } } diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index 2122010c7f..54b9e05c06 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -12,7 +12,7 @@ #include "ck/tensor_operation/gpu/device/device_gemm.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/device_gemm_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/host_tensor/device_memory.hpp" @@ -94,14 +94,21 @@ int profile_gemm_impl(int do_verification, b_device_buf.ToDevice(b_k_n.mData.data()); c_device_buf.ToDevice(c_m_n_device_result.mData.data()); - // add device op instances - const auto op_ptrs = ck::tensor_operation::device::device_gemm_instance:: - get_device_gemm_instances(); + 
using DeviceOp = ck::tensor_operation::device::DeviceGemm; - if(op_ptrs.size() <= 0) - { - throw std::runtime_error("wrong! no device GEMM instance found"); - } + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; // Run reference GEMM if(do_verification) @@ -141,9 +148,9 @@ int profile_gemm_impl(int do_verification, StrideA, StrideB, StrideC, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}); + a_element_op, + b_element_op, + c_element_op); auto invoker_ptr = op_ptr->MakeInvokerPointer(); diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index 05695ae640..0f891a7aee 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -19,7 +19,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_gemm_instance { +namespace instance { using F32 = float; using F16 = ck::half_t; @@ -45,7 +45,7 @@ void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances( void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances( std::vector&); -} // namespace device_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -204,8 +204,7 @@ bool profile_gemm_reduce_impl(int do_verification, b_device_buf.ToDevice(b_k_n.mData.data()); // add device GEMM instances - std::vector - gemm_ptrs; + std::vector gemm_ptrs; if constexpr(is_same::value && is_same::value && is_same::value) @@ -214,7 +213,7 @@ bool profile_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: 
add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances( gemm_ptrs); } @@ -222,7 +221,7 @@ bool profile_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances( gemm_ptrs); } @@ -230,7 +229,7 @@ bool profile_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances( gemm_ptrs); } @@ -238,7 +237,7 @@ bool profile_gemm_reduce_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances( gemm_ptrs); } diff --git a/profiler/include/profile_gemm_splitk_impl.hpp b/profiler/include/profile_gemm_splitk_impl.hpp index 608c53af45..8be879dcbe 100644 --- a/profiler/include/profile_gemm_splitk_impl.hpp +++ b/profiler/include/profile_gemm_splitk_impl.hpp @@ -12,7 +12,7 @@ #include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/device_gemm_splitk_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/host_tensor/device_memory.hpp" @@ -95,20 +95,21 @@ bool profile_gemm_splitk_impl(int do_verification, b_device_buf.ToDevice(b_k_n.mData.data()); c_device_buf.ToDevice(c_m_n_device_result.mData.data()); - // add device op instances - const auto op_ptrs = - ck::tensor_operation::device::device_gemm_instance::get_device_gemm_splitk_instances< - ADataType, - BDataType, - CDataType, - ALayout, - BLayout, - 
CLayout>(); + using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK; - if(op_ptrs.size() <= 0) - { - throw std::runtime_error("wrong! no device operation instance found"); - } + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; // Run reference GEMM if(do_verification) diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index 92f45eccee..6a92b3824c 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -20,7 +20,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_grouped_gemm_instance { +namespace instance { using DeviceGroupedGemmNoOpPtr = ck::tensor_operation::device::DeviceGroupedGemmPtr< ck::tensor_operation::element_wise::PassThrough, @@ -36,7 +36,7 @@ void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances( void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances( std::vector&); -} // namespace device_grouped_gemm_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -171,9 +171,7 @@ void profile_grouped_gemm_impl(int do_verification, } // add device GEMM instances - std::vector< - ck::tensor_operation::device::device_grouped_gemm_instance::DeviceGroupedGemmNoOpPtr> - gemm_ptrs; + std::vector gemm_ptrs; if constexpr(is_same::value && is_same::value && is_same::value) @@ -182,28 +180,28 @@ void profile_grouped_gemm_impl(int do_verification, is_same::value && is_same::value) { - ck::tensor_operation::device::device_grouped_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - 
ck::tensor_operation::device::device_grouped_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_grouped_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(gemm_ptrs); } else if constexpr(is_same::value && is_same::value && is_same::value) { - ck::tensor_operation::device::device_grouped_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(gemm_ptrs); } } diff --git a/profiler/include/profile_normalization_impl.hpp b/profiler/include/profile_normalization_impl.hpp index f7ecea43d5..6e864698c1 100644 --- a/profiler/include/profile_normalization_impl.hpp +++ b/profiler/include/profile_normalization_impl.hpp @@ -18,7 +18,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_normalization_instance { +namespace instance { void add_device_softmax_f16_f16_rank3_instances(std::vector&); void add_device_softmax_f16_f16_rank4_instances(std::vector&); @@ -26,7 +26,7 @@ void add_device_softmax_f16_f16_rank4_instances(std::vector&); void add_device_softmax_f32_f32_rank4_instances(std::vector&); -} // namespace device_normalization_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -109,23 +109,23 @@ void profile_normalization_impl(int do_verification, is_same::value) { if(in_length.size() == 3) - tensor_operation::device::device_normalization_instance:: - add_device_softmax_f16_f16_rank3_instances(instances); + tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances( + instances); if(in_length.size() == 4) - tensor_operation::device::device_normalization_instance:: - add_device_softmax_f16_f16_rank4_instances(instances); + 
tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances( + instances); } else if constexpr(is_same::value && is_same::value && is_same::value) { if(in_length.size() == 3) - tensor_operation::device::device_normalization_instance:: - add_device_softmax_f32_f32_rank3_instances(instances); + tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances( + instances); if(in_length.size() == 4) - tensor_operation::device::device_normalization_instance:: - add_device_softmax_f32_f32_rank4_instances(instances); + tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances( + instances); } } diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 71232c3875..a88b4bcd07 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -16,7 +16,7 @@ namespace ck { namespace tensor_operation { namespace device { -namespace device_reduce_instance { +namespace instance { template struct ReduceDescription @@ -91,7 +91,7 @@ bool description_match(const DescriptionType& description, return (result); }; -} // namespace device_reduce_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -142,7 +142,7 @@ bool profile_reduce_impl_impl(bool do_verification, float beta) { using namespace ck::tensor_operation::device; - using namespace ck::tensor_operation::device::device_reduce_instance; + using namespace ck::tensor_operation::device::instance; using ck::host_common::dumpBufferToFile; constexpr bool op_support_indices = @@ -464,7 +464,7 @@ bool profile_reduce_impl(bool do_verification, bool pass = true; using tuple_of_description_instances = - tensor_operation::device::device_reduce_instance::reduce_description_instances; + tensor_operation::device::instance::reduce_description_instances; const auto tuple_object = tuple_of_description_instances{}; diff --git 
a/profiler/src/profile_gemm_add_add_fastgelu.cpp b/profiler/src/profile_gemm_add_add_fastgelu.cpp index c4c770c293..84bcc07c7e 100644 --- a/profiler/src/profile_gemm_add_add_fastgelu.cpp +++ b/profiler/src/profile_gemm_add_add_fastgelu.cpp @@ -75,9 +75,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[]) auto e_type, auto a_layout, auto b_layout, - auto d0_layout, - auto d1_layout, - auto e_layout) { + auto de_layout) { using ADataType = decltype(a_type); using BDataType = decltype(b_type); using AccDataType = decltype(acc_type); @@ -87,15 +85,13 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[]) using ALayout = decltype(a_layout); using BLayout = decltype(b_layout); - using D0Layout = decltype(d0_layout); - using D1Layout = decltype(d1_layout); - using ELayout = decltype(e_layout); + using DELayout = decltype(de_layout); const int DefaultStrideA = ck::is_same_v ? K : M; const int DefaultStrideB = ck::is_same_v ? N : K; - const int DefaultStrideD0 = ck::is_same_v ? N : M; - const int DefaultStrideD1 = ck::is_same_v ? N : M; - const int DefaultStrideE = ck::is_same_v ? N : M; + const int DefaultStrideD0 = ck::is_same_v ? N : M; + const int DefaultStrideD1 = ck::is_same_v ? N : M; + const int DefaultStrideE = ck::is_same_v ? 
N : M; bool pass = ck::profiler::profile_gemm_add_add_fastgelu_impl( + DELayout>( do_verification, init_method, do_log, @@ -126,22 +120,22 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[]) if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN) { - return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{}); + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}); } else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_NK_MN_MN_MN) { - return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{}); + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}); } else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::KM_KN_MN_MN_MN) { - return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}, Row{}); + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}); } else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::KM_NK_MN_MN_MN) { - return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}, Row{}); + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}); } else { diff --git a/script/docker-rocm4.1.sh b/script/docker-rocm4.1.sh deleted file mode 100755 index 61cc33c5b8..0000000000 --- a/script/docker-rocm4.1.sh +++ /dev/null @@ -1,14 +0,0 @@ -WORKSPACE=$1 -echo "workspace: " $WORKSPACE - -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v $WORKSPACE:/root/workspace \ -rocm/tensorflow:rocm4.1-tf1.15-dev \ -/bin/bash - -#--network host \ diff --git a/script/docker-rocm4.3.1.sh b/script/docker-rocm4.3.1.sh deleted file mode 100755 index 48cb675b69..0000000000 --- a/script/docker-rocm4.3.1.sh +++ /dev/null @@ -1,14 +0,0 @@ -WORKSPACE=$1 -echo "workspace: " $WORKSPACE - -docker run \ --it \ 
---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v $WORKSPACE:/root/workspace \ -rocm/tensorflow:rocm4.3.1-tf2.6-dev \ -/bin/bash - -#--network host \ diff --git a/test/conv2d_bwd_data/conv2d_bwd_data.cpp b/test/conv2d_bwd_data/conv2d_bwd_data.cpp index cbb5a88c86..cb9245387a 100644 --- a/test/conv2d_bwd_data/conv2d_bwd_data.cpp +++ b/test/conv2d_bwd_data/conv2d_bwd_data.cpp @@ -20,7 +20,7 @@ using INT8 = int8_t; namespace ck { namespace tensor_operation { namespace device { -namespace device_conv2d_bwd_data_instance { +namespace instance { using DeviceConvBwdDataNoOpPtr = DeviceConvBwdDataPtr&); -} // namespace device_conv2d_bwd_data_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -220,28 +220,28 @@ int main(int argc, char* argv[]) ck::is_same_v, float> && ck::is_same_v, float>) { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); } else if constexpr(ck::is_same_v, ck::half_t> && ck::is_same_v, ck::half_t> && ck::is_same_v, ck::half_t>) { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); } else if constexpr(ck::is_same_v, ck::bhalf_t> && ck::is_same_v, ck::bhalf_t> && ck::is_same_v, ck::bhalf_t>) { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); } else if constexpr(ck::is_same_v, int8_t> && ck::is_same_v, int8_t> && ck::is_same_v, int8_t>) { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: + ck::tensor_operation::device::instance:: add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); } diff --git a/test/convnd_fwd/conv_util.hpp b/test/convnd_fwd/conv_util.hpp 
index d04a509257..c698bbd05c 100644 --- a/test/convnd_fwd/conv_util.hpp +++ b/test/convnd_fwd/conv_util.hpp @@ -19,14 +19,14 @@ namespace device { using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr; -namespace device_conv2d_fwd_instance { +namespace instance { void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector&); void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector&); void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector&); void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector&); -} // namespace device_conv2d_fwd_instance +} // namespace instance } // namespace device } // namespace tensor_operation } // namespace ck @@ -118,7 +118,7 @@ struct ConvolutionNDFwdInstances std::vector conv_ptrs; if(num_dim_spatial == 2) { - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); } return conv_ptrs; @@ -133,7 +133,7 @@ struct ConvolutionNDFwdInstances std::vector conv_ptrs; if(num_dim_spatial == 2) { - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); } return conv_ptrs; @@ -148,7 +148,7 @@ struct ConvolutionNDFwdInstances std::vector conv_ptrs; if(num_dim_spatial == 2) { - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); } return conv_ptrs; @@ -163,7 +163,7 @@ struct ConvolutionNDFwdInstances std::vector conv_ptrs; if(num_dim_spatial == 2) { - ck::tensor_operation::device::device_conv2d_fwd_instance:: + ck::tensor_operation::device::instance:: add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); } return conv_ptrs; diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt index b8679e3715..83b3c1e2e3 
100644 --- a/test/gemm/CMakeLists.txt +++ b/test/gemm/CMakeLists.txt @@ -1,29 +1,15 @@ -# GEMM XDL -add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp) -target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance) +add_test_executable(test_gemm_fp32 gemm_fp32.cpp) +target_link_libraries(test_gemm_fp32 PRIVATE host_tensor) +target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) -add_test_executable(test_gemm_xdl_fp16 gemm_xdl_fp16.cpp) -target_link_libraries(test_gemm_xdl_fp16 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_fp16 PRIVATE device_gemm_instance) +add_test_executable(test_gemm_fp16 gemm_fp16.cpp) +target_link_libraries(test_gemm_fp16 PRIVATE host_tensor) +target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance) -add_test_executable(test_gemm_xdl_bf16 gemm_xdl_bf16.cpp) -target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance) +add_test_executable(test_gemm_bf16 gemm_bf16.cpp) +target_link_libraries(test_gemm_bf16 PRIVATE host_tensor) +target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance) -add_test_executable(test_gemm_xdl_int8 gemm_xdl_int8.cpp) -target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance) - -# GEMM DL -add_test_executable(test_gemm_dl_fp32 gemm_dl_fp32.cpp) -target_link_libraries(test_gemm_dl_fp32 PRIVATE host_tensor) -target_link_libraries(test_gemm_dl_fp32 PRIVATE device_gemm_instance) - -add_test_executable(test_gemm_dl_fp16 gemm_dl_fp16.cpp) -target_link_libraries(test_gemm_dl_fp16 PRIVATE host_tensor) -target_link_libraries(test_gemm_dl_fp16 PRIVATE device_gemm_instance) - -add_test_executable(test_gemm_dl_int8 gemm_dl_int8.cpp) -target_link_libraries(test_gemm_dl_int8 PRIVATE host_tensor) -TArget_link_libraries(test_gemm_dl_int8 PRIVATE device_gemm_instance) 
+add_test_executable(test_gemm_int8 gemm_int8.cpp) +target_link_libraries(test_gemm_int8 PRIVATE host_tensor) +target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance) diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp new file mode 100644 index 0000000000..d7ecc892dc --- /dev/null +++ b/test/gemm/gemm_bf16.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/host_tensor/device_memory.hpp" +#include "ck/library/host_tensor/host_tensor.hpp" +#include "ck/library/host_tensor/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +int main() +{ + using ADataType = ck::bhalf_t; + using BDataType = ck::bhalf_t; + using CDataType = ck::bhalf_t; + using AccDataType = float; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + auto test = [&](auto a_layout, auto b_layout, auto c_layout) { + bool pass = true; + + using DeviceOp = ck::tensor_operation::device::DeviceGemm; + + const auto gemmPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(auto& gemmPtr : gemmPtrs) + { + pass &= ck::gemm_util::TestGemm, + ADataType, + BDataType, + CDataType, + AccDataType, + decltype(a_layout), + decltype(b_layout), + decltype(c_layout), + 
PassThrough, + PassThrough, + PassThrough>{}(gemmPtr); + } + + return pass; + }; + + bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && + test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); + + std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/gemm/gemm_dl_fp16.cpp b/test/gemm/gemm_dl_fp16.cpp deleted file mode 100644 index b4f6fea449..0000000000 --- a/test/gemm/gemm_dl_fp16.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { - -void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(std::vector&); -void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(std::vector&); -void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(std::vector&); -void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(std::vector&); - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -int main() -{ - using ADataType = ck::half_t; - using BDataType = ck::half_t; - using CDataType = ck::half_t; - 
using AccDataType = float; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - bool res = true; - - std::vector gemmPtrs; - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm/gemm_dl_fp32.cpp b/test/gemm/gemm_dl_fp32.cpp deleted file mode 100644 index 3ec88ec737..0000000000 --- a/test/gemm/gemm_dl_fp32.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { - -void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(std::vector&); -void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(std::vector&); -void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(std::vector&); -void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(std::vector&); - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -int main() -{ - using ADataType = float; - using BDataType = float; - using CDataType = float; - using AccDataType = float; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - bool res = true; - std::vector gemmPtrs; - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - 
ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm/gemm_dl_int8.cpp b/test/gemm/gemm_dl_int8.cpp deleted file mode 100644 index 105fb07733..0000000000 --- a/test/gemm/gemm_dl_int8.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { - -void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(std::vector&); -void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(std::vector&); -void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(std::vector&); -void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(std::vector&); - -} // namespace 
device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -int main() -{ - using ADataType = int8_t; - using BDataType = int8_t; - using CDataType = int8_t; - using AccDataType = int; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - bool res = true; - std::vector gemmPtrs; - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp new file mode 100644 index 0000000000..ea9864abeb --- /dev/null +++ b/test/gemm/gemm_fp16.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/host_tensor/device_memory.hpp" +#include "ck/library/host_tensor/host_tensor.hpp" +#include "ck/library/host_tensor/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +int main() +{ + using ADataType = ck::half_t; + using BDataType = ck::half_t; + using CDataType = ck::half_t; + using AccDataType = float; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + auto test = [&](auto a_layout, auto b_layout, auto c_layout) { + bool pass = true; + + using DeviceOp = ck::tensor_operation::device::DeviceGemm; + + const auto gemmPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(auto& gemmPtr : gemmPtrs) + { + pass &= ck::gemm_util::TestGemm, + ADataType, + BDataType, + CDataType, + AccDataType, + decltype(a_layout), + decltype(b_layout), + decltype(c_layout), + PassThrough, + PassThrough, + PassThrough>{}(gemmPtr); + } + + return pass; + }; + + bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && + test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); + + std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 
0 : 1; +} diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp new file mode 100644 index 0000000000..b66addd712 --- /dev/null +++ b/test/gemm/gemm_fp32.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/host_tensor/device_memory.hpp" +#include "ck/library/host_tensor/host_tensor.hpp" +#include "ck/library/host_tensor/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +int main() +{ + using ADataType = float; + using BDataType = float; + using CDataType = float; + using AccDataType = float; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + auto test = [&](auto a_layout, auto b_layout, auto c_layout) { + bool pass = true; + + using DeviceOp = ck::tensor_operation::device::DeviceGemm; + + const auto gemmPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(auto& gemmPtr : gemmPtrs) + { + pass &= ck::gemm_util::TestGemm, + ADataType, + BDataType, + CDataType, + AccDataType, + decltype(a_layout), + decltype(b_layout), + decltype(c_layout), + PassThrough, + PassThrough, + PassThrough>{}(gemmPtr); + } + + return pass; + }; + + bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && + test(Col{}, Row{}, Row{}) && 
test(Col{}, Col{}, Row{}); + + std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp new file mode 100644 index 0000000000..e0b9cab370 --- /dev/null +++ b/test/gemm/gemm_fp64.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/host_tensor/device_memory.hpp" +#include "ck/library/host_tensor/host_tensor.hpp" +#include "ck/library/host_tensor/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +int main() +{ + using ADataType = double; + using BDataType = double; + using CDataType = double; + using AccDataType = double; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + auto test = [&](auto a_layout, auto b_layout, auto c_layout) { + bool pass = true; + + using DeviceOp = ck::tensor_operation::device::DeviceGemm; + + const auto gemmPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(auto& gemmPtr : gemmPtrs) + { + pass &= ck::gemm_util::TestGemm, + ADataType, + BDataType, + CDataType, + AccDataType, + decltype(a_layout), + decltype(b_layout), + decltype(c_layout), + PassThrough, + PassThrough, + PassThrough>{}(gemmPtr); + } + + 
return pass; + }; + + bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && + test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); + + std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp new file mode 100644 index 0000000000..972f407975 --- /dev/null +++ b/test/gemm/gemm_int8.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/host_tensor/device_memory.hpp" +#include "ck/library/host_tensor/host_tensor.hpp" +#include "ck/library/host_tensor/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +int main() +{ + using ADataType = int8_t; + using BDataType = int8_t; + using CDataType = int8_t; + using AccDataType = int32_t; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + auto test = [&](auto a_layout, auto b_layout, auto c_layout) { + bool pass = true; + + using DeviceOp = ck::tensor_operation::device::DeviceGemm; + + const auto gemmPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(auto& gemmPtr : gemmPtrs) + { + pass &= ck::gemm_util::TestGemm, + ADataType, + BDataType, + CDataType, + AccDataType, + 
decltype(a_layout), + decltype(b_layout), + decltype(c_layout), + PassThrough, + PassThrough, + PassThrough>{}(gemmPtr); + } + + return pass; + }; + + bool pass = test(Row{}, Row{}, Row{}) && test(Row{}, Col{}, Row{}) && + test(Col{}, Row{}, Row{}) && test(Col{}, Col{}, Row{}); + + std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 7af3799e7e..4528c4aaef 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -159,7 +159,7 @@ struct TestGemm return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result); } - auto operator()(DeviceGemmPtr_& gemmPtr) + auto operator()(const DeviceGemmPtr_& gemmPtr) { std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name << ", CLayout = " << CLayout{}.name << std::endl; diff --git a/test/gemm/gemm_xdl_bf16.cpp b/test/gemm/gemm_xdl_bf16.cpp deleted file mode 100644 index 415141c2cc..0000000000 --- a/test/gemm/gemm_xdl_bf16.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( - std::vector&); -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( - std::vector&); -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( - std::vector&); -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -int main() -{ - using ADataType = ck::bhalf_t; - using BDataType = ck::bhalf_t; - using CDataType = ck::bhalf_t; - using AccDataType = float; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - bool res = true; - std::vector gemmPtrs; - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - 
add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm/gemm_xdl_fp16.cpp b/test/gemm/gemm_xdl_fp16.cpp deleted file mode 100644 index fac4d346df..0000000000 --- a/test/gemm/gemm_xdl_fp16.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { -void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(std::vector&); - -#if 0 -void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(std::vector&); -#endif - -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(std::vector&); - -void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( - std::vector&); -} // namespace device_gemm_instance 
-} // namespace device -} // namespace tensor_operation -} // namespace ck - -int main() -{ - using ADataType = ck::half_t; - using BDataType = ck::half_t; - using CDataType = ck::half_t; - using AccDataType = float; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - bool res = true; - std::vector gemmPtrs; - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(gemmPtrs); -#if 0 - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(gemmPtrs); -#endif - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(gemmPtrs); -#if 0 - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(gemmPtrs); -#endif - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(gemmPtrs); -#if 0 - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(gemmPtrs); -#endif - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(gemmPtrs); -#if 0 - 
ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(gemmPtrs); -#endif - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(gemmPtrs); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm/gemm_xdl_fp32.cpp b/test/gemm/gemm_xdl_fp32.cpp deleted file mode 100644 index 0a83782629..0000000000 --- a/test/gemm/gemm_xdl_fp32.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { -void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(std::vector&); -void 
add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(std::vector&); - -#if 0 -void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(std::vector&); -#endif - -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector&); - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -int main() -{ - using ADataType = float; - using BDataType = float; - using CDataType = float; - using AccDataType = float; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - bool res = true; - std::vector gemmPtrs; - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(gemmPtrs); -#if 0 - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemmPtrs); -#endif - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(gemmPtrs); -#if 0 - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(gemmPtrs); -#endif - 
ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(gemmPtrs); -#if 0 - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemmPtrs); -#endif - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs); -#if 0 - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemmPtrs); -#endif - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm/gemm_xdl_fp64.cpp b/test/gemm/gemm_xdl_fp64.cpp deleted file mode 100644 index 014396520b..0000000000 --- a/test/gemm/gemm_xdl_fp64.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { -void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(std::vector&); - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -inline std::string get_device_name() -{ - hipDeviceProp_t props{}; - int device; - auto status = hipGetDevice(&device); - if(status != hipSuccess) - { - return std::string(); - } - - status = hipGetDeviceProperties(&props, device); - if(status != hipSuccess) - { - return std::string(); - } - const std::string name(props.gcnArchName); - - return name; -} - -int main() -{ - if(get_device_name().find("gfx90a") == std::string::npos) - { - std::cout << "TestGemm ..... 
SUCCESS" << std::endl; - return 0; - } - using ADataType = double; - using BDataType = double; - using CDataType = double; - using AccDataType = double; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - bool res = true; - std::vector gemmPtrs; - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm/gemm_xdl_int8.cpp b/test/gemm/gemm_xdl_int8.cpp deleted file mode 100644 index 952ddb9721..0000000000 --- a/test/gemm/gemm_xdl_int8.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/device_gemm.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/host_tensor/device_memory.hpp" -#include "ck/library/host_tensor/host_tensor.hpp" -#include "ck/library/host_tensor/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" - -#include "test/gemm/gemm_util.hpp" - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using DeviceGemmNoOpPtr = - ck::tensor_operation::device::DeviceGemmPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(std::vector&); -void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(std::vector&); -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -int main() -{ - using ADataType = int8_t; - using BDataType = int8_t; - using CDataType = int8_t; - using AccDataType = int32_t; - - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; - - std::vector gemmPtrs; - bool res = true; - - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - 
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - gemmPtrs.clear(); - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(gemmPtrs); - - for(auto& gemmPtr : gemmPtrs) - { - res &= ck::gemm_util::TestGemm{}(gemmPtr); - } - - std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; - return res ? 0 : 1; -} diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp index ed732b09c3..fa06d76e36 100644 --- a/test/gemm_split_k/gemm_split_k.cpp +++ b/test/gemm_split_k/gemm_split_k.cpp @@ -11,6 +11,8 @@ #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp" + #include "ck/library/utility/check_err.hpp" #include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/host_tensor/host_tensor.hpp" @@ -27,30 +29,6 @@ enum struct GemmMatrixLayout KM_NK_MN, // 3 }; -using DeviceGemmSplitKNoOpPtr = ck::tensor_operation::device::DeviceGemmSplitKPtr< - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough>; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_gemm_instance { - -void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( - std::vector&); -void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( - std::vector&); -void 
add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances( - std::vector&); - -} // namespace device_gemm_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - template static bool check_out(const Tensor& ref, const Tensor& result) { @@ -82,6 +60,11 @@ struct gemmArgs int test_gemm(const gemmArgs& args) { + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + bool a_row_major, b_row_major, c_row_major; switch(args.layout) @@ -152,64 +135,79 @@ int test_gemm(const gemmArgs& args) b_device_buf.ToDevice(b_k_n.mData.data()); c_device_buf.ToDevice(c_m_n_device_result.mData.data()); - // add device GEMM instances - std::vector gemm_ptrs; + auto test = [&](auto a_layout, auto b_layout, auto c_layout) { + bool success = false; + + using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK; + + const auto gemm_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(auto& gemm_ptr : gemm_ptrs) + { + auto argument_ptr = + gemm_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + args.M, + args.N, + args.K, + args.StrideA, + args.StrideB, + args.StrideC, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + args.KBatch); + + auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); + + if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get()); + + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + if(!check_out(c_m_n_host_result, c_m_n_device_result)) + { + success = false; + break; + } + success = true; + } + } + + return success; + }; + + bool success = false; if(args.layout == 
GemmMatrixLayout::MK_KN_MN) { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemm_ptrs); + success = test(Row{}, Row{}, Row{}); } else if(args.layout == GemmMatrixLayout::MK_NK_MN) { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemm_ptrs); + success = test(Row{}, Col{}, Row{}); } else if(args.layout == GemmMatrixLayout::KM_KN_MN) { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemm_ptrs); + success = test(Col{}, Row{}, Row{}); } else { - ck::tensor_operation::device::device_gemm_instance:: - add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(gemm_ptrs); + success = test(Col{}, Col{}, Row{}); } - bool success = false; - for(auto& gemm_ptr : gemm_ptrs) - { - auto argument_ptr = - gemm_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), - static_cast(b_device_buf.GetDeviceBuffer()), - static_cast(c_device_buf.GetDeviceBuffer()), - args.M, - args.N, - args.K, - args.StrideA, - args.StrideB, - args.StrideC, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}, - args.KBatch); - - auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); - - if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) - { - invoker_ptr->Run(argument_ptr.get()); - - c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - - if(!check_out(c_m_n_host_result, c_m_n_device_result)) - { - success = false; - break; - } - success = true; - } - } auto error_code = 0; if(success) { diff --git a/test/grouped_gemm/grouped_gemm_fp16.cpp b/test/grouped_gemm/grouped_gemm_fp16.cpp index 4e8ebf6174..5418ee02bd 100644 --- a/test/grouped_gemm/grouped_gemm_fp16.cpp +++ b/test/grouped_gemm/grouped_gemm_fp16.cpp @@ -28,7 +28,7 @@ using DeviceGroupedGemmPtr_ = 
ck::tensor_operation::device::DeviceGroupedGemmPtr namespace ck { namespace tensor_operation { namespace device { -namespace device_grouped_gemm_instance { +namespace instance { void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( std::vector&); } @@ -197,7 +197,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr) int main() { std::vector groupedGemmPtrs; - ck::tensor_operation::device::device_grouped_gemm_instance:: + ck::tensor_operation::device::instance:: add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(groupedGemmPtrs); bool res = true;