mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 10:59:55 +00:00
Reorganize project folders (#6)
This commit is contained in:
2
client_example/01_gemm/CMakeLists.txt
Normal file
2
client_example/01_gemm/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
add_executable(client_gemm gemm.cpp)
|
||||
target_link_libraries(client_gemm PRIVATE composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
126
client_example/01_gemm/README.md
Normal file
126
client_example/01_gemm/README.md
Normal file
@@ -0,0 +1,126 @@
|
||||
[Back to supported operations](../../../include/ck/README.md)
|
||||
# Composable Kernel GEMM
|
||||
|
||||
## GEMM
|
||||
General matrix multiplication operation. In CK, the GEMM operation is called `DeviceGemm` and requires the following types as template parameters:
|
||||
|
||||
* **ALayout** - A matrix layout (RowMajor/ColumnMajor).
|
||||
* **BLayout** - B matrix layout (RowMajor/ColumnMajor).
|
||||
* **CLayout** - C matrix layout (RowMajor/ColumnMajor).
|
||||
* **ADataType** - A matrix data type.
|
||||
* **BDataType** - B matrix data type.
|
||||
* **CDataType** - C matrix data type.
|
||||
* **AElementwiseOperation** - Fused operation on tensor A before GEMM.
|
||||
* **BElementwiseOperation** - Fused operation on tensor B before GEMM.
|
||||
* **CElementwiseOperation** - Fused operation on tensor C after GEMM.
|
||||
|
||||
For matrices with a large K dimension, the `DeviceGemmSplitK` implementation is available. This implementation allows the user to split the K dimension between work groups. It uses the `AtomicAdd` operation on global memory, so the output buffer must be zeroed out beforehand for correct results.
|
||||
|
||||
For fused operations with additional tensors there are the `DeviceGemmMultipleABD` and `DeviceGemmMultipleD` operations, which require the following parameters:
|
||||
* **DsLayout** - layouts for additional tensors for fused operations.
|
||||
* **DsDataType** - data types for additional tensors for fused operations.
|
||||
|
||||
For `DeviceGemmMultipleABD`, the user should pass a tuple for each of **ALayout**, **BLayout**, **ADataType** and **BDataType**.
|
||||
|
||||
List of the device operations in CK:
|
||||
|
||||
* **DeviceGemmDl** - Device operation with DL instructions.
|
||||
* **DeviceGemmDpp** - Device operation with DL instructions that uses DPP instructions during data load.
|
||||
* **DeviceGemmWmma_CShuffle** - Device operation with WMMA instructions with CShuffle optimization for more optimized data store.
|
||||
* **DeviceGemm_Xdl_CShuffle_LdsDirectLoad** - Device operation with XDL instructions and CShuffle optimization for more optimized data store and direct load from global memory to shared memory.
|
||||
* **DeviceGemm_Xdl_CShuffle** - Device operation with XDL instructions with CShuffle optimization for more optimized data store.
|
||||
* **DeviceGemm_Xdl_CShuffleV2** - Device operation with XDL instructions with CShuffle optimization for more optimized data store. GEMM pipeline has been optimized compared to **DeviceGemm_Xdl_CShuffle**.
|
||||
* **DeviceGemmXdlSkipBLds** - Device operation with XDL instructions. Load to shared memory has been skipped for B matrix.
|
||||
* **DeviceGemm_Xdl_WaveletModel_CShuffle** - Device operation with XDL instructions with CShuffle optimization for more optimized data store. Producer and consumer scheme cooperation between waves in workgroup.
|
||||
* **DeviceGemmXdl** - Device operation with XDL instructions.
|
||||
|
||||
Table of supported cases by instance factory with XDL instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
|
||||
|
||||
| |Is supported|
|
||||
|-------|---|
|
||||
|bf16|✓|
|
||||
|fp16|✓|
|
||||
|fp32|✓|
|
||||
|int8|✓|
|
||||
|fp8 |✓|
|
||||
|
||||
Table of supported cases by instance factory with WMMA instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
|
||||
|
||||
| |Is supported|
|
||||
|-------|---|
|
||||
|bf16|✓|
|
||||
|fp16|✓|
|
||||
|fp32|✗|
|
||||
|int8|✓|
|
||||
|fp8 |✗|
|
||||
|
||||
Table of supported cases by instance factory with DL instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
|
||||
|
||||
| |Is supported|
|
||||
|-------|---|
|
||||
|bf16|✗|
|
||||
|fp16|✓|
|
||||
|fp32|✓|
|
||||
|int8|✓|
|
||||
|fp8 |✗|
|
||||
|
||||
Table of supported cases by instance factory with fused output elementwise operation:
|
||||
|
||||
* **B Matrix Multiply + Add + Gelu** - bf16 (int8 for B matrix)
|
||||
* **B Matrix Multiply + Add** - bf16 (int8 for B matrix)
|
||||
* **B Matrix Multiply + Gelu** - bf16 (int8 for B matrix)
|
||||
* **B Matrix Multiply** - bf16 (int8 for B matrix)
|
||||
|
||||
* **Add + Add + Gelu** - fp16
|
||||
* **Add + Gelu** - fp16, bf16 (int8 for B matrix) for Row/Column/Row
|
||||
* **Multiply** - fp16
|
||||
* **Add + Multiply** - fp16
|
||||
* **Add + Relu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
|
||||
* **Add + Silu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
|
||||
* **Add** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
|
||||
* **Bilinear** - fp16, int8
|
||||
* **Gelu** - fp16
|
||||
* **Multiply + Add** - fp16 for Row/Column/Row and Row/Row/Row, fp16 (int8 for B matrix, fp32 for Bias) for Row/Column/Row and Row/Row/Row
|
||||
* **Quantization** - int8
|
||||
|
||||
## GEMM V2 (Universal GEMM)
|
||||
General matrix multiplication operation optimized for the MI300 series. The operation is called `DeviceGemmV2` and requires the following types as template parameters:
|
||||
|
||||
* **ALayout** - A matrix layout (RowMajor/ColumnMajor).
|
||||
* **BLayout** - B matrix layout (RowMajor/ColumnMajor).
|
||||
* **CLayout** - C matrix layout (RowMajor/ColumnMajor).
|
||||
* **ADataType** - A matrix data type.
|
||||
* **BDataType** - B matrix data type.
|
||||
* **CDataType** - C matrix data type.
|
||||
* **AElementwiseOperation** - Fused operation on tensor A before GEMM.
|
||||
* **BElementwiseOperation** - Fused operation on tensor B before GEMM.
|
||||
* **CElementwiseOperation** - Fused operation on tensor C after GEMM.
|
||||
|
||||
This implementation allows user to split K dimension between work groups. This implementation requires AtomicAdd operation on global memory (output buffer must be set to zeroes if splitK parameter is larger than one).
|
||||
|
||||
List of the device operations in CK:
|
||||
|
||||
* **DeviceGemm_Xdl_CShuffleV3** - Device operation with XDL instructions with CShuffle optimization for more optimized data store.
|
||||
* **DeviceGemm_Xdl_CShuffleV3R1** - Device operation with XDL instructions with CShuffle optimization for more optimized data store. This implementation performs a reduction on the split K dimension after GEMM instead of using the AtomicAdd instruction.
|
||||
|
||||
Table of supported cases by instance factory with XDL instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
|
||||
|
||||
| |Is supported|
|
||||
|-------|---|
|
||||
|bf16|✓|
|
||||
|fp16|✓|
|
||||
|fp32|✗|
|
||||
|int8|✗|
|
||||
|fp8 (C bf16)|✓|
|
||||
|fp16 (A fp8)|✓|
|
||||
|fp16 (B fp8)|✓|
|
||||
|
||||
## Others
|
||||
|
||||
* **DeviceGemm_dequantB** - GEMM with dequantization (implemented with WMMA instructions).
|
||||
* **DeviceGemmMultipleD_ABScale** - GEMM with scale for A and B matrix.
|
||||
* **DeviceGemmMultipleDLayernorm** - GEMM fused with layernorm.
|
||||
* **DeviceGemmMultipleDMultipleR** - GEMM fused with reductions and custom global reductions operators.
|
||||
* **DeviceGemmReduce** - GEMM fused with reduction.
|
||||
* **DeviceGemm_Streamk_V2** - GEMM stream K implementation. This implementation allows using a reduction instead of AtomicAdd.
|
||||
* **DeviceGemmStreamK** - GEMM stream K implementation using AtomicAdd.
|
||||
219
client_example/01_gemm/gemm.cpp
Normal file
219
client_example/01_gemm/gemm.cpp
Normal file
@@ -0,0 +1,219 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CElementOp = PassThrough;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using CDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using CLayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideC = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 7)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideC = std::stoi(argv[6]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceGemm<ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto c_element_op = CElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
StrideC,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
c_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
StrideC,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
c_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
29
client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
Normal file
29
client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_custom_target(client_gemm_fastgelu_examples)
|
||||
|
||||
add_executable(client_gemm_add_add_fastgelu gemm_add_add_fastgelu.cpp)
|
||||
target_link_libraries(client_gemm_add_add_fastgelu PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_gemm_add_fastgelu gemm_add_fastgelu.cpp)
|
||||
target_link_libraries(client_gemm_add_fastgelu PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_gemm_fastgelu gemm_fastgelu.cpp)
|
||||
target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_dependencies(client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu
|
||||
client_gemm_fastgelu)
|
||||
|
||||
add_custom_target(client_gemm_fastgelu_generic_examples)
|
||||
|
||||
add_executable(client_gemm_add_add_fastgelu_generic gemm_add_add_fastgelu_generic.cpp)
|
||||
target_link_libraries(client_gemm_add_add_fastgelu_generic PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_gemm_add_fastgelu_generic gemm_add_fastgelu_generic.cpp)
|
||||
target_link_libraries(client_gemm_add_fastgelu_generic PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_gemm_fastgelu_generic gemm_fastgelu_generic.cpp)
|
||||
target_link_libraries(client_gemm_fastgelu_generic PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_dependencies(client_gemm_fastgelu_generic_examples client_gemm_add_add_fastgelu_generic
|
||||
client_gemm_add_fastgelu_generic client_gemm_fastgelu_generic)
|
||||
endif()
|
||||
@@ -0,0 +1,242 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AddAddFastGelu;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using D0DataType = F16;
|
||||
using D1DataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using D0Layout = Row;
|
||||
using D1Layout = Row;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideD0 = 0;
|
||||
ck::index_t StrideD1 = 4096;
|
||||
ck::index_t StrideE = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 9)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideD0 = std::stoi(argv[6]);
|
||||
StrideD1 = std::stoi(argv[7]);
|
||||
StrideE = std::stoi(argv[8]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
|
||||
f_matrix_space_size(M, N, StrideD0, D0Layout{}));
|
||||
SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
|
||||
f_matrix_space_size(M, N, StrideD1, D1Layout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<D0Layout, D1Layout>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<D0DataType, D1DataType>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::AddAddFastGelu>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
|
||||
d1_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 2>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
|
||||
d1_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 2>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,176 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AddAddFastGelu;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using D0DataType = F16;
|
||||
using D1DataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using D0Layout = Row;
|
||||
using D1Layout = Row;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideD0 = 0;
|
||||
ck::index_t StrideD1 = 4096;
|
||||
ck::index_t StrideE = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 9)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideD0 = std::stoi(argv[6]);
|
||||
StrideD1 = std::stoi(argv[7]);
|
||||
StrideE = std::stoi(argv[8]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
|
||||
f_matrix_space_size(M, N, StrideD0, D0Layout{}));
|
||||
SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
|
||||
f_matrix_space_size(M, N, StrideD1, D1Layout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<D0Layout, D1Layout>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<D0DataType, D1DataType>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::AddAddFastGelu>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
// get generic instance
|
||||
auto& op_ptr = op_ptrs[0];
|
||||
|
||||
std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
// run the generic instance
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
|
||||
d1_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 2>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Generic instance should be suitable for various input lengths/strides");
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
234
client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
Normal file
234
client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
Normal file
@@ -0,0 +1,234 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AddFastGelu;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using D0DataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using D0Layout = Row;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideD0 = 0;
|
||||
ck::index_t StrideE = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 8)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideD0 = std::stoi(argv[6]);
|
||||
StrideE = std::stoi(argv[7]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
|
||||
f_matrix_space_size(M, N, StrideD0, D0Layout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<D0Layout>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<D0DataType>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::AddFastGelu>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 1>{StrideD0},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 1>{StrideD0},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,169 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AddFastGelu;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using D0DataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using D0Layout = Row;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideD0 = 0;
|
||||
ck::index_t StrideE = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 8)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideD0 = std::stoi(argv[6]);
|
||||
StrideE = std::stoi(argv[7]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
|
||||
f_matrix_space_size(M, N, StrideD0, D0Layout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<D0Layout>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<D0DataType>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::AddFastGelu>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
// get generic instance
|
||||
auto& op_ptr = op_ptrs[0];
|
||||
|
||||
std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
// run the generic instance
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 1>{StrideD0},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Generic instance should be suitable for various input lengths/strides");
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
226
client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
Normal file
226
client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
Normal file
@@ -0,0 +1,226 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = FastGelu;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideE = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 7)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideE = std::stoi(argv[6]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::FastGelu>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,162 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = FastGelu;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideE = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 7)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideE = std::stoi(argv[6]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::FastGelu>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
// get generic instance
|
||||
auto& op_ptr = op_ptrs[0];
|
||||
|
||||
std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
// run the generic instance
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Generic instance should be suitable for various input lengths/strides");
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
7
client_example/03_gemm_layernorm/CMakeLists.txt
Normal file
7
client_example/03_gemm_layernorm/CMakeLists.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
# Both layernorm client examples are only built for gfx9 targets; each source
# file <name>.cpp produces the executable client_<name> with the same link set.
if(GPU_TARGETS MATCHES "gfx9")
    foreach(example_name gemm_add_add_layernorm_naive gemm_add_relu_add_layernorm_welford)
        add_executable(client_${example_name} ${example_name}.cpp)
        target_link_libraries(client_${example_name} PRIVATE composable_kernel::device_gemm_operations composable_kernel::device_other_operations)
    endforeach()
endif()
|
||||
@@ -0,0 +1,277 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using BiasDataType = F32;
|
||||
using CDataType = F16;
|
||||
using D0DataType = F16;
|
||||
using ReduceDataType = F32;
|
||||
using GammaDataType = F16;
|
||||
using BetaDataType = F16;
|
||||
using LayerNormOutDataType = F16;
|
||||
|
||||
using ALayout = ck::tensor_layout::gemm::RowMajor;
|
||||
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using CLayout = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
template <typename gemm_reduce_op_ptr>
|
||||
bool RunDeviceGemmMeanSquareMean(gemm_reduce_op_ptr& p_op,
|
||||
const void* p_a,
|
||||
const void* p_b,
|
||||
const void* p_bias,
|
||||
const void* p_d0,
|
||||
void* p_c,
|
||||
void* p_mean,
|
||||
void* p_square_mean,
|
||||
int M,
|
||||
int N,
|
||||
int K,
|
||||
int StrideA,
|
||||
int StrideB,
|
||||
int StrideC,
|
||||
int StrideD0,
|
||||
bool time_kernel)
|
||||
{
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
|
||||
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
|
||||
|
||||
auto passOp = PassThrough{};
|
||||
auto squareOp = UnarySquareElementOp{};
|
||||
auto divOp = UnaryDivElementOp{N};
|
||||
|
||||
auto argument_ptr =
|
||||
p_op->MakeArgumentPointer(p_a,
|
||||
p_b,
|
||||
p_bias,
|
||||
{p_d0},
|
||||
p_c,
|
||||
{p_mean, p_square_mean},
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
StrideC,
|
||||
{StrideD0},
|
||||
{&passOp, &passOp, &passOp}, // functor for a, b, c
|
||||
{&passOp}, // functor for d0
|
||||
{&passOp, &squareOp}, // functor for inputs of reduction
|
||||
{&divOp, &divOp}); // functor for outputs of reduction
|
||||
|
||||
if(p_op->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
auto invoker_ptr = p_op->MakeInvokerPointer();
|
||||
|
||||
// If we evaluate running time of gemm_reduce. The output may wrong.
|
||||
// Because we need to initialize the reduction tensor before runing the kernel.
|
||||
// However we run kernel many times for time_kernel = trie without reinitialize the out
|
||||
// of reduction tensor.
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
if(time_kernel)
|
||||
std::cout << "Gemm + reduce Perf: " << std::setw(10) << ave_time << " ms" << std::endl;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename normalize_op_ptr>
|
||||
bool RunDeviceNormalize2D(normalize_op_ptr& p_op,
|
||||
const void* p_x,
|
||||
const void* p_mean,
|
||||
const void* p_square_mean,
|
||||
const void* p_gamma,
|
||||
const void* p_beta,
|
||||
void* p_y,
|
||||
int M,
|
||||
int N,
|
||||
int StrideX,
|
||||
bool time_kernel)
|
||||
{
|
||||
std::array<const void*, 5> input = {p_x, p_mean, p_square_mean, p_gamma, p_beta};
|
||||
std::array<void*, 1> output = {p_y};
|
||||
auto normalize_functor = ck::tensor_operation::element_wise::Normalize{};
|
||||
|
||||
std::array<ck::index_t, 2> xyLengths = {M, N};
|
||||
std::array<ck::index_t, 2> xyStrides = {StrideX, 1};
|
||||
|
||||
auto argument_ptr = p_op->MakeArgumentPointer(xyLengths,
|
||||
{xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
|
||||
{xyStrides},
|
||||
input,
|
||||
output,
|
||||
ck::tensor_operation::element_wise::Normalize{});
|
||||
|
||||
if(p_op->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
auto invoker_ptr = p_op->MakeInvokerPointer();
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
if(time_kernel)
|
||||
std::cout << "Normalize Perf: " << std::setw(10) << ave_time << " ms" << std::endl;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
ck::index_t K = 1024;
|
||||
|
||||
ck::index_t StrideA = 1024;
|
||||
ck::index_t StrideB = 1024;
|
||||
ck::index_t StrideC = 1024;
|
||||
ck::index_t StrideD0 = 1024;
|
||||
|
||||
const auto gemm_reduce_ptrs =
|
||||
ck::tensor_operation::device::instance::get_device_gemm_add_add_mean_squaremean_instances<
|
||||
ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout>();
|
||||
|
||||
std::cout << "found " << gemm_reduce_ptrs.size()
|
||||
<< " gemm_reduceMean_reduceSquareMean instances" << std::endl;
|
||||
|
||||
using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
|
||||
ck::Tuple<LayerNormOutDataType>,
|
||||
ck::tensor_operation::element_wise::Normalize,
|
||||
2>;
|
||||
|
||||
const auto normalize_ptrs =
|
||||
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
NormalizeDeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem bias_device_buf(sizeof(BiasDataType) * N);
|
||||
SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
|
||||
SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
|
||||
f_matrix_space_size(M, N, StrideD0, CLayout{}));
|
||||
SimpleDeviceMem reduceMean_device_buf(sizeof(ReduceDataType) * M);
|
||||
SimpleDeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) * M);
|
||||
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
|
||||
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
|
||||
SimpleDeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) * M * N);
|
||||
|
||||
bool b_time_kernel = true;
|
||||
bool b_only_run_first_kernel = true;
|
||||
|
||||
// layernorm => (1) + (2)
|
||||
// (1). c = gemm(a, b), reduce_mean(c), reduce_square_mean(c)
|
||||
// (2). normalize(c, mean, square_mean, gamma, beta)
|
||||
for(auto& gemm_reduce_ptr : gemm_reduce_ptrs)
|
||||
{
|
||||
// run first available kernel
|
||||
if(RunDeviceGemmMeanSquareMean(gemm_reduce_ptr,
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
bias_device_buf.GetDeviceBuffer(),
|
||||
d0_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
reduceMean_device_buf.GetDeviceBuffer(),
|
||||
reduceMeanSquare_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
StrideC,
|
||||
StrideD0,
|
||||
b_time_kernel))
|
||||
{
|
||||
if(b_only_run_first_kernel)
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << gemm_reduce_ptr->GetTypeString() << " does not support this problem"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
for(auto& normalize_ptr : normalize_ptrs)
|
||||
{
|
||||
if(RunDeviceNormalize2D(normalize_ptr,
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
reduceMean_device_buf.GetDeviceBuffer(),
|
||||
reduceMeanSquare_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
layerNorm_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
StrideC,
|
||||
b_time_kernel))
|
||||
{
|
||||
if(b_only_run_first_kernel)
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << normalize_ptr->GetTypeString() << " does not support this problem"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,245 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
// DataType
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using D0DataType = F16;
|
||||
using D1DataType = F16;
|
||||
using GammaDataType = F16;
|
||||
using BetaDataType = F16;
|
||||
using HDataType = F16;
|
||||
|
||||
// Layout
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using D0Layout = Row;
|
||||
using D1Layout = Row;
|
||||
using HLayout = Row;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AddReluAdd;
|
||||
using HElementOp = PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size)
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
std::size_t mMemSize_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
ck::index_t K = 1024;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = K;
|
||||
ck::index_t StrideD0 = 0;
|
||||
ck::index_t StrideD1 = N;
|
||||
ck::index_t StrideH = N;
|
||||
|
||||
float epsilon = 1e-5;
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
|
||||
f_matrix_space_size(M, N, StrideD0, D0Layout{}));
|
||||
SimpleDeviceMem d1_device_buf(sizeof(D1DataType) *
|
||||
f_matrix_space_size(M, N, StrideD1, D1Layout{}));
|
||||
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
|
||||
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
|
||||
SimpleDeviceMem h_device_buf(sizeof(HDataType) * f_matrix_space_size(M, N, StrideH, HLayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleDLayernorm<
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<D0Layout, D1Layout>,
|
||||
HLayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<D0DataType, D1DataType>,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
HDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::AddReluAdd,
|
||||
ck::tensor_operation::element_wise::PassThrough>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
const auto h_element_op = HElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
h_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{StrideD0, StrideD1},
|
||||
StrideH,
|
||||
epsilon,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op,
|
||||
h_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace_dev(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
h_device_buf.SetZero();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
|
||||
(sizeof(D0DataType) + sizeof(D1DataType) + sizeof(HDataType)) * M * N +
|
||||
(sizeof(GammaDataType) + sizeof(BetaDataType)) * N;
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
h_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{StrideD0, StrideD1},
|
||||
StrideH,
|
||||
epsilon,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op,
|
||||
h_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace_dev(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
h_device_buf.SetZero();
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
16
client_example/04_contraction/CMakeLists.txt
Normal file
16
client_example/04_contraction/CMakeLists.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
# Contraction client examples. The targets below are only added when GPU_TARGETS
# matches the regular expression "gfx9".
# Every executable is built from a single source file and links the same three
# composable_kernel operation libraries.
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp)
|
||||
target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp)
|
||||
target_link_libraries(client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_contraction_scale_fp64 contraction_scale_fp64.cpp)
|
||||
target_link_libraries(client_contraction_scale_fp64 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp)
|
||||
target_link_libraries(client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
# NOTE(review): unlike the four targets above, this target name has no "client_" prefix.
add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp)
|
||||
target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
|
||||
endif()
|
||||
236
client_example/04_contraction/contraction_bilinear_fp32.cpp
Normal file
236
client_example/04_contraction/contraction_bilinear_fp32.cpp
Normal file
@@ -0,0 +1,236 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using F32 = float;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = Bilinear;
|
||||
|
||||
using ADataType = F32;
|
||||
using BDataType = F32;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = F32;
|
||||
using DDataType = F32;
|
||||
using DsDataType = ck::Tuple<DDataType>;
|
||||
using EDataType = F32;
|
||||
|
||||
static constexpr ck::index_t NumDimM = 2;
|
||||
static constexpr ck::index_t NumDimN = 2;
|
||||
static constexpr ck::index_t NumDimK = 2;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
|
||||
float alpha = 1.f;
|
||||
float beta = 1.f;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 25)
|
||||
{
|
||||
const ck::index_t M0 = std::stoi(argv[1]);
|
||||
const ck::index_t M1 = std::stoi(argv[2]);
|
||||
|
||||
const ck::index_t N0 = std::stoi(argv[3]);
|
||||
const ck::index_t N1 = std::stoi(argv[4]);
|
||||
|
||||
const ck::index_t K0 = std::stoi(argv[5]);
|
||||
const ck::index_t K1 = std::stoi(argv[6]);
|
||||
|
||||
a_ms_ks_lengths = {M0, M1, K0, K1};
|
||||
a_ms_ks_strides = {
|
||||
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
|
||||
|
||||
b_ns_ks_lengths = {N0, N1, K0, K1};
|
||||
b_ns_ks_strides = {
|
||||
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
|
||||
|
||||
d_ms_ns_lengths = {M0, M1, N0, N1};
|
||||
d_ms_ns_strides = {
|
||||
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
|
||||
|
||||
e_ms_ns_lengths = {M0, M1, N0, N1};
|
||||
e_ms_ns_strides = {
|
||||
std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
|
||||
|
||||
alpha = std::stof(argv[23]);
|
||||
beta = std::stof(argv[24]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
|
||||
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
|
||||
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
|
||||
printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
|
||||
printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
|
||||
printf("arg23 to 24: alpha, beta\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_tensor_space_size = [](auto lengths, auto strides) {
|
||||
std::size_t space_size = 1;
|
||||
for(std::size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
space_size += (lengths[i] - 1) * strides[i];
|
||||
}
|
||||
return space_size;
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
|
||||
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
|
||||
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
|
||||
SimpleDeviceMem d_device_buf(sizeof(DDataType) *
|
||||
f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
|
||||
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
|
||||
NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<DDataType>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::Bilinear>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{alpha, beta};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
a_ms_ks_lengths,
|
||||
a_ms_ks_strides,
|
||||
b_ns_ks_lengths,
|
||||
b_ns_ks_strides,
|
||||
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
|
||||
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
|
||||
e_ms_ns_lengths,
|
||||
e_ms_ns_strides,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
ck::index_t M = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t N = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t K = ck::accumulate_n<ck::index_t>(
|
||||
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
|
||||
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
281
client_example/04_contraction/contraction_bilinear_fp64.cpp
Normal file
281
client_example/04_contraction/contraction_bilinear_fp64.cpp
Normal file
@@ -0,0 +1,281 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using F64 = double;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = Bilinear;
|
||||
|
||||
using ADataType = F64;
|
||||
using BDataType = F64;
|
||||
using AccDataType = F64;
|
||||
using CShuffleDataType = F64;
|
||||
using DDataType = F64;
|
||||
using DsDataType = ck::Tuple<DDataType>;
|
||||
using EDataType = F64;
|
||||
|
||||
static constexpr ck::index_t NumDimM = 2;
|
||||
static constexpr ck::index_t NumDimN = 2;
|
||||
static constexpr ck::index_t NumDimK = 2;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// kknn
|
||||
#if 1
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// knnn
|
||||
#elif 0
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// mknn
|
||||
#elif 0
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// mnnn
|
||||
#elif 0
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
#endif
|
||||
|
||||
float alpha = 1.f;
|
||||
float beta = 1.f;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 25)
|
||||
{
|
||||
const ck::index_t M0 = std::stoi(argv[1]);
|
||||
const ck::index_t M1 = std::stoi(argv[2]);
|
||||
|
||||
const ck::index_t N0 = std::stoi(argv[3]);
|
||||
const ck::index_t N1 = std::stoi(argv[4]);
|
||||
|
||||
const ck::index_t K0 = std::stoi(argv[5]);
|
||||
const ck::index_t K1 = std::stoi(argv[6]);
|
||||
|
||||
a_ms_ks_lengths = {M0, M1, K0, K1};
|
||||
a_ms_ks_strides = {
|
||||
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
|
||||
|
||||
b_ns_ks_lengths = {N0, N1, K0, K1};
|
||||
b_ns_ks_strides = {
|
||||
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
|
||||
|
||||
d_ms_ns_lengths = {M0, M1, N0, N1};
|
||||
d_ms_ns_strides = {
|
||||
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
|
||||
|
||||
e_ms_ns_lengths = {M0, M1, N0, N1};
|
||||
e_ms_ns_strides = {
|
||||
std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
|
||||
|
||||
alpha = std::stof(argv[23]);
|
||||
beta = std::stof(argv[24]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
|
||||
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
|
||||
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
|
||||
printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
|
||||
printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
|
||||
printf("arg23 to 24: alpha, beta\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_tensor_space_size = [](auto lengths, auto strides) {
|
||||
std::size_t space_size = 1;
|
||||
for(std::size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
space_size += (lengths[i] - 1) * strides[i];
|
||||
}
|
||||
return space_size;
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
|
||||
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
|
||||
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
|
||||
SimpleDeviceMem d_device_buf(sizeof(DDataType) *
|
||||
f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
|
||||
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
|
||||
NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<DDataType>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::Bilinear>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{alpha, beta};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
a_ms_ks_lengths,
|
||||
a_ms_ks_strides,
|
||||
b_ns_ks_lengths,
|
||||
b_ns_ks_strides,
|
||||
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
|
||||
std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
|
||||
e_ms_ns_lengths,
|
||||
e_ms_ns_strides,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
ck::index_t M = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t N = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t K = ck::accumulate_n<ck::index_t>(
|
||||
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
|
||||
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,204 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = Add;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = F16;
|
||||
using DDataType = F16;
|
||||
using DsDataType = ck::Tuple<DDataType>;
|
||||
using EDataType = F16;
|
||||
|
||||
static constexpr ck::index_t NumDimG = 1;
|
||||
static constexpr ck::index_t NumDimM = 2;
|
||||
static constexpr ck::index_t NumDimN = 3;
|
||||
static constexpr ck::index_t NumDimK = 1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t G0 = 1;
|
||||
|
||||
ck::index_t M0 = 64;
|
||||
ck::index_t M1 = 256;
|
||||
|
||||
ck::index_t N0 = 3;
|
||||
ck::index_t N1 = 12;
|
||||
ck::index_t N2 = 64;
|
||||
|
||||
ck::index_t K0 = 768;
|
||||
|
||||
// A[M0, M1, M2, K0]
|
||||
std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, M0, M1, K0};
|
||||
std::vector<ck::index_t> a_gs_ms_ks_strides{M0 * M1 * K0, M1 * K0, K0, 1};
|
||||
// B[N0, N1, N2, K0]
|
||||
std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, N0, N1, N2, K0};
|
||||
std::vector<ck::index_t> b_gs_ns_ks_strides{N0 * N1 * N2 * K0, N1 * N2 * K0, N2 * K0, K0, 1};
|
||||
|
||||
// D[N0, M0, N1, M1, N2]
|
||||
std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
|
||||
std::vector<ck::index_t> d_gs_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1};
|
||||
// E[N0 M0 N1 N2 M1]
|
||||
std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
|
||||
std::vector<ck::index_t> e_gs_ms_ns_strides{
|
||||
M0 * M1 * N0 * N1 * N2, N1 * N2 * M1, 1, M0 * N1 * N2 * M1, M1 * N2, M1};
|
||||
|
||||
auto f_tensor_space_size = [](auto lengths, auto strides) {
|
||||
std::size_t space_size = 1;
|
||||
for(std::size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
space_size += (lengths[i] - 1) * strides[i];
|
||||
}
|
||||
return space_size;
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
|
||||
f_tensor_space_size(a_gs_ms_ks_lengths, a_gs_ms_ks_strides));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
|
||||
f_tensor_space_size(b_gs_ns_ks_lengths, b_gs_ns_ks_strides));
|
||||
SimpleDeviceMem d_device_buf(sizeof(DDataType) *
|
||||
f_tensor_space_size(d_gs_ms_ns_lengths, d_gs_ms_ns_strides));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
|
||||
f_tensor_space_size(e_gs_ms_ns_lengths, e_gs_ms_ns_strides));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceBatchedContractionMultipleD<
|
||||
NumDimG,
|
||||
NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::Add>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
a_gs_ms_ks_lengths,
|
||||
a_gs_ms_ks_strides,
|
||||
b_gs_ns_ks_lengths,
|
||||
b_gs_ns_ks_strides,
|
||||
std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
|
||||
std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
|
||||
e_gs_ms_ns_lengths,
|
||||
e_gs_ms_ns_strides,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
ck::index_t M = ck::accumulate_n<ck::index_t>(
|
||||
e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t N = ck::accumulate_n<ck::index_t>(
|
||||
e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t K = ck::accumulate_n<ck::index_t>(
|
||||
a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
|
||||
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
222
client_example/04_contraction/contraction_scale_fp32.cpp
Normal file
222
client_example/04_contraction/contraction_scale_fp32.cpp
Normal file
@@ -0,0 +1,222 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using F32 = float;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = Scale;
|
||||
|
||||
using ADataType = F32;
|
||||
using BDataType = F32;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = F32;
|
||||
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = F32;
|
||||
|
||||
static constexpr ck::index_t NumDimM = 2;
|
||||
static constexpr ck::index_t NumDimN = 2;
|
||||
static constexpr ck::index_t NumDimK = 2;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
|
||||
float scale = 1.f;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 20)
|
||||
{
|
||||
const ck::index_t M0 = std::stoi(argv[1]);
|
||||
const ck::index_t M1 = std::stoi(argv[2]);
|
||||
|
||||
const ck::index_t N0 = std::stoi(argv[3]);
|
||||
const ck::index_t N1 = std::stoi(argv[4]);
|
||||
|
||||
const ck::index_t K0 = std::stoi(argv[5]);
|
||||
const ck::index_t K1 = std::stoi(argv[6]);
|
||||
|
||||
a_ms_ks_lengths = {M0, M1, K0, K1};
|
||||
a_ms_ks_strides = {
|
||||
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
|
||||
|
||||
b_ns_ks_lengths = {N0, N1, K0, K1};
|
||||
b_ns_ks_strides = {
|
||||
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
|
||||
|
||||
e_ms_ns_lengths = {M0, M1, N0, N1};
|
||||
e_ms_ns_strides = {
|
||||
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
|
||||
|
||||
scale = std::stof(argv[19]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
|
||||
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
|
||||
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
|
||||
printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
|
||||
printf("arg19: scale\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_tensor_space_size = [](auto lengths, auto strides) {
|
||||
std::size_t space_size = 1;
|
||||
for(std::size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
space_size += (lengths[i] - 1) * strides[i];
|
||||
}
|
||||
return space_size;
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
|
||||
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
|
||||
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
|
||||
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
|
||||
NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::Scale>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{scale};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
a_ms_ks_lengths,
|
||||
a_ms_ks_strides,
|
||||
b_ns_ks_lengths,
|
||||
b_ns_ks_strides,
|
||||
std::array<std::vector<ck::index_t>, 0>{},
|
||||
std::array<std::vector<ck::index_t>, 0>{},
|
||||
e_ms_ns_lengths,
|
||||
e_ms_ns_strides,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
ck::index_t M = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t N = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t K = ck::accumulate_n<ck::index_t>(
|
||||
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
270
client_example/04_contraction/contraction_scale_fp64.cpp
Normal file
270
client_example/04_contraction/contraction_scale_fp64.cpp
Normal file
@@ -0,0 +1,270 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using F64 = double;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = Scale;
|
||||
|
||||
using ADataType = F64;
|
||||
using BDataType = F64;
|
||||
using AccDataType = F64;
|
||||
using CShuffleDataType = F64;
|
||||
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = F64;
|
||||
|
||||
static constexpr ck::index_t NumDimM = 2;
|
||||
static constexpr ck::index_t NumDimN = 2;
|
||||
static constexpr ck::index_t NumDimK = 2;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// kkn
|
||||
#if 1
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// knn
|
||||
#elif 0
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// mkn
|
||||
#elif 0
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// mnn
|
||||
#elif 0
|
||||
// A[M0, M1, K0, K1]
|
||||
std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
|
||||
// B[N0, N1, K0, K1]
|
||||
std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
|
||||
std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
|
||||
// D[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
|
||||
// E[M0, M1, N0, N1]
|
||||
std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
|
||||
std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
|
||||
#endif
|
||||
|
||||
float scale = 1.f;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 20)
|
||||
{
|
||||
const ck::index_t M0 = std::stoi(argv[1]);
|
||||
const ck::index_t M1 = std::stoi(argv[2]);
|
||||
|
||||
const ck::index_t N0 = std::stoi(argv[3]);
|
||||
const ck::index_t N1 = std::stoi(argv[4]);
|
||||
|
||||
const ck::index_t K0 = std::stoi(argv[5]);
|
||||
const ck::index_t K1 = std::stoi(argv[6]);
|
||||
|
||||
a_ms_ks_lengths = {M0, M1, K0, K1};
|
||||
a_ms_ks_strides = {
|
||||
std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
|
||||
|
||||
b_ns_ks_lengths = {N0, N1, K0, K1};
|
||||
b_ns_ks_strides = {
|
||||
std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
|
||||
|
||||
e_ms_ns_lengths = {M0, M1, N0, N1};
|
||||
e_ms_ns_strides = {
|
||||
std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
|
||||
|
||||
scale = std::stof(argv[19]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
|
||||
printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
|
||||
printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
|
||||
printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
|
||||
printf("arg19: scale\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_tensor_space_size = [](auto lengths, auto strides) {
|
||||
std::size_t space_size = 1;
|
||||
for(std::size_t i = 0; i < lengths.size(); ++i)
|
||||
{
|
||||
space_size += (lengths[i] - 1) * strides[i];
|
||||
}
|
||||
return space_size;
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) *
|
||||
f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) *
|
||||
f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) *
|
||||
f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
|
||||
NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<>,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::Scale>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{scale};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
a_ms_ks_lengths,
|
||||
a_ms_ks_strides,
|
||||
b_ns_ks_lengths,
|
||||
b_ns_ks_strides,
|
||||
std::array<std::vector<ck::index_t>, 0>{},
|
||||
std::array<std::vector<ck::index_t>, 0>{},
|
||||
e_ms_ns_lengths,
|
||||
e_ms_ns_strides,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
ck::index_t M = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t N = ck::accumulate_n<ck::index_t>(
|
||||
e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
|
||||
|
||||
ck::index_t K = ck::accumulate_n<ck::index_t>(
|
||||
a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
11
client_example/05_layernorm/CMakeLists.txt
Normal file
11
client_example/05_layernorm/CMakeLists.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
add_executable(client_layernorm2d_bwd_data layernorm2d_bwd_data.cpp)
|
||||
target_link_libraries(client_layernorm2d_bwd_data PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_layernorm2d_bwd_gamma_beta layernorm2d_bwd_gamma_beta.cpp)
|
||||
target_link_libraries(client_layernorm2d_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp)
|
||||
target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_layernorm4d_fwd layernorm4d_fwd.cpp)
|
||||
target_link_libraries(client_layernorm4d_fwd PRIVATE composable_kernel::device_other_operations)
|
||||
170
client_example/05_layernorm/layernorm2d_bwd_data.cpp
Normal file
170
client_example/05_layernorm/layernorm2d_bwd_data.cpp
Normal file
@@ -0,0 +1,170 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_data.hpp"
|
||||
|
||||
using DYDataType = float;
|
||||
using XDataType = float;
|
||||
using GammaDataType = float;
|
||||
using MeanInvStdDataType = float;
|
||||
using DXDataType = float;
|
||||
|
||||
constexpr int Rank = 2;
|
||||
constexpr int NumReduceDim = 1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
|
||||
SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
|
||||
SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
|
||||
SimpleDeviceMem gamma_dev(sizeof(GammaDataType) * N);
|
||||
SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
|
||||
SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
|
||||
SimpleDeviceMem dx_dev(sizeof(DXDataType) * M * N);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
|
||||
XDataType,
|
||||
GammaDataType,
|
||||
MeanInvStdDataType,
|
||||
DXDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
|
||||
{N, 1}, // dyStrides
|
||||
{N, 1}, // xStrides
|
||||
{0, 1}, // gammaStrides
|
||||
{1, 0}, // meanStrides
|
||||
{1, 0}, // invStdStrides
|
||||
{N, 1}, // dxStrides
|
||||
{1}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
gamma_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dx_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
|
||||
sizeof(GammaDataType) * N + sizeof(MeanInvStdDataType) * M * 2 +
|
||||
sizeof(DXDataType) * M * N;
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
|
||||
{N, 1}, // dyStrides
|
||||
{N, 1}, // xStrides
|
||||
{0, 1}, // gammaStrides
|
||||
{1, 0}, // meanStrides
|
||||
{1, 0}, // invStdStrides
|
||||
{N, 1}, // dxStrides
|
||||
{1}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
gamma_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dx_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
171
client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
Normal file
171
client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
Normal file
@@ -0,0 +1,171 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
|
||||
|
||||
using DYDataType = float;
|
||||
using XDataType = float;
|
||||
using GammaDataType = float;
|
||||
using MeanInvStdDataType = float;
|
||||
using DGammaDataType = float;
|
||||
using DBetaDataType = float;
|
||||
|
||||
constexpr int Rank = 2;
|
||||
constexpr int NumReduceDim = 1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
|
||||
SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
|
||||
SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
|
||||
SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
|
||||
SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
|
||||
SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * N);
|
||||
SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * N);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::size_t num_bytes = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
|
||||
sizeof(MeanInvStdDataType) * M * 2 + sizeof(DGammaDataType) * N +
|
||||
sizeof(DBetaDataType) * N;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
|
||||
{N, 1}, // dyStrides
|
||||
{N, 1}, // xStrides
|
||||
{1, 0}, // meanStrides
|
||||
{1, 0}, // invStdStrides
|
||||
{N}, // outLengths
|
||||
{1}, // dgammaStrides
|
||||
{1}, // dbetaStrides
|
||||
{0}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
|
||||
{N, 1}, // dyStrides
|
||||
{N, 1}, // xStrides
|
||||
{1, 0}, // meanStrides
|
||||
{1, 0}, // invStdStrides
|
||||
{N}, // outLengths
|
||||
{1}, // dgammaStrides
|
||||
{1}, // dbetaStrides
|
||||
{0}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
196
client_example/05_layernorm/layernorm2d_fwd.cpp
Normal file
196
client_example/05_layernorm/layernorm2d_fwd.cpp
Normal file
@@ -0,0 +1,196 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
|
||||
|
||||
using XDataType = ck::half_t;
|
||||
using GammaDataType = ck::half_t;
|
||||
using BetaDataType = ck::half_t;
|
||||
using YDataType = ck::half_t;
|
||||
using SaveMeanInvStdDataType = ck::half_t;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
#define SAVE_MEAN_INV_STD
|
||||
|
||||
constexpr int Rank = 2;
|
||||
constexpr int NumReduceDim = 1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
ck::index_t Stride = 1024;
|
||||
|
||||
auto xy_size = (M - 1) * Stride + N;
|
||||
|
||||
SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
|
||||
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
|
||||
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
|
||||
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
|
||||
SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
|
||||
#endif
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
|
||||
{Stride, 1}, // xStrides
|
||||
{0, 1}, // gammaStrides
|
||||
{0, 1}, // betaStrides
|
||||
{Stride, 1}, // yStrides
|
||||
{1}, // save_mean Strides
|
||||
{1}, // save_inv_std Strides
|
||||
{1}, // reduceDims
|
||||
1e-4,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
save_mean_device_buf.GetDeviceBuffer(),
|
||||
save_inv_std_device_buf.GetDeviceBuffer(),
|
||||
#else
|
||||
nullptr,
|
||||
nullptr,
|
||||
#endif
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
|
||||
sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
|
||||
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
num_byte += sizeof(SaveMeanInvStdDataType) * M * 2;
|
||||
#endif
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
|
||||
{Stride, 1}, // xStrides
|
||||
{0, 1}, // gammaStrides
|
||||
{0, 1}, // betaStrides
|
||||
{Stride, 1}, // yStrides
|
||||
{1}, // save_mean Strides
|
||||
{1}, // save_inv_std Strides
|
||||
{1}, // reduceDims
|
||||
1e-4,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
save_mean_device_buf.GetDeviceBuffer(),
|
||||
save_inv_std_device_buf.GetDeviceBuffer(),
|
||||
#else
|
||||
nullptr,
|
||||
nullptr,
|
||||
#endif
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
202
client_example/05_layernorm/layernorm4d_fwd.cpp
Normal file
202
client_example/05_layernorm/layernorm4d_fwd.cpp
Normal file
@@ -0,0 +1,202 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
|
||||
|
||||
using XDataType = ck::half_t;
|
||||
using GammaDataType = ck::half_t;
|
||||
using BetaDataType = ck::half_t;
|
||||
using YDataType = ck::half_t;
|
||||
using SaveMeanInvStdDataType = ck::half_t;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
#define SAVE_MEAN_INV_STD
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 256;
|
||||
ck::index_t H = 16;
|
||||
ck::index_t W = 16;
|
||||
ck::index_t C = 8;
|
||||
|
||||
std::vector<ck::index_t> strideXY = {H * W * C, W * C, C, 1};
|
||||
std::vector<ck::index_t> strideGammaBeta = {0, W * C, C, 1};
|
||||
std::vector<ck::index_t> strideSaveMeanInvStd = {1};
|
||||
|
||||
SimpleDeviceMem x_device_buf(sizeof(XDataType) * N * H * W * C);
|
||||
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * H * W * C);
|
||||
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * H * W * C);
|
||||
SimpleDeviceMem y_device_buf(sizeof(YDataType) * N * H * W * C);
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * N);
|
||||
SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N);
|
||||
#endif
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer({N, H, W, C}, // lengths
|
||||
strideXY, // xStrides
|
||||
strideGammaBeta, // gammaStrides
|
||||
strideGammaBeta, // betaStrides
|
||||
strideXY, // yStrides
|
||||
strideSaveMeanInvStd, // save_mean Strides
|
||||
strideSaveMeanInvStd, // save_inv_std Strides
|
||||
{1, 2, 3}, // reduceDims
|
||||
1e-4,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
save_mean_device_buf.GetDeviceBuffer(),
|
||||
save_inv_std_device_buf.GetDeviceBuffer(),
|
||||
#else
|
||||
nullptr,
|
||||
nullptr,
|
||||
#endif
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte =
|
||||
sizeof(XDataType) * N * H * W * C + sizeof(GammaDataType) * H * W * C +
|
||||
sizeof(BetaDataType) * H * W * C + sizeof(YDataType) * N * H * W * C;
|
||||
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
num_byte += sizeof(SaveMeanInvStdDataType) * N * 2;
|
||||
#endif
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer({N, H, W, C}, // lengths
|
||||
strideXY, // xStrides
|
||||
strideGammaBeta, // gammaStrides
|
||||
strideGammaBeta, // betaStrides
|
||||
strideXY, // yStrides
|
||||
strideSaveMeanInvStd, // save_mean Strides
|
||||
strideSaveMeanInvStd, // save_inv_std Strides
|
||||
{1, 2, 3}, // reduceDims
|
||||
1e-4,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
save_mean_device_buf.GetDeviceBuffer(),
|
||||
save_inv_std_device_buf.GetDeviceBuffer(),
|
||||
#else
|
||||
nullptr,
|
||||
nullptr,
|
||||
#endif
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
2
client_example/06_softmax/CMakeLists.txt
Normal file
2
client_example/06_softmax/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# Softmax client example: one executable built from softmax4d.cpp and linked
# against the CK "other" and "reduction" operation libraries.
add_executable(client_softmax4d softmax4d.cpp)
|
||||
target_link_libraries(client_softmax4d PRIVATE composable_kernel::device_other_operations composable_kernel::device_reduction_operations)
|
||||
169
client_example/06_softmax/softmax4d.cpp
Normal file
169
client_example/06_softmax/softmax4d.cpp
Normal file
@@ -0,0 +1,169 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 2;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::vector<ck::index_t> in_lengths{2, 8, 128, 1024};
|
||||
std::vector<ck::index_t> in_strides{8 * 128 * 1024, 128 * 1024, 1024, 1};
|
||||
std::vector<ck::index_t> reduce_dims{2, 3};
|
||||
|
||||
ck::index_t num_elements =
|
||||
std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
double alpha{2.0};
|
||||
double beta{2.0};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * num_elements);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
auto& generic_op_ptr = op_ptrs[0];
|
||||
|
||||
auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
|
||||
in_strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta,
|
||||
in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The generic kernel instance should be able to support any input shapes");
|
||||
};
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
|
||||
in_strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta,
|
||||
in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes = num_elements * sizeof(InDataType) +
|
||||
(beta == 0.0f ? 1 : 2) * num_elements * sizeof(OutDataType);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
|
||||
in_strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta,
|
||||
in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
28
client_example/07_grouped_convnd_fwd/CMakeLists.txt
Normal file
28
client_example/07_grouped_convnd_fwd/CMakeLists.txt
Normal file
@@ -0,0 +1,28 @@
|
||||
# Grouped forward-convolution client examples. Every target in this file is
# only defined when GPU_TARGETS matches "gfx9".
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
|
||||
target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_conv1d_fwd grouped_conv1d_fwd.cpp)
|
||||
target_link_libraries(client_grouped_conv1d_fwd PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
# fp8 example: built when DTYPES mentions fp8, or when DTYPES is not defined
# and GPU_TARGETS matches "gfx94".
if((DTYPES MATCHES "fp8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_grouped_conv3d_fwd_fp8 grouped_conv3d_fwd_fp8.cpp)
|
||||
target_link_libraries(client_grouped_conv3d_fwd_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
|
||||
# bf8 example: same rule as above, keyed on bf8.
if((DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_grouped_conv3d_fwd_bf8 grouped_conv3d_fwd_bf8.cpp)
|
||||
target_link_libraries(client_grouped_conv3d_fwd_bf8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
|
||||
# Mixed fp8/bf8 examples: need both types in DTYPES, or DTYPES not defined
# together with a "gfx94" target.
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_grouped_conv3d_fwd_fp8_bf8 grouped_conv3d_fwd_fp8_bf8.cpp)
|
||||
target_link_libraries(client_grouped_conv3d_fwd_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_conv3d_fwd_bf8_fp8 grouped_conv3d_fwd_bf8_fp8.cpp)
|
||||
target_link_libraries(client_grouped_conv3d_fwd_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
|
||||
# NOTE(review): this target name has no "client_" prefix, unlike the others in this file.
add_executable(grouped_conv2d_fwd_ngchw grouped_conv2d_fwd_ngchw.cpp)
|
||||
target_link_libraries(grouped_conv2d_fwd_ngchw PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
68
client_example/07_grouped_convnd_fwd/README.md
Normal file
68
client_example/07_grouped_convnd_fwd/README.md
Normal file
@@ -0,0 +1,68 @@
|
||||
[Back to supported operations](../../../include/ck/README.md)
|
||||
# Composable Kernel Grouped Convolution
|
||||
|
||||
## Grouped Convolution Forward
|
||||
Grouped convolution operation for 1D, 2D or 3D spatial dimensions. The convolution uses a GEMM kernel after a tensor coordinate transform. In CK, the Grouped Convolution Forward operation is called `DeviceGroupedConvFwdMultipleABD` and requires the following types as template parameters:
|
||||
|
||||
* **NumDimSpatial** - number of spatial dimensions (1D, 2D, 3D).
|
||||
* **InLayout** - input layout (NHWGC, GNHWC, NGCHW).
|
||||
* **WeiLayout** - weight layout (GKYXC, GKCYX).
|
||||
* **DsLayout** - layouts for additional tensors for fused operations.
|
||||
* **OutLayout** - output layout (NHWGK, GNHWK, NGKHW).
|
||||
* **ADataType** - input data type. Pass tuple if there is fused operation with input.
|
||||
* **BDataType** - weight data type. Pass tuple if there is fused operation with weight.
|
||||
* **DsDataType** - data types for additional tensors for fused operations.
|
||||
* **EDataType** - Output data type.
|
||||
* **AElementwiseOperation** - fused operation on tensor A (input).
|
||||
* **BElementwiseOperation** - fused operation on tensor B (weight).
|
||||
* **CDEElementwiseOperation** - fused operation on tensor C (output).
|
||||
* **AComputeType** - compute data type of tensor A for mfma instruction (ADataType by default).
|
||||
* **BComputeType** - compute data type of tensor B for mfma instruction (AComputeType by default).
|
||||
|
||||
Grouped convolution forward supports tensors larger than 2GB.
|
||||
|
||||
List of the device operations for grouped convolution forward in CK:
|
||||
|
||||
* **DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3** - Device operation with XDL instructions. Optimized for AMD Instinct MI300 series.
|
||||
* **DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle** - Device operation with XDL instructions and support of fused operations to input, weight and output.
|
||||
* **DeviceGroupedConvFwdMultipleD_Wmma_CShuffle** - Device operation with WMMA instructions.
|
||||
* **DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK** - Device operation with DL instructions.
|
||||
|
||||
Table of supported cases by instance factory with XDL instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|---|
|
||||
|bf16 |2D, 3D|2D|2D|1D, 2D, 3D|
|
||||
|fp16 |2D, 3D|2D|2D|1D, 2D, 3D|
|
||||
|fp32 |2D, 3D|2D|2D|1D, 2D, 3D|
|
||||
|int8 |2D, 3D|2D|2D|1D, 3D|
|
||||
|fp8 |3D|✗|✗|✗|
|
||||
|bf8 |3D|✗|✗|✗|
|
||||
|
||||
Table of supported cases by instance factory with WMMA instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|
|
||||
|fp16 |2D, 3D|✗|2D, 3D|
|
||||
|int8 |2D, 3D|✗|2D, 3D|
|
||||
|
||||
Table of supported cases by instance factory with DL instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|
|
||||
|bf16 |✗|✗|2D|
|
||||
|fp16 |✗|✗|2D|
|
||||
|fp32 |✗|✗|2D|
|
||||
|int8 |✗|✗|2D|
|
||||
|
||||
Table of supported cases by instance factory with fused elementwise operation:
|
||||
|
||||
* **Dynamic elementwise operation** - 2D/3D, NHWGC, bf16/fp16/fp32/int8
|
||||
* **Bilinear** - 3D, NHWGC, bf16/fp16/fp32/int8
|
||||
* **ConvInvScale** - 3D, NHWGC, fp8
|
||||
* **ConvScale** - 3D, NHWGC, fp8/bf8
|
||||
* **ConvScale + Add** - 3D, NHWGC, fp8
|
||||
* **ConvScale + Relu** - 3D, NHWGC, fp8
|
||||
* **Scale** - 3D, NHWGC, bf16/fp16/fp32/int8
|
||||
* **Scale + Add (for A and B)** - 3D, NHWGC, bf16/fp16/fp32/int8
|
||||
* **Scale + Add + Scale + Add + Relu** - 3D, NHWGC, bf16/fp16/fp32/int8
|
||||
304
client_example/07_grouped_convnd_fwd/common.hpp
Normal file
304
client_example/07_grouped_convnd_fwd/common.hpp
Normal file
@@ -0,0 +1,304 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
template <ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetFlops(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths,
|
||||
const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
|
||||
{
|
||||
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
|
||||
ck::index_t G = weights_lengths[0];
|
||||
ck::index_t N = output_lengths[1];
|
||||
ck::index_t K = weights_lengths[1];
|
||||
ck::index_t C = weights_lengths[2];
|
||||
|
||||
return static_cast<std::size_t>(2) * G * N * K * C *
|
||||
std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim),
|
||||
std::end(output_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>()) *
|
||||
std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim),
|
||||
std::end(weights_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename InDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetInputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& input_lengths)
|
||||
{
|
||||
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
|
||||
return sizeof(InDataType) * std::accumulate(std::begin(input_lengths),
|
||||
std::end(input_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename WeiDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetWeightByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
|
||||
{
|
||||
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
|
||||
return sizeof(WeiDataType) * std::accumulate(std::begin(weights_lengths),
|
||||
std::end(weights_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename OutDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetOutputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths)
|
||||
{
|
||||
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
|
||||
return sizeof(OutDataType) * std::accumulate(std::begin(output_lengths),
|
||||
std::end(output_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
ck::index_t NumNonSpatialDim = 3,
|
||||
typename AComputeType = InDataType,
|
||||
typename BComputeType = AComputeType>
|
||||
bool run_grouped_conv_fwd(std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_lengths,
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_lengths,
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_lengths)
|
||||
{
|
||||
std::size_t in_mem_size = GetInputByte<InDataType, NumDimSpatial>(in_lengths);
|
||||
std::size_t wei_mem_size = GetWeightByte<WeiDataType, NumDimSpatial>(wei_lengths);
|
||||
std::size_t out_mem_size = GetOutputByte<OutDataType, NumDimSpatial>(out_lengths);
|
||||
|
||||
SimpleDeviceMem in(in_mem_size);
|
||||
SimpleDeviceMem wei(wei_mem_size);
|
||||
SimpleDeviceMem out(out_mem_size);
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_strides;
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_strides;
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_strides;
|
||||
in_strides.fill(0);
|
||||
wei_strides.fill(0);
|
||||
out_strides.fill(0);
|
||||
in_strides.back() = 1;
|
||||
wei_strides.back() = 1;
|
||||
out_strides.back() = 1;
|
||||
|
||||
std::partial_sum(rbegin(in_lengths),
|
||||
std::prev(rend(in_lengths)),
|
||||
std::next(rbegin(in_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(wei_lengths),
|
||||
std::prev(rend(wei_lengths)),
|
||||
std::next(rbegin(wei_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(out_lengths),
|
||||
std::prev(rend(out_lengths)),
|
||||
std::next(rbegin(out_strides)),
|
||||
std::multiplies<>{});
|
||||
|
||||
// transpose NDHWGC/KZYXGC/NDHWGK to GNDHWC/GKZYXC/GNDHWK to GNCDHW/GKCZYX/GNKDHW
|
||||
std::rotate(std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 2), rend(in_lengths));
|
||||
std::rotate(rbegin(in_lengths),
|
||||
std::next(rbegin(in_lengths)),
|
||||
std::next(rbegin(in_lengths), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 2), rend(in_strides));
|
||||
std::rotate(rbegin(in_strides),
|
||||
std::next(rbegin(in_strides)),
|
||||
std::next(rbegin(in_strides), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(rbegin(wei_lengths),
|
||||
std::next(rbegin(wei_lengths)),
|
||||
std::next(rbegin(wei_lengths), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(rbegin(wei_strides),
|
||||
std::next(rbegin(wei_strides)),
|
||||
std::next(rbegin(wei_strides), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(
|
||||
std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 2), rend(out_lengths));
|
||||
std::rotate(rbegin(out_lengths),
|
||||
std::next(rbegin(out_lengths)),
|
||||
std::next(rbegin(out_lengths), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(
|
||||
std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 2), rend(out_strides));
|
||||
std::rotate(rbegin(out_strides),
|
||||
std::next(rbegin(out_strides)),
|
||||
std::next(rbegin(out_strides), NumDimSpatial + 1));
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> conv_filter_strides;
|
||||
std::array<ck::index_t, NumDimSpatial> conv_filter_dilations;
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads;
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads;
|
||||
conv_filter_strides.fill(1);
|
||||
conv_filter_dilations.fill(1);
|
||||
input_left_pads.fill(1);
|
||||
input_right_pads.fill(1);
|
||||
|
||||
std::size_t flop = GetFlops<NumDimSpatial>(out_lengths, wei_lengths);
|
||||
std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size;
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
AComputeType,
|
||||
BComputeType>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
39
client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
Normal file
39
client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp
Normal file
@@ -0,0 +1,39 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNWK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 1;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 256;
|
||||
static constexpr ck::index_t K = 192;
|
||||
static constexpr ck::index_t C = 192;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3>({N, Wi, G, C}, {G, K, X, C}, {N, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
42
client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
Normal file
42
client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp
Normal file
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 256; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 28; // input H
|
||||
static constexpr ck::index_t Wi = 28; // input W
|
||||
static constexpr ck::index_t Ho = 28; // output H
|
||||
static constexpr ck::index_t Wo = 28; // output W
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3>({N, Hi, Wi, G, C}, {G, K, Y, X, C}, {N, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,216 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<ck::half_t>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NGCHW;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NGKHW;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 64; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 14; // input H
|
||||
static constexpr ck::index_t Wi = 14; // input W
|
||||
static constexpr ck::index_t Ho = 14; // output H
|
||||
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_fwd()
|
||||
{
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{C * Hi * Wi, G * C * Hi * Wi, Hi * Wi, Wi, 1};
|
||||
std::array<ck::index_t, 5> wei_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> wei_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{K * Ho * Wo, G * K * Ho * Wo, Ho * Wo, Wo, 1};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
// workspace_sz will be equal to 0 for other layout than NGCHW
|
||||
const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace_dev(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop =
|
||||
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 3 * N * Ho * Wo * G * K;
|
||||
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
|
||||
sizeof(WeiDataType) * G * K * Y * X * C +
|
||||
sizeof(OutDataType) * 2 * N * Ho * Wo * G * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace_dev(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Program entry point: the exit code is whatever execute_conv_fwd() returns.
int main()
{
    const int exit_code = execute_conv_fwd();
    return exit_code;
}
|
||||
@@ -0,0 +1,46 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::bf8_t;
|
||||
using WeiDataType = ck::bf8_t;
|
||||
using OutDataType = ck::f8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
ck::bf8_t>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::bf8_t;
|
||||
using WeiDataType = ck::f8_t;
|
||||
using OutDataType = ck::f8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
using AComputeType = ck::bf8_t;
|
||||
using BComputeType = ck::f8_t;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
AComputeType,
|
||||
BComputeType>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::f8_t;
|
||||
using WeiDataType = ck::f8_t;
|
||||
using OutDataType = ck::f8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
ck::f8_t>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::f8_t;
|
||||
using WeiDataType = ck::bf8_t;
|
||||
using OutDataType = ck::f8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
using AComputeType = ck::f8_t;
|
||||
using BComputeType = ck::bf8_t;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
AComputeType,
|
||||
BComputeType>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
7
client_example/08_fused_attention/CMakeLists.txt
Normal file
7
client_example/08_fused_attention/CMakeLists.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
# The fused-attention client examples are only added when GPU_TARGETS matches "gfx9".
if(GPU_TARGETS MATCHES "gfx9")
    # One executable per source file: client_<name> is built from <name>.cpp.
    foreach(example_name fused_attention fused_attention_bias)
        add_executable(client_${example_name} ${example_name}.cpp)
        target_link_libraries(client_${example_name} PRIVATE composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
    endforeach()
endif()
|
||||
213
client_example/08_fused_attention/fused_attention.cpp
Normal file
213
client_example/08_fused_attention/fused_attention.cpp
Normal file
@@ -0,0 +1,213 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using B0ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
using B1ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
constexpr static auto MaskingSpec =
|
||||
ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
|
||||
|
||||
using ADataType = ck::half_t;
|
||||
using B0DataType = ck::half_t;
|
||||
using B1DataType = ck::half_t;
|
||||
using CDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int G0 = 48;
|
||||
int G1 = 16;
|
||||
int M = 1024;
|
||||
int N = 1024;
|
||||
int K = 64;
|
||||
int O = 64;
|
||||
|
||||
// A layout [G0, M, G1, K]
|
||||
std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
|
||||
std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
|
||||
|
||||
// B0 layout [G0, N, G1, K]
|
||||
std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
|
||||
std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
|
||||
|
||||
// B1 layout [G0, N, G1, O]
|
||||
std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
|
||||
std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
|
||||
|
||||
// C layout [G0, M, G1, O]
|
||||
std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
|
||||
std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
|
||||
SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
|
||||
SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
|
||||
SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
ADataType,
|
||||
B0DataType,
|
||||
B1DataType,
|
||||
CDataType,
|
||||
ck::Tuple<>,
|
||||
ck::Tuple<>,
|
||||
AElementOp,
|
||||
B0ElementOp,
|
||||
Acc0ElementOp,
|
||||
B1ElementOp,
|
||||
CElementOp,
|
||||
MaskingSpec>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device op instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b0_device_buf.GetDeviceBuffer(),
|
||||
b1_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
{}, // p_acc0_biases
|
||||
{}, // p_acc1_biases
|
||||
a_gs_ms_ks_lengths,
|
||||
a_gs_ms_ks_strides,
|
||||
b0_gs_ns_ks_lengths,
|
||||
b0_gs_ns_ks_strides,
|
||||
b1_gs_os_ns_lengths,
|
||||
b1_gs_os_ns_strides,
|
||||
c_gs_ms_os_lengths,
|
||||
c_gs_ms_os_strides,
|
||||
{}, // acc0_biases_gs_ms_ns_lengths
|
||||
{}, // acc0_biases_gs_ms_ns_strides
|
||||
{}, // acc1_biases_gs_ms_os_lengths
|
||||
{}, // acc1_biases_gs_ms_os_strides
|
||||
AElementOp{},
|
||||
B0ElementOp{},
|
||||
Acc0ElementOp{1 / sqrtf(K)},
|
||||
B1ElementOp{},
|
||||
CElementOp{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1;
|
||||
std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
|
||||
sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
|
||||
G0 * G1;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
|
||||
<< " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best instance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b0_device_buf.GetDeviceBuffer(),
|
||||
b1_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
{}, // p_acc0_biases
|
||||
{}, // p_acc1_biases
|
||||
a_gs_ms_ks_lengths,
|
||||
a_gs_ms_ks_strides,
|
||||
b0_gs_ns_ks_lengths,
|
||||
b0_gs_ns_ks_strides,
|
||||
b1_gs_os_ns_lengths,
|
||||
b1_gs_os_ns_strides,
|
||||
c_gs_ms_os_lengths,
|
||||
c_gs_ms_os_strides,
|
||||
{}, // acc0_biases_gs_ms_ns_lengths
|
||||
{}, // acc0_biases_gs_ms_ns_strides
|
||||
{}, // acc1_biases_gs_ms_os_lengths
|
||||
{}, // acc1_biases_gs_ms_os_strides
|
||||
AElementOp{},
|
||||
B0ElementOp{},
|
||||
Acc0ElementOp{1 / sqrtf(K)},
|
||||
B1ElementOp{},
|
||||
CElementOp{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
226
client_example/08_fused_attention/fused_attention_bias.cpp
Normal file
226
client_example/08_fused_attention/fused_attention_bias.cpp
Normal file
@@ -0,0 +1,226 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using B0ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Acc0ElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
using B1ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
constexpr static auto MaskingSpec =
|
||||
ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
|
||||
|
||||
using ADataType = ck::half_t;
|
||||
using B0DataType = ck::half_t;
|
||||
using B1DataType = ck::half_t;
|
||||
using CDataType = ck::half_t;
|
||||
using D0DataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int G0 = 48;
|
||||
int G1 = 16;
|
||||
int M = 1024;
|
||||
int N = 1024;
|
||||
int K = 64;
|
||||
int O = 64;
|
||||
|
||||
// A layout [G0, M, G1, K]
|
||||
std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
|
||||
std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
|
||||
|
||||
// B0 layout [G0, N, G1, K]
|
||||
std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
|
||||
std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
|
||||
|
||||
// B1 layout [G0, N, G1, O]
|
||||
std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
|
||||
std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
|
||||
|
||||
// C layout [G0, M, G1, O]
|
||||
std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
|
||||
std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
|
||||
|
||||
// D layout [G0, M, G1, N]
|
||||
std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
|
||||
std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
|
||||
SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
|
||||
SimpleDeviceMem d0_device_buf(sizeof(D0DataType) * G0 * G1 * M * N);
|
||||
SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
|
||||
SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
ADataType,
|
||||
B0DataType,
|
||||
B1DataType,
|
||||
CDataType,
|
||||
ck::Tuple<D0DataType>,
|
||||
ck::Tuple<>,
|
||||
AElementOp,
|
||||
B0ElementOp,
|
||||
Acc0ElementOp,
|
||||
B1ElementOp,
|
||||
CElementOp,
|
||||
MaskingSpec>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device op instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b0_device_buf.GetDeviceBuffer(),
|
||||
b1_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
|
||||
{}, // p_acc1_biases
|
||||
a_gs_ms_ks_lengths,
|
||||
a_gs_ms_ks_strides,
|
||||
b0_gs_ns_ks_lengths,
|
||||
b0_gs_ns_ks_strides,
|
||||
b1_gs_os_ns_lengths,
|
||||
b1_gs_os_ns_strides,
|
||||
c_gs_ms_os_lengths,
|
||||
c_gs_ms_os_strides,
|
||||
std::array<std::vector<ck::index_t>, 1>{
|
||||
d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
|
||||
std::array<std::vector<ck::index_t>, 1>{
|
||||
d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
|
||||
{}, // acc1_biases_gs_ms_os_lengths
|
||||
{}, // acc1_biases_gs_ms_os_strides
|
||||
AElementOp{},
|
||||
B0ElementOp{},
|
||||
Acc0ElementOp{1 / sqrtf(K)},
|
||||
B1ElementOp{},
|
||||
CElementOp{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1;
|
||||
std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
|
||||
sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O +
|
||||
sizeof(D0DataType) * M * N) *
|
||||
G0 * G1;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
|
||||
<< " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best instance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b0_device_buf.GetDeviceBuffer(),
|
||||
b1_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
|
||||
{}, // p_acc1_biases
|
||||
a_gs_ms_ks_lengths,
|
||||
a_gs_ms_ks_strides,
|
||||
b0_gs_ns_ks_lengths,
|
||||
b0_gs_ns_ks_strides,
|
||||
b1_gs_os_ns_lengths,
|
||||
b1_gs_os_ns_strides,
|
||||
c_gs_ms_os_lengths,
|
||||
c_gs_ms_os_strides,
|
||||
std::array<std::vector<ck::index_t>, 1>{
|
||||
d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
|
||||
std::array<std::vector<ck::index_t>, 1>{
|
||||
d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
|
||||
{}, // acc1_biases_gs_ms_os_lengths
|
||||
{}, // acc1_biases_gs_ms_os_strides
|
||||
AElementOp{},
|
||||
B0ElementOp{},
|
||||
Acc0ElementOp{1 / sqrtf(K)},
|
||||
B1ElementOp{},
|
||||
CElementOp{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
22
client_example/09_quantization/CMakeLists.txt
Normal file
22
client_example/09_quantization/CMakeLists.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "int8" OR NOT DEFINED DTYPES))
|
||||
add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_bias_tanh_perlayer_quantization conv2d_fwd_bias_tanh_perlayer_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_tanh_perlayer_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_perchannel_quantization conv2d_fwd_perchannel_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_gemm_quantization gemm_quantization.cpp)
|
||||
target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_conv_operations composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
|
||||
endif()
|
||||
@@ -0,0 +1,210 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using RequantScaleDataType = float;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 4;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 32; // output channel
|
||||
static constexpr ck::index_t C = 64; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
|
||||
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
|
||||
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType, RequantScaleDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths, requant_scale_lengths},
|
||||
{bias_strides, requant_scale_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths, requant_scale_lengths},
|
||||
{bias_strides, requant_scale_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,206 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 4;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 32; // output channel
|
||||
static constexpr ck::index_t C = 64; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float requant_scale = 0.5f; // requantize qAcc to qz
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
|
||||
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,213 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using RequantScaleDataType = float;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::TanH;
|
||||
using OutElementOp =
|
||||
ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 4;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 32; // output channel
|
||||
static constexpr ck::index_t C = 64; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float sz_inv = 0.5f; // inverse of scale_z
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
|
||||
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
|
||||
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType, RequantScaleDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths, requant_scale_lengths},
|
||||
{bias_strides, requant_scale_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths, requant_scale_lengths},
|
||||
{bias_strides, requant_scale_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::TanH;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 4;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 32; // output channel
|
||||
static constexpr ck::index_t C = 64; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float sacc = 0.5f; // scale of acc
|
||||
static constexpr float sz_inv = 0.5f; // inverse of scale_z
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
|
||||
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sacc, sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sacc, sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,203 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using RequantScaleDataType = float;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 4;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 32; // output channel
|
||||
static constexpr ck::index_t C = 64; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
|
||||
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<RequantScaleDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{requant_scale.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{requant_scale_lengths},
|
||||
{requant_scale_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(RequantScaleDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,199 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 4;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 32; // output channel
|
||||
static constexpr ck::index_t C = 64; // input channel (per group)
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float requant_scale = 0.5f; // requantize qAcc to qY
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
|
||||
// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
|
||||
// Hence, we need to adjust the order of stride
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
|
||||
G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
193
client_example/09_quantization/gemm_quantization.cpp
Normal file
193
client_example/09_quantization/gemm_quantization.cpp
Normal file
@@ -0,0 +1,193 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
using ADataType = int8_t;
|
||||
using BDataType = int8_t;
|
||||
using EDataType = int8_t;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
ck::index_t K = 1024;
|
||||
|
||||
ck::index_t StrideA = 1024;
|
||||
ck::index_t StrideB = 1024;
|
||||
ck::index_t StrideE = 1024;
|
||||
|
||||
float requant_scale = 0.03;
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<>,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_bytes =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
13
client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
Normal file
13
client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
# Client examples for grouped convolution backward data.
# Every executable below links only against composable_kernel::device_conv_operations.

# 2D example built from grouped_conv2d_bwd_data.cpp.
add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
|
||||
target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
# 2D example built from grouped_conv2d_bwd_data_ngchw.cpp.
add_executable(client_grouped_conv2d_bwd_data_ngchw grouped_conv2d_bwd_data_ngchw.cpp)
|
||||
target_link_libraries(client_grouped_conv2d_bwd_data_ngchw PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
# 3D example built from grouped_conv3d_bwd_data.cpp.
add_executable(client_grouped_conv3d_bwd_data grouped_conv3d_bwd_data.cpp)
|
||||
target_link_libraries(client_grouped_conv3d_bwd_data PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
# The bf8/f8 compute-type example is only added when DTYPES matches both "fp8" and
# "bf8", or when DTYPES is not defined and GPU_TARGETS matches "gfx94".
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp)
|
||||
target_link_libraries(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
48
client_example/10_grouped_convnd_bwd_data/README.md
Normal file
48
client_example/10_grouped_convnd_bwd_data/README.md
Normal file
@@ -0,0 +1,48 @@
|
||||
[Back to supported operations](../../../include/ck/README.md)
|
||||
# Composable Kernel Grouped Convolution
|
||||
|
||||
## Grouped Convolution Backward Data
|
||||
|
||||
Grouped convolution operation for 1D, 2D or 3D spatial dimensions. Convolution utilizes GEMM kernel after tensor coordinate transform. In CK, the grouped convolution backward data operation is called `DeviceGroupedConvBwdDataMultipleD` and requires the following types as template parameters:
|
||||
|
||||
* **NumDimSpatial** - number of spatial dimensions (1D, 2D, 3D).
|
||||
* **ALayout** - output layout (NHWGK, GNHWK, NGKHW).
|
||||
* **BLayout** - weight layout (GKYXC).
|
||||
* **DsLayout** - layouts for additional tensors for fused operations.
|
||||
* **ELayout** - input layout (NHWGC, GNHWC, NGCHW).
|
||||
* **ADataType** - output data type.
|
||||
* **BDataType** - weight data type.
|
||||
* **DsDataType** - data types for additional tensors for fused operations.
|
||||
* **EDataType** - input data type.
|
||||
* **AElementwiseOperation** - fused operation on tensor A (output).
|
||||
* **BElementwiseOperation** - fused operation on tensor B (weight).
|
||||
* **CDEElementwiseOperation** - fused operation on tensor E (input).
|
||||
* **AComputeType** - compute data type of tensor A for mfma instruction (ADataType by default).
|
||||
* **BComputeType** - compute data type of tensor B for mfma instruction (AComputeType by default).
|
||||
|
||||
Grouped convolution backward data supports tensors larger than 2GB (except when image is larger than 2GB).
|
||||
|
||||
List of the device operations for grouped convolution backward data in CK:
|
||||
|
||||
* **DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1** - Device operation with XDL instructions and support of fused operations to input.
|
||||
* **DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle** - Device operation with WMMA instructions.
|
||||
|
||||
Table of supported cases by instance factory with XDL instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|
|
||||
|bf16|2D, 3D|2D, 3D|2D, 3D|
|
||||
|fp16 |2D, 3D|2D, 3D|2D, 3D|
|
||||
|fp32 |2D, 3D|2D, 3D|2D, 3D|
|
||||
|
||||
Table of supported cases by instance factory with WMMA instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|
|
||||
|fp16 |2D, 3D|✗|2D, 3D|
|
||||
|int8 |2D, 3D|✗|2D, 3D|
|
||||
|
||||
Table of supported cases by instance factory with fused elementwise operation:
|
||||
|
||||
* **Bilinear** - 3D, NHWGC, bf16/fp16/fp32
|
||||
* **Scale** - 3D, NHWGC, bf16/fp16/fp32
|
||||
@@ -0,0 +1,226 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNHWK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 256;
|
||||
static constexpr ck::index_t K = 192;
|
||||
static constexpr ck::index_t C = 192;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
|
||||
|
||||
std::partial_sum(rbegin(in_lengths),
|
||||
std::prev(rend(in_lengths)),
|
||||
std::next(rbegin(in_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(wei_lengths),
|
||||
std::prev(rend(wei_lengths)),
|
||||
std::next(rbegin(wei_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(out_lengths),
|
||||
std::prev(rend(out_lengths)),
|
||||
std::next(rbegin(out_strides)),
|
||||
std::multiplies<>{});
|
||||
|
||||
// transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
|
||||
std::rotate(
|
||||
rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
|
||||
std::rotate(
|
||||
rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
|
||||
std::rotate(
|
||||
rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
|
||||
std::rotate(
|
||||
rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
|
||||
std::rotate(
|
||||
rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
|
||||
std::rotate(
|
||||
rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
|
||||
OutLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
InLayout,
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
InDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * G * K * Y * X * C +
|
||||
sizeof(OutDataType) * G * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NGCHW;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NGKHW;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 256;
|
||||
static constexpr ck::index_t K = 192;
|
||||
static constexpr ck::index_t C = 192;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_strides{
|
||||
C * Hi * Wi, G * C * Hi * Wi, Wi, 1, Hi * Wi};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{K * Y * X * C, Y * X * C, X * C, C, 1};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_strides{
|
||||
K * Ho * Wo, G * K * Ho * Wo, Wo, 1, Ho * Wo};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
|
||||
OutLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
InLayout,
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
InDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace_dev(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * G * K * Y * X * C +
|
||||
sizeof(OutDataType) * G * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 2;
|
||||
static constexpr ck::index_t N = 16;
|
||||
static constexpr ck::index_t K = 16;
|
||||
static constexpr ck::index_t C = 16;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 14;
|
||||
static constexpr ck::index_t Hi = 14;
|
||||
static constexpr ck::index_t Wi = 14;
|
||||
static constexpr ck::index_t Do = 14;
|
||||
static constexpr ck::index_t Ho = 14;
|
||||
static constexpr ck::index_t Wo = 14;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Di * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Do * Ho * Wo * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
|
||||
OutLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
InLayout,
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
InDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = sizeof(InDataType) * G * N * Di * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * G * K * Z * Y * X * C +
|
||||
sizeof(OutDataType) * G * N * Do * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,207 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 2;
|
||||
static constexpr ck::index_t N = 16;
|
||||
static constexpr ck::index_t K = 16;
|
||||
static constexpr ck::index_t C = 16;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 14;
|
||||
static constexpr ck::index_t Hi = 14;
|
||||
static constexpr ck::index_t Wi = 14;
|
||||
static constexpr ck::index_t Do = 14;
|
||||
static constexpr ck::index_t Ho = 14;
|
||||
static constexpr ck::index_t Wo = 14;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Di * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Do * Ho * Wo * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
|
||||
OutLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
InLayout,
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
InDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
ck::bf8_t,
|
||||
ck::f8_t>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = sizeof(InDataType) * G * N * Di * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * G * K * Z * Y * X * C +
|
||||
sizeof(OutDataType) * G * N * Do * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best instance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
}
|
||||
14
client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
Normal file
14
client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
Normal file
@@ -0,0 +1,14 @@
|
||||
add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_fp16.cpp)
|
||||
add_executable(client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp)
|
||||
add_executable(client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp)
|
||||
add_executable(client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp)
|
||||
|
||||
target_link_libraries(client_grouped_conv1d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
target_link_libraries(client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
target_link_libraries(client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
target_link_libraries(client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp)
|
||||
target_link_libraries(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
62
client_example/11_grouped_conv_bwd_weight/README.md
Normal file
62
client_example/11_grouped_conv_bwd_weight/README.md
Normal file
@@ -0,0 +1,62 @@
|
||||
[Back to supported operations](../../../include/ck/README.md)
|
||||
# Composable Kernel Grouped Convolution
|
||||
|
||||
## Grouped Convolution Backward Weight
|
||||
|
||||
Grouped convolution operation for 1D, 2D or 3D spatial dimensions. The convolution is computed with a GEMM kernel after a tensor coordinate transform. The backward weight version uses the splitK feature (due to the large GEMM K dimension). In CK, the Grouped Convolution Backward Weight operation is called `DeviceGroupedConvBwdWeight` and requires the following types as template parameters:
|
||||
|
||||
* **NumDimSpatial** - number of spatial dimensions (1D, 2D, 3D).
|
||||
* **InLayout** - input layout (NHWGC, GNHWC, NGCHW).
|
||||
* **WeiLayout** - weight layout (GKYXC).
|
||||
* **OutLayout** - output layout (NHWGK, GNHWK, NGKHW).
|
||||
* **InDataType** - input data type.
|
||||
* **WeiDataType** - weight data type.
|
||||
* **OutDataType** - output data type.
|
||||
* **InElementwiseOperation** - fused operation on tensor input.
|
||||
* **WeiElementwiseOperation** - fused operation on tensor weight.
|
||||
* **OutElementwiseOperation** - fused operation on tensor output.
|
||||
* **ComputeTypeA** - compute data type of tensor A for mfma instruction (ADataType by default).
|
||||
* **ComputeTypeB** - compute data type of tensor B for mfma instruction (ComputeTypeA by default).
|
||||
|
||||
For fused operations with additional tensors, there is the `DeviceGroupedConvBwdWeightMultipleD` operation, which requires the following parameters:
|
||||
* **DsLayout** - layouts for additional tensors for fused operations.
|
||||
* **DsDataType** - data types for additional tensors for fused operations.
|
||||
|
||||
Grouped convolution backward weight doesn't support tensors larger than 2GB.
|
||||
|
||||
List of the device operations for grouped convolution backward weight in CK:
|
||||
|
||||
* **DeviceGroupedConvBwdWeight_Xdl_CShuffle** - Device operation with XDL instructions.
|
||||
* **DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle** - Device operation with XDL instructions. Optimized for small C or K.
|
||||
* **DeviceGroupedConvBwdWeight_Wmma_CShuffle** - Device operation with WMMA instructions.
|
||||
* **DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle** - Device operation with XDL instructions and support of fused operations to output.
|
||||
* **DeviceGroupedConvBwdWeight_Dl** - Device operation with DL instructions.
|
||||
|
||||
Table of supported cases by instance factory with XDL instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|---|
|
||||
|bf16|2D, 3D|2D, 3D|2D, 3D|✗|
|
||||
|bf16(fp32 for weight)|2D, 3D|✗|✗|1D, 2D, 3D|
|
||||
|fp16 |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D|
|
||||
|fp32 |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D|
|
||||
|
||||
Table of supported cases by instance factory with WMMA instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|
|
||||
|fp16 |3D|✗|3D|
|
||||
|int8 |3D|✗|3D|
|
||||
|
||||
Table of supported cases by instance factory with DL instruction:
|
||||
|
||||
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|
||||
|-------|---|---|---|
|
||||
|bf16(fp32 for weight)|1D, 2D, 3D|✗|1D, 2D, 3D|
|
||||
|fp16 |1D, 2D, 3D|✗|1D, 2D, 3D|
|
||||
|fp32 |1D, 2D, 3D|✗|1D, 2D, 3D|
|
||||
|
||||
Table of supported cases by instance factory with fused elementwise operation:
|
||||
|
||||
* **Bilinear** - 3D, NHWGC, bf16(fp32 for weight)/fp16/fp32
|
||||
* **Scale** - 3D, NHWGC, bf16(fp32 for weight)/fp16/fp32
|
||||
239
client_example/11_grouped_conv_bwd_weight/common.hpp
Normal file
239
client_example/11_grouped_conv_bwd_weight/common.hpp
Normal file
@@ -0,0 +1,239 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
template <ck::index_t NumDimSpatial>
|
||||
std::size_t GetFlops(const std::array<ck::index_t, NumDimSpatial>& output_lengths,
|
||||
const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
|
||||
{
|
||||
constexpr ck::index_t spatial_offset = 3;
|
||||
const auto C = filter_lengths[2];
|
||||
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
|
||||
return static_cast<std::size_t>(2) * C *
|
||||
std::accumulate(std::begin(output_lengths),
|
||||
std::end(output_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>()) *
|
||||
std::accumulate(std::begin(filter_lengths) + spatial_offset,
|
||||
std::end(filter_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename InDataType, ck::index_t NumDimSpatial>
|
||||
std::size_t GetInputByte(const std::array<ck::index_t, NumDimSpatial>& input_lengths)
|
||||
{
|
||||
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
|
||||
return sizeof(InDataType) * (std::accumulate(std::begin(input_lengths),
|
||||
std::end(input_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename WeiDataType, ck::index_t NumDimSpatial>
|
||||
std::size_t GetWeightByte(const std::array<ck::index_t, NumDimSpatial>& filter_lengths)
|
||||
{
|
||||
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
|
||||
return sizeof(WeiDataType) * (std::accumulate(std::begin(filter_lengths),
|
||||
std::end(filter_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>()));
|
||||
}
|
||||
|
||||
template <typename OutDataType, ck::index_t NumDimSpatial>
|
||||
std::size_t GetOutputByte(const std::array<ck::index_t, NumDimSpatial>& output_lengths)
|
||||
{
|
||||
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
|
||||
return sizeof(OutDataType) * (std::accumulate(std::begin(output_lengths),
|
||||
std::end(output_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>()));
|
||||
}
|
||||
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename AComputeType = InDataType,
|
||||
typename BComputeType = AComputeType>
|
||||
bool run_grouped_conv_bwd_weight(
|
||||
const std::array<ck::index_t, NumDimSpatial + 3>& input_lengths,
|
||||
const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
|
||||
const std::array<ck::index_t, NumDimSpatial + 3>& filter_lengths,
|
||||
const std::array<ck::index_t, NumDimSpatial + 3>& weights_strides,
|
||||
const std::array<ck::index_t, NumDimSpatial + 3>& output_lengths,
|
||||
const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
|
||||
const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
|
||||
const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
|
||||
const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
|
||||
const std::array<ck::index_t, NumDimSpatial>& input_right_pads)
|
||||
{
|
||||
|
||||
ck::index_t split_k = 2;
|
||||
SimpleDeviceMem in(GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths));
|
||||
SimpleDeviceMem wei(GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths));
|
||||
SimpleDeviceMem out(GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
AComputeType,
|
||||
BComputeType>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
input_lengths,
|
||||
input_strides,
|
||||
filter_lengths,
|
||||
weights_strides,
|
||||
output_lengths,
|
||||
output_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
split_k);
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace_dev(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = GetFlops<NumDimSpatial + 3>(output_lengths, filter_lengths);
|
||||
std::size_t num_bytes = GetInputByte<InDataType, NumDimSpatial + 3>(input_lengths) +
|
||||
GetWeightByte<WeiDataType, NumDimSpatial + 3>(filter_lengths) +
|
||||
GetOutputByte<OutDataType, NumDimSpatial + 3>(output_lengths);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
input_lengths,
|
||||
input_strides,
|
||||
filter_lengths,
|
||||
weights_strides,
|
||||
output_lengths,
|
||||
output_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
split_k);
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNWK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 1;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 256;
|
||||
static constexpr ck::index_t K = 192;
|
||||
static constexpr ck::index_t C = 192;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Wi};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, X};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Wo};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, 1, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{K * X * C, X* C, 1, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, 1, K};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1};
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_bwd_weight<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(input_lengths,
|
||||
input_strides,
|
||||
filter_lengths,
|
||||
weights_strides,
|
||||
output_lengths,
|
||||
output_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads)
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNHWK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 256;
|
||||
static constexpr ck::index_t K = 192;
|
||||
static constexpr ck::index_t C = 192;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Hi, Wi};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Y, X};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Ho, Wo};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
|
||||
N * Hi * Wi * C, Hi* Wi* C, 1, Wi* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
|
||||
K * Y * X * C, Y* X* C, 1, X* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
|
||||
N * Ho * Wo * K, Ho* Wo* K, 1, Wo* K, K};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_bwd_weight<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(input_lengths,
|
||||
input_strides,
|
||||
filter_lengths,
|
||||
weights_strides,
|
||||
output_lengths,
|
||||
output_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads)
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 8;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 128;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
|
||||
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
|
||||
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
|
||||
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_bwd_weight<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(input_lengths,
|
||||
input_strides,
|
||||
filter_lengths,
|
||||
weights_strides,
|
||||
output_lengths,
|
||||
output_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads)
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
using AComputeType = ck::bf8_t;
|
||||
using BComputeType = ck::f8_t;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 8;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 128;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
|
||||
// Strides for the input, weight and output tensors, listed in the same dimension order as
// the corresponding *_lengths arrays (channel stride is 1 in all three).
// NOTE(review): the input and output strides below make G the slowest-varying dimension
// (memory order G, N, D, H, W, C/K), while InLayout/OutLayout in this file are tagged
// NDHWGC/NDHWGK. The sibling fp16/fp32 3D examples use the same stride formulas with
// GNDHWC/GNDHWK tags. Confirm which of the two (tag or strides) is intended here.
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
|
||||
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
|
||||
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
|
||||
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_bwd_weight<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
AComputeType,
|
||||
BComputeType>(input_lengths,
|
||||
input_strides,
|
||||
filter_lengths,
|
||||
weights_strides,
|
||||
output_lengths,
|
||||
output_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads)
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = float;
|
||||
using WeiDataType = float;
|
||||
using OutDataType = float;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 8;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 128;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
|
||||
N * Di * Hi * Wi * C, Di* Hi* Wi* C, 1, Hi* Wi* C, Wi* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
|
||||
K * Z * Y * X * C, Z* Y* X* C, 1, Y* X* C, X* C, C};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
|
||||
N * Do * Ho * Wo * K, Do* Ho* Wo* K, 1, Ho* Wo* K, Wo* K, K};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_bwd_weight<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(input_lengths,
|
||||
input_strides,
|
||||
filter_lengths,
|
||||
weights_strides,
|
||||
output_lengths,
|
||||
output_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads)
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
add_executable(client_elementwise_layernorm2d elementwise_layernorm2d.cpp)
|
||||
target_link_libraries(client_elementwise_layernorm2d PRIVATE composable_kernel::device_other_operations)
|
||||
@@ -0,0 +1,176 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
|
||||
|
||||
using ADataType = ck::half_t; // Input 1
|
||||
using BDataType = ck::half_t; // Input 2
|
||||
using XDataType = ck::half_t;
|
||||
using GammaDataType = ck::half_t;
|
||||
using BetaDataType = ck::half_t;
|
||||
using YDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using XElementwiseOperation = ck::tensor_operation::element_wise::Add;
|
||||
using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
constexpr int Rank = 2;
|
||||
constexpr int NumReduceDim = 1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
bool time_kernel = true;
|
||||
|
||||
ck::index_t M = 48 * 256;
|
||||
ck::index_t N = 1024;
|
||||
ck::index_t Stride = N;
|
||||
|
||||
auto mn_size = (M - 1) * Stride + N;
|
||||
|
||||
SimpleDeviceMem a_dev_buf(sizeof(ADataType) * mn_size);
|
||||
SimpleDeviceMem b_dev_buf(sizeof(BDataType) * mn_size);
|
||||
SimpleDeviceMem gamma_dev_buf(sizeof(GammaDataType) * N);
|
||||
SimpleDeviceMem beta_dev_buf(sizeof(BetaDataType) * N);
|
||||
SimpleDeviceMem y_dev_buf(sizeof(YDataType) * mn_size);
|
||||
|
||||
std::array<const void*, 2> ab_input = {a_dev_buf.GetDeviceBuffer(),
|
||||
b_dev_buf.GetDeviceBuffer()};
|
||||
std::vector<ck::index_t> abStride = {Stride, 1};
|
||||
std::array<std::vector<ck::index_t>, 2> abStrides = {abStride, abStride};
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
|
||||
ck::Tuple<ADataType, BDataType>,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
AccDataType,
|
||||
YDataType,
|
||||
XElementwiseOperation,
|
||||
YElementwiseOperation,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
|
||||
abStrides,
|
||||
{0, 1}, // gammaStrides
|
||||
{0, 1}, // betaStrides
|
||||
{Stride, 1}, // yStrides
|
||||
{1}, // reduceDims
|
||||
1e-4,
|
||||
ab_input,
|
||||
gamma_dev_buf.GetDeviceBuffer(),
|
||||
beta_dev_buf.GetDeviceBuffer(),
|
||||
y_dev_buf.GetDeviceBuffer(),
|
||||
XElementwiseOperation{},
|
||||
YElementwiseOperation{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte = sizeof(ADataType) * M * N + sizeof(BDataType) * M * N +
|
||||
sizeof(GammaDataType) * N + sizeof(BetaDataType) * N +
|
||||
sizeof(YDataType) * M * N;
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
|
||||
abStrides,
|
||||
{1}, // gammaStrides
|
||||
{1}, // betaStrides
|
||||
{Stride, 1}, // yStrides
|
||||
{1}, // reduceDims
|
||||
1e-4,
|
||||
ab_input,
|
||||
gamma_dev_buf.GetDeviceBuffer(),
|
||||
beta_dev_buf.GetDeviceBuffer(),
|
||||
y_dev_buf.GetDeviceBuffer(),
|
||||
XElementwiseOperation{},
|
||||
YElementwiseOperation{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
6
client_example/13_batchnorm/CMakeLists.txt
Normal file
6
client_example/13_batchnorm/CMakeLists.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
# Three batchnorm client examples (forward, backward, inference), one executable per
# source file, each linked privately against composable_kernel::device_other_operations.
add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp)
|
||||
add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp)
|
||||
add_executable(client_batchnorm_infer_nhwc batchnorm_infer_nhwc.cpp)
|
||||
target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_other_operations)
|
||||
target_link_libraries(client_batchnorm_bwd_nhwc PRIVATE composable_kernel::device_other_operations)
|
||||
target_link_libraries(client_batchnorm_infer_nhwc PRIVATE composable_kernel::device_other_operations)
|
||||
207
client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
Normal file
207
client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
Normal file
@@ -0,0 +1,207 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp"
|
||||
|
||||
// Element types used to select the DeviceBatchNormBwd instances.
using XDataType           = ck::half_t;
using DxDataType          = float;
using DyDataType          = float;
using AccDataType         = float;
using ScaleDataType       = ck::half_t;
using DscaleDbiasDataType = float;
using MeanVarDataType     = float;

// Elementwise functor passed to the device operation.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Tensor rank and number of dimensions the batchnorm reduces over.
constexpr int Rank                  = 4;
constexpr int NumBatchNormReduceDim = 3;

// Epsilon forwarded to the device operation (float machine epsilon, stored as double).
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
|
||||
std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
|
||||
std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
|
||||
|
||||
ck::index_t numXYElement =
|
||||
std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
1,
|
||||
std::multiplies<ck::index_t>());
|
||||
|
||||
SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
|
||||
SimpleDeviceMem dy(sizeof(DyDataType) * numXYElement);
|
||||
SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem dx(sizeof(DxDataType) * numXYElement);
|
||||
SimpleDeviceMem dscale(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem dbias(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
|
||||
DxDataType,
|
||||
DyDataType,
|
||||
AccDataType,
|
||||
ScaleDataType,
|
||||
DscaleDbiasDataType,
|
||||
MeanVarDataType,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumBatchNormReduceDim>;
|
||||
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
reduceDims,
|
||||
scaleBiasMeanVarLengths,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
x.GetDeviceBuffer(),
|
||||
dy.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
invVariance.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThrough{},
|
||||
dx.GetDeviceBuffer(),
|
||||
dscale.GetDeviceBuffer(),
|
||||
dbias.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes =
|
||||
numXYElement * (sizeof(XDataType) + sizeof(DyDataType) + sizeof(DxDataType)) +
|
||||
numScaleBiasMeanVarElement *
|
||||
(sizeof(ScaleDataType) + sizeof(DscaleDbiasDataType) * 2 +
|
||||
sizeof(MeanVarDataType) * 2);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
reduceDims,
|
||||
scaleBiasMeanVarLengths,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
x.GetDeviceBuffer(),
|
||||
dy.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
invVariance.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThrough{},
|
||||
dx.GetDeviceBuffer(),
|
||||
dscale.GetDeviceBuffer(),
|
||||
dbias.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
203
client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
Normal file
203
client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
Normal file
@@ -0,0 +1,203 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"
|
||||
|
||||
using XDataType = float;
|
||||
using YDataType = float;
|
||||
using AccDataType = float;
|
||||
using ScaleDataType = AccDataType;
|
||||
using BiasDataType = AccDataType;
|
||||
using MeanVarDataType = AccDataType;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumBatchNormReduceDim = 3;
|
||||
|
||||
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
const double averageFactor = 0.1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
|
||||
std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
|
||||
std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
|
||||
|
||||
ck::index_t numXYElement =
|
||||
std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
1,
|
||||
std::multiplies<ck::index_t>());
|
||||
|
||||
SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
|
||||
SimpleDeviceMem y(sizeof(YDataType) * numXYElement);
|
||||
SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd<XDataType,
|
||||
YDataType,
|
||||
AccDataType,
|
||||
ScaleDataType,
|
||||
BiasDataType,
|
||||
MeanVarDataType,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumBatchNormReduceDim>;
|
||||
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
reduceDims,
|
||||
scaleBiasMeanVarLengths,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
x.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThrough{},
|
||||
y.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
invVariance.GetDeviceBuffer(),
|
||||
averageFactor,
|
||||
nullptr,
|
||||
nullptr);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes =
|
||||
numXYElement * (sizeof(XDataType) + sizeof(YDataType)) +
|
||||
numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) +
|
||||
sizeof(MeanVarDataType) + sizeof(MeanVarDataType));
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
reduceDims,
|
||||
scaleBiasMeanVarLengths,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
x.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThrough{},
|
||||
y.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
invVariance.GetDeviceBuffer(),
|
||||
averageFactor,
|
||||
nullptr,
|
||||
nullptr);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
189
client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
Normal file
189
client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
Normal file
@@ -0,0 +1,189 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp"
|
||||
|
||||
using XDataType = float;
|
||||
using YDataType = float;
|
||||
using ScaleDataType = float;
|
||||
using BiasDataType = float;
|
||||
using MeanVarDataType = float;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumBatchNormReduceDim = 3;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
|
||||
std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
|
||||
std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
|
||||
std::array<int, Rank - NumBatchNormReduceDim> invariantDims{3};
|
||||
|
||||
ck::index_t numXYElement =
|
||||
std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
1,
|
||||
std::multiplies<ck::index_t>());
|
||||
|
||||
SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
|
||||
SimpleDeviceMem y(sizeof(YDataType) * numXYElement);
|
||||
SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem variance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
|
||||
// values in variance need be non-negative
|
||||
(void)hipMemset(
|
||||
variance.GetDeviceBuffer(), 0, sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
|
||||
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
|
||||
|
||||
int i = 0;
|
||||
for(auto dim : invariantDims)
|
||||
{
|
||||
assert(xyLengths[dim] == scaleBiasMeanVarLengths[i]);
|
||||
|
||||
aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i];
|
||||
i++;
|
||||
};
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
|
||||
ck::Tuple<YDataType>,
|
||||
Normalize,
|
||||
Rank>;
|
||||
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
{xyStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides},
|
||||
{xyStrides},
|
||||
{x.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
variance.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer()},
|
||||
{y.GetDeviceBuffer()},
|
||||
Normalize{epsilon});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes =
|
||||
numXYElement * (sizeof(XDataType) + sizeof(YDataType)) +
|
||||
numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) +
|
||||
sizeof(MeanVarDataType) + sizeof(MeanVarDataType));
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
{xyStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides},
|
||||
{xyStrides},
|
||||
{x.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
variance.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer()},
|
||||
{y.GetDeviceBuffer()},
|
||||
Normalize{epsilon});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
2
client_example/14_instance_id/CMakeLists.txt
Normal file
2
client_example/14_instance_id/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# Build the client_batchnorm_fwd_instance_id example from batchnorm_fwd_instance_id.cpp and
# link it privately against the composable_kernel::device_other_operations target.
add_executable(client_batchnorm_fwd_instance_id batchnorm_fwd_instance_id.cpp)
|
||||
target_link_libraries(client_batchnorm_fwd_instance_id PRIVATE composable_kernel::device_other_operations)
|
||||
206
client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
Normal file
206
client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
Normal file
@@ -0,0 +1,206 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"
|
||||
|
||||
using XDataType = float;
|
||||
using YDataType = float;
|
||||
using AccDataType = float;
|
||||
using ScaleDataType = AccDataType;
|
||||
using BiasDataType = AccDataType;
|
||||
using MeanVarDataType = AccDataType;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumBatchNormReduceDim = 3;
|
||||
|
||||
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
const double averageFactor = 0.1;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
// In the actual application, the instance index and name are usually from the perf db.
// Here main() fills them in after profiling: instance_index stays -1 until a best
// instance has been found, and instance_name receives that instance's GetTypeIdHashCode().
|
||||
static int instance_index = -1;
|
||||
static std::string instance_name;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
|
||||
std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
|
||||
std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
|
||||
|
||||
ck::index_t numXYElement =
|
||||
std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
1,
|
||||
std::multiplies<ck::index_t>());
|
||||
|
||||
SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
|
||||
SimpleDeviceMem y(sizeof(YDataType) * numXYElement);
|
||||
SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd<XDataType,
|
||||
YDataType,
|
||||
AccDataType,
|
||||
ScaleDataType,
|
||||
BiasDataType,
|
||||
MeanVarDataType,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumBatchNormReduceDim>;
|
||||
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
bool found = false;
|
||||
int best_op_index = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
|
||||
// profile device operation instances and save the best performant instance index and instance
|
||||
// name
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
reduceDims,
|
||||
scaleBiasMeanVarLengths,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
x.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThrough{},
|
||||
y.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
invVariance.GetDeviceBuffer(),
|
||||
averageFactor,
|
||||
nullptr,
|
||||
nullptr);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_index = i;
|
||||
best_ave_time = ave_time;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(found)
|
||||
{
|
||||
instance_index = best_op_index;
|
||||
instance_name = op_ptrs[instance_index]->GetTypeIdHashCode();
|
||||
};
|
||||
|
||||
// simulate the execution of the operation when the instance index and name are available
|
||||
const auto op_ptrs_2 = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
if(instance_index >= 0 && instance_index < op_ptrs_2.size())
|
||||
{
|
||||
auto& op_ptr = op_ptrs_2[instance_index];
|
||||
|
||||
if(op_ptr->GetTypeIdHashCode() == instance_name)
|
||||
{
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
xyStrides,
|
||||
xyStrides,
|
||||
reduceDims,
|
||||
scaleBiasMeanVarLengths,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
scaleBiasMeanVarStrides,
|
||||
x.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThrough{},
|
||||
y.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
invVariance.GetDeviceBuffer(),
|
||||
averageFactor,
|
||||
nullptr,
|
||||
nullptr);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float exec_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
size_t num_bytes = numXYElement * (sizeof(XDataType) + sizeof(YDataType)) +
|
||||
numScaleBiasMeanVarElement *
|
||||
(sizeof(ScaleDataType) + sizeof(BiasDataType) +
|
||||
sizeof(MeanVarDataType) + sizeof(MeanVarDataType));
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / exec_time;
|
||||
|
||||
std::cout << "Kernel execution time: " << std::setw(10) << exec_time
|
||||
<< " ms, effective data transfer bandwidth: " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
7
client_example/15_convnd_bwd_data/CMakeLists.txt
Normal file
7
client_example/15_convnd_bwd_data/CMakeLists.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_conv3d_bwd_data_fp16 conv3d_bwd_data_fp16.cpp)
|
||||
add_executable(client_conv3d_bwd_data_fp32 conv3d_bwd_data_fp32.cpp)
|
||||
|
||||
target_link_libraries(client_conv3d_bwd_data_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
target_link_libraries(client_conv3d_bwd_data_fp32 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
233
client_example/15_convnd_bwd_data/common.hpp
Normal file
233
client_example/15_convnd_bwd_data/common.hpp
Normal file
@@ -0,0 +1,233 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
/// Operation count of the convolution, computed in std::size_t as
/// 2 * N * K * C * <product of output spatial lengths> * <product of filter spatial lengths>.
std::size_t GetFlops(ck::index_t N,
                     ck::index_t K,
                     ck::index_t C,
                     const std::vector<ck::index_t>& output_spatial_lengths,
                     const std::vector<ck::index_t>& weights_spatial_lengths)
{
    // Product of the output spatial extents (1 for an empty list).
    std::size_t output_volume = 1;
    for(const ck::index_t length : output_spatial_lengths)
    {
        output_volume = output_volume * length;
    }

    // Product of the filter spatial extents (1 for an empty list).
    std::size_t filter_volume = 1;
    for(const ck::index_t length : weights_spatial_lengths)
    {
        filter_volume = filter_volume * length;
    }

    return static_cast<std::size_t>(2) * N * K * C * output_volume * filter_volume;
}
|
||||
|
||||
template <typename InDataType>
|
||||
std::size_t
|
||||
GetInputByte(ck::index_t N, ck::index_t C, const std::vector<ck::index_t>& input_spatial_lengths)
|
||||
{
|
||||
// sizeof(InDataType) * (N * C * <input spatial lengths product>) +
|
||||
return sizeof(InDataType) * N * C *
|
||||
std::accumulate(std::begin(input_spatial_lengths),
|
||||
std::end(input_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename WeiDataType>
|
||||
std::size_t
|
||||
GetWeightByte(ck::index_t K, ck::index_t C, const std::vector<ck::index_t>& weights_spatial_lengths)
|
||||
{
|
||||
// sizeof(WeiDataType) * (K * C * <filter spatial lengths product>) +
|
||||
return sizeof(WeiDataType) * K * C *
|
||||
std::accumulate(std::begin(weights_spatial_lengths),
|
||||
std::end(weights_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename OutDataType>
|
||||
std::size_t
|
||||
GetOutputByte(ck::index_t N, ck::index_t K, const std::vector<ck::index_t>& output_spatial_lengths)
|
||||
{
|
||||
// sizeof(OutDataType) * (N * K * <output spatial lengths product>);
|
||||
return sizeof(OutDataType) * N * K *
|
||||
std::accumulate(std::begin(output_spatial_lengths),
|
||||
std::end(output_spatial_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout>
|
||||
bool run_conv_bwd_data(ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
const std::vector<ck::index_t>& in_spatial_lengths,
|
||||
const std::vector<ck::index_t>& wei_spatial_lengths,
|
||||
const std::vector<ck::index_t>& out_spatial_lengths)
|
||||
{
|
||||
std::size_t in_mem_size = GetInputByte<InDataType>(N, C, in_spatial_lengths);
|
||||
std::size_t wei_mem_size = GetWeightByte<WeiDataType>(K, C, wei_spatial_lengths);
|
||||
std::size_t out_mem_size = GetOutputByte<OutDataType>(N, K, out_spatial_lengths);
|
||||
|
||||
SimpleDeviceMem in(in_mem_size);
|
||||
SimpleDeviceMem wei(wei_mem_size);
|
||||
SimpleDeviceMem out(out_mem_size);
|
||||
|
||||
std::vector<ck::index_t> filter_strides(NumDimSpatial, 1);
|
||||
std::vector<ck::index_t> filter_dilations(NumDimSpatial, 1);
|
||||
std::vector<ck::index_t> input_left_pads(NumDimSpatial, 1);
|
||||
std::vector<ck::index_t> input_right_pads(NumDimSpatial, 1);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
std::size_t flop = GetFlops(N, K, C, out_spatial_lengths, wei_spatial_lengths);
|
||||
std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
wei_spatial_lengths,
|
||||
out_spatial_lengths,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
wei_spatial_lengths,
|
||||
out_spatial_lengths,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
42
client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp
Normal file
42
client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp
Normal file
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::KZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_conv_bwd_data<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(N, K, C, {Di, Hi, Wi}, {Z, Y, X}, {Do, Ho, Wo})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
42
client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp
Normal file
42
client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp
Normal file
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = float;
|
||||
using WeiDataType = float;
|
||||
using OutDataType = float;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::KZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_conv_bwd_data<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(N, K, C, {Di, Hi, Wi}, {Z, Y, X}, {Do, Ho, Wo})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
4
client_example/15_gemm_add_multiply/CMakeLists.txt
Normal file
4
client_example/15_gemm_add_multiply/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_gemm_add_multiply gemm_add_multiply.cpp)
|
||||
target_link_libraries(client_gemm_add_multiply PRIVATE composable_kernel::device_gemm_operations)
|
||||
endif()
|
||||
242
client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
Normal file
242
client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
Normal file
@@ -0,0 +1,242 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AddMultiply = ck::tensor_operation::element_wise::AddMultiply;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AddMultiply;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using D0DataType = F16;
|
||||
using D1DataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using D0Layout = Row;
|
||||
using D1Layout = Row;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideD0 = 0;
|
||||
ck::index_t StrideD1 = 4096;
|
||||
ck::index_t StrideE = 4096;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 9)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideD0 = std::stoi(argv[6]);
|
||||
StrideD1 = std::stoi(argv[7]);
|
||||
StrideE = std::stoi(argv[8]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
|
||||
f_matrix_space_size(M, N, StrideD0, D0Layout{}));
|
||||
SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
|
||||
f_matrix_space_size(M, N, StrideD1, D1Layout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<D0Layout, D1Layout>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<D0DataType, D1DataType>,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
|
||||
d1_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 2>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
|
||||
d1_m_n_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 2>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
2
client_example/15_reduce/CMakeLists.txt
Normal file
2
client_example/15_reduce/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
add_executable(client_reduce_nhwc_c reduce_nhwc_c.cpp)
|
||||
target_link_libraries(client_reduce_nhwc_c PRIVATE composable_kernel::device_reduction_operations)
|
||||
175
client_example/15_reduce/reduce_nhwc_c.cpp
Normal file
175
client_example/15_reduce/reduce_nhwc_c.cpp
Normal file
@@ -0,0 +1,175 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"
|
||||
|
||||
using InDataType = float;
|
||||
using OutDataType = float;
|
||||
using AccDataType = float;
|
||||
using ReduceAdd = ck::reduce::Add;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide;
|
||||
|
||||
constexpr bool PropagateNan = false;
|
||||
constexpr bool OutputIndex = false;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, Rank> in_lengths{16, 8, 128, 256};
|
||||
std::array<ck::index_t, Rank> in_strides{8 * 128 * 256, 128 * 256, 256, 1};
|
||||
std::array<ck::index_t, Rank - NumReduceDim> out_lengths{256};
|
||||
std::array<ck::index_t, Rank - NumReduceDim> out_strides{1};
|
||||
std::array<int, NumReduceDim> reduce_dims{0, 1, 2};
|
||||
|
||||
ck::index_t num_in_elements =
|
||||
std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
ck::index_t num_out_elements =
|
||||
std::accumulate(out_lengths.begin(), out_lengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
ck::index_t reduce_length = 1;
|
||||
|
||||
for(auto dim : reduce_dims)
|
||||
reduce_length *= in_lengths[dim];
|
||||
|
||||
double alpha{1.0};
|
||||
double beta{0.0};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * num_in_elements);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceReduce<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
ReduceAdd,
|
||||
PassThrough,
|
||||
UnaryDivide,
|
||||
PropagateNan,
|
||||
OutputIndex>;
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
|
||||
in_strides,
|
||||
out_lengths,
|
||||
out_strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta,
|
||||
in.GetDeviceBuffer(),
|
||||
nullptr,
|
||||
out.GetDeviceBuffer(),
|
||||
nullptr,
|
||||
PassThrough{},
|
||||
UnaryDivide{reduce_length});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes = num_in_elements * sizeof(InDataType) +
|
||||
(beta == 0.0f ? 1 : 2) * num_out_elements * sizeof(OutDataType);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
|
||||
in_strides,
|
||||
out_lengths,
|
||||
out_strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta,
|
||||
in.GetDeviceBuffer(),
|
||||
nullptr,
|
||||
out.GetDeviceBuffer(),
|
||||
nullptr,
|
||||
PassThrough{},
|
||||
UnaryDivide{reduce_length});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
15
client_example/16_convnd_fwd/CMakeLists.txt
Normal file
15
client_example/16_convnd_fwd/CMakeLists.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
if((DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
|
||||
add_executable(client_conv3d_fwd_fp16 conv3d_fwd_fp16.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
endif()
|
||||
|
||||
if((DTYPES MATCHES "fp8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_conv3d_fwd_fp16_comp_fp8 conv3d_fwd_fp16_comp_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_fp16_comp_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
|
||||
if((DTYPES MATCHES "fp32") OR NOT DEFINED DTYPES)
|
||||
add_executable(client_conv3d_fwd_fp32 conv3d_fwd_fp32.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_fp32 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
304
client_example/16_convnd_fwd/common.hpp
Normal file
304
client_example/16_convnd_fwd/common.hpp
Normal file
@@ -0,0 +1,304 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
template <ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetFlops(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths,
|
||||
const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
|
||||
{
|
||||
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
|
||||
ck::index_t G = weights_lengths[0];
|
||||
ck::index_t N = output_lengths[1];
|
||||
ck::index_t K = weights_lengths[1];
|
||||
ck::index_t C = weights_lengths[2];
|
||||
|
||||
return static_cast<std::size_t>(2) * G * N * K * C *
|
||||
std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim),
|
||||
std::end(output_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>()) *
|
||||
std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim),
|
||||
std::end(weights_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename InDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetInputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& input_lengths)
|
||||
{
|
||||
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
|
||||
return sizeof(InDataType) * std::accumulate(std::begin(input_lengths),
|
||||
std::end(input_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename WeiDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetWeightByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
|
||||
{
|
||||
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
|
||||
return sizeof(WeiDataType) * std::accumulate(std::begin(weights_lengths),
|
||||
std::end(weights_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<>());
|
||||
}
|
||||
|
||||
template <typename OutDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
|
||||
std::size_t
|
||||
GetOutputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths)
|
||||
{
|
||||
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
|
||||
return sizeof(OutDataType) * std::accumulate(std::begin(output_lengths),
|
||||
std::end(output_lengths),
|
||||
static_cast<std::size_t>(1),
|
||||
std::multiplies<std::size_t>());
|
||||
}
|
||||
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
ck::index_t NumNonSpatialDim = 3,
|
||||
typename AComputeType = InDataType,
|
||||
typename BComputeType = AComputeType>
|
||||
bool run_grouped_conv_fwd(std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_lengths,
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_lengths,
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_lengths)
|
||||
{
|
||||
std::size_t in_mem_size = GetInputByte<InDataType, NumDimSpatial>(in_lengths);
|
||||
std::size_t wei_mem_size = GetWeightByte<WeiDataType, NumDimSpatial>(wei_lengths);
|
||||
std::size_t out_mem_size = GetOutputByte<OutDataType, NumDimSpatial>(out_lengths);
|
||||
|
||||
SimpleDeviceMem in(in_mem_size);
|
||||
SimpleDeviceMem wei(wei_mem_size);
|
||||
SimpleDeviceMem out(out_mem_size);
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_strides;
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_strides;
|
||||
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_strides;
|
||||
in_strides.fill(0);
|
||||
wei_strides.fill(0);
|
||||
out_strides.fill(0);
|
||||
in_strides.back() = 1;
|
||||
wei_strides.back() = 1;
|
||||
out_strides.back() = 1;
|
||||
|
||||
std::partial_sum(rbegin(in_lengths),
|
||||
std::prev(rend(in_lengths)),
|
||||
std::next(rbegin(in_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(wei_lengths),
|
||||
std::prev(rend(wei_lengths)),
|
||||
std::next(rbegin(wei_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(out_lengths),
|
||||
std::prev(rend(out_lengths)),
|
||||
std::next(rbegin(out_strides)),
|
||||
std::multiplies<>{});
|
||||
|
||||
// transpose NDHWGC/KZYXGC/NDHWGK to GNDHWC/GKZYXC/GNDHWK to GNCDHW/GKCZYX/GNKDHW
|
||||
std::rotate(std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 2), rend(in_lengths));
|
||||
std::rotate(rbegin(in_lengths),
|
||||
std::next(rbegin(in_lengths)),
|
||||
std::next(rbegin(in_lengths), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 2), rend(in_strides));
|
||||
std::rotate(rbegin(in_strides),
|
||||
std::next(rbegin(in_strides)),
|
||||
std::next(rbegin(in_strides), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(rbegin(wei_lengths),
|
||||
std::next(rbegin(wei_lengths)),
|
||||
std::next(rbegin(wei_lengths), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(rbegin(wei_strides),
|
||||
std::next(rbegin(wei_strides)),
|
||||
std::next(rbegin(wei_strides), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(
|
||||
std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 2), rend(out_lengths));
|
||||
std::rotate(rbegin(out_lengths),
|
||||
std::next(rbegin(out_lengths)),
|
||||
std::next(rbegin(out_lengths), NumDimSpatial + 1));
|
||||
|
||||
std::rotate(
|
||||
std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 2), rend(out_strides));
|
||||
std::rotate(rbegin(out_strides),
|
||||
std::next(rbegin(out_strides)),
|
||||
std::next(rbegin(out_strides), NumDimSpatial + 1));
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> conv_filter_strides;
|
||||
std::array<ck::index_t, NumDimSpatial> conv_filter_dilations;
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads;
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads;
|
||||
conv_filter_strides.fill(1);
|
||||
conv_filter_dilations.fill(1);
|
||||
input_left_pads.fill(1);
|
||||
input_right_pads.fill(1);
|
||||
|
||||
std::size_t flop = GetFlops<NumDimSpatial>(out_lengths, wei_lengths);
|
||||
std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size;
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
AComputeType,
|
||||
BComputeType>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{{}},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
44
client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
Normal file
44
client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
Normal file
@@ -0,0 +1,44 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
46
client_example/16_convnd_fwd/conv3d_fwd_fp16_comp_fp8.cpp
Normal file
46
client_example/16_convnd_fwd/conv3d_fwd_fp16_comp_fp8.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
ck::f8_t>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
44
client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
Normal file
44
client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
Normal file
@@ -0,0 +1,44 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = float;
|
||||
using WeiDataType = float;
|
||||
using OutDataType = float;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
4
client_example/17_grouped_gemm_fastgelu/CMakeLists.txt
Normal file
4
client_example/17_grouped_gemm_fastgelu/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_gemm_fastgelu grouped_gemm_fastgelu.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fastgelu PRIVATE composable_kernel::device_gemm_operations)
|
||||
endif()
|
||||
@@ -0,0 +1,232 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fastgelu.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using DsLayout = ck::Tuple<>;
|
||||
using ELayout = Row;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = FastGelu;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::mt19937 gen(19391);
|
||||
std::uniform_int_distribution<> distrib(1, 10);
|
||||
int group_count = distrib(gen);
|
||||
|
||||
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
Ms.push_back(256 + 256 * distrib(gen));
|
||||
Ns.push_back(256 + 256 * distrib(gen));
|
||||
Ks.push_back(128 + 128 * distrib(gen));
|
||||
|
||||
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
|
||||
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
|
||||
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, e_dev_bufs;
|
||||
|
||||
a_dev_bufs.reserve(group_count);
|
||||
b_dev_bufs.reserve(group_count);
|
||||
e_dev_bufs.reserve(group_count);
|
||||
|
||||
std::vector<const void*> p_a, p_b;
|
||||
std::vector<void*> p_e;
|
||||
|
||||
p_a.reserve(group_count);
|
||||
p_b.reserve(group_count);
|
||||
p_e.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
|
||||
|
||||
gemm_descs.reserve(group_count);
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
a_dev_bufs.emplace_back(sizeof(ADataType) *
|
||||
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
|
||||
b_dev_bufs.emplace_back(sizeof(BDataType) *
|
||||
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
|
||||
e_dev_bufs.emplace_back(sizeof(EDataType) *
|
||||
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
|
||||
|
||||
gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideEs[i], {}});
|
||||
|
||||
p_a.push_back(a_dev_bufs[i].GetDeviceBuffer());
|
||||
p_b.push_back(b_dev_bufs[i].GetDeviceBuffer());
|
||||
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
|
||||
}
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
|
||||
BLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
auto p_ds = std::vector<std::array<const void*, 0>>{};
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
SimpleDeviceMem gemm_desc_workspace(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = 0, num_btype = 0;
|
||||
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
|
||||
{
|
||||
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
|
||||
|
||||
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
|
||||
sizeof(EDataType) * Ms[j] * Ns[j];
|
||||
}
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
SimpleDeviceMem gemm_desc_workspace(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
8
client_example/18_groupnorm/CMakeLists.txt
Normal file
8
client_example/18_groupnorm/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
add_executable(client_groupnorm_bwd_data groupnorm_bwd_data.cpp)
|
||||
target_link_libraries(client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_groupnorm_bwd_gamma_beta groupnorm_bwd_gamma_beta.cpp)
|
||||
target_link_libraries(client_groupnorm_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations)
|
||||
|
||||
add_executable(client_groupnorm_swish_fwd groupnorm_swish_fwd.cpp)
|
||||
target_link_libraries(client_groupnorm_swish_fwd PRIVATE composable_kernel::device_other_operations)
|
||||
182
client_example/18_groupnorm/groupnorm_bwd_data.cpp
Normal file
182
client_example/18_groupnorm/groupnorm_bwd_data.cpp
Normal file
@@ -0,0 +1,182 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_data.hpp"
|
||||
|
||||
using DYDataType = float;
|
||||
using XDataType = float;
|
||||
using GammaDataType = float;
|
||||
using MeanInvStdDataType = float;
|
||||
using DXDataType = float;
|
||||
|
||||
constexpr int Rank = 5;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 32;
|
||||
ck::index_t H = 16;
|
||||
ck::index_t W = 16;
|
||||
ck::index_t G = 64;
|
||||
ck::index_t C = 128;
|
||||
|
||||
std::size_t length = N * H * W * G * C;
|
||||
|
||||
std::vector<ck::index_t> strideDy = {H * W * G * C, W * G * C, G * C, C, 1};
|
||||
std::vector<ck::index_t> strideX = strideDy;
|
||||
std::vector<ck::index_t> strideDx = strideDy;
|
||||
|
||||
std::vector<ck::index_t> strideGamma = {0, 0, 0, C, 1};
|
||||
std::vector<ck::index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
|
||||
|
||||
SimpleDeviceMem dy_dev(sizeof(DYDataType) * length);
|
||||
SimpleDeviceMem x_dev(sizeof(XDataType) * length);
|
||||
SimpleDeviceMem gamma_dev(sizeof(GammaDataType) * G * C);
|
||||
SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G);
|
||||
SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G);
|
||||
SimpleDeviceMem dx_dev(sizeof(DXDataType) * length);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
|
||||
XDataType,
|
||||
GammaDataType,
|
||||
MeanInvStdDataType,
|
||||
DXDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
|
||||
strideDy,
|
||||
strideX,
|
||||
strideGamma,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
strideDx,
|
||||
{1, 2, 4}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
gamma_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dx_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte = sizeof(DYDataType) * length + sizeof(XDataType) * length +
|
||||
sizeof(GammaDataType) * G * C +
|
||||
sizeof(MeanInvStdDataType) * N * G * 2 +
|
||||
sizeof(DXDataType) * length;
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
|
||||
strideDy,
|
||||
strideX,
|
||||
strideGamma,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
strideDx,
|
||||
{1, 2, 4}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
gamma_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dx_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
180
client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
Normal file
180
client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp"
|
||||
|
||||
using DYDataType = float;
|
||||
using XDataType = float;
|
||||
using GammaDataType = float;
|
||||
using MeanInvStdDataType = float;
|
||||
using DGammaDataType = float;
|
||||
using DBetaDataType = float;
|
||||
|
||||
constexpr int Rank = 5;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 32;
|
||||
ck::index_t H = 16;
|
||||
ck::index_t W = 16;
|
||||
ck::index_t G = 64;
|
||||
ck::index_t C = 128;
|
||||
|
||||
std::size_t length = N * H * W * G * C;
|
||||
|
||||
std::vector<ck::index_t> strideDy = {H * W * G * C, W * G * C, G * C, C, 1};
|
||||
std::vector<ck::index_t> strideX = strideDy;
|
||||
std::vector<ck::index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
|
||||
std::vector<ck::index_t> strideDGammaBeta = {C, 1};
|
||||
|
||||
SimpleDeviceMem dy_dev(sizeof(DYDataType) * length);
|
||||
SimpleDeviceMem x_dev(sizeof(XDataType) * length);
|
||||
SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G);
|
||||
SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G);
|
||||
SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * G * C);
|
||||
SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * G * C);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
|
||||
XDataType,
|
||||
MeanInvStdDataType,
|
||||
DGammaDataType,
|
||||
DBetaDataType,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::size_t num_bytes = sizeof(DYDataType) * length + sizeof(XDataType) * length +
|
||||
sizeof(GammaDataType) * G * C + sizeof(MeanInvStdDataType) * N * G * 2 +
|
||||
sizeof(DGammaDataType) * G * C + sizeof(DBetaDataType) * G * C;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
|
||||
strideDy,
|
||||
strideX,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
{G, C},
|
||||
strideDGammaBeta,
|
||||
strideDGammaBeta,
|
||||
{0, 1, 2}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best instance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
|
||||
strideDy,
|
||||
strideX,
|
||||
strideMeanInvStd,
|
||||
strideMeanInvStd,
|
||||
{G, C},
|
||||
strideDGammaBeta,
|
||||
strideDGammaBeta,
|
||||
{0, 1, 2}, // reduceDims
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
x_dev.GetDeviceBuffer(),
|
||||
mean_dev.GetDeviceBuffer(),
|
||||
inv_std_dev.GetDeviceBuffer(),
|
||||
dgamma_dev.GetDeviceBuffer(),
|
||||
dbeta_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
236
client_example/18_groupnorm/groupnorm_swish_fwd.cpp
Normal file
236
client_example/18_groupnorm/groupnorm_swish_fwd.cpp
Normal file
@@ -0,0 +1,236 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/normalization_fwd_swish.hpp"
|
||||
|
||||
// Element types used to size the device buffers and to select the DeviceNormalizationFwd
// instances in main().
using XDataType = ck::half_t;
|
||||
using GammaDataType = float;
|
||||
using BetaDataType = float;
|
||||
using YDataType = ck::half_t;
|
||||
using SaveMeanInvStdDataType = float;
|
||||
// Elementwise operation passed to the normalization op (type argument and Swish{} value).
using Swish = ck::tensor_operation::element_wise::Swish;
|
||||
|
||||
// When defined, main() allocates the save_mean / save_inv_std buffers and passes them to the
// op; otherwise nullptr is passed for both.
#define SAVE_MEAN_INV_STD
|
||||
|
||||
// Number of tensor dimensions; main() passes five lengths {N, H, W, G, C}.
constexpr int Rank = 5;
|
||||
// Number of reduced dimensions; main() passes three reduce dims {1, 2, 4}.
constexpr int NumReduceDim = 3;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 32;
|
||||
ck::index_t H = 16;
|
||||
ck::index_t W = 16;
|
||||
ck::index_t G = 64;
|
||||
ck::index_t C = 128;
|
||||
|
||||
std::size_t xy_size = N * H * W * G * C;
|
||||
std::size_t gamma_beta_size = G * C;
|
||||
|
||||
std::vector<ck::index_t> xy_strides = {H * W * G * C, W * G * C, G * C, C, 1};
|
||||
std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
|
||||
std::vector<ck::index_t> save_mean_inv_std_strides = {G, 1};
|
||||
|
||||
SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
|
||||
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
|
||||
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
|
||||
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * N * G);
|
||||
SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * N * G);
|
||||
#endif
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
Swish,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto& generic_op_ptr = op_ptrs[0];
|
||||
|
||||
auto generic_argument_ptr =
|
||||
generic_op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
|
||||
xy_strides, // xStrides
|
||||
gamma_beta_strides, // gammaStrides
|
||||
gamma_beta_strides, // betaStrides
|
||||
xy_strides, // yStrides
|
||||
save_mean_inv_std_strides, // save_mean Strides
|
||||
save_mean_inv_std_strides, // save_inv_std Strides
|
||||
{1, 2, 4}, // reduceDims
|
||||
1e-6,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
save_mean_device_buf.GetDeviceBuffer(),
|
||||
save_inv_std_device_buf.GetDeviceBuffer(),
|
||||
#else
|
||||
nullptr,
|
||||
nullptr,
|
||||
#endif
|
||||
Swish{});
|
||||
|
||||
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The generic kernel instance should be able to support any input shapes");
|
||||
};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
|
||||
xy_strides, // xStrides
|
||||
gamma_beta_strides, // gammaStrides
|
||||
gamma_beta_strides, // betaStrides
|
||||
xy_strides, // yStrides
|
||||
save_mean_inv_std_strides, // save_mean Strides
|
||||
save_mean_inv_std_strides, // save_inv_std Strides
|
||||
{1, 2, 4}, // reduceDims
|
||||
1e-6,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
save_mean_device_buf.GetDeviceBuffer(),
|
||||
save_inv_std_device_buf.GetDeviceBuffer(),
|
||||
#else
|
||||
nullptr,
|
||||
nullptr,
|
||||
#endif
|
||||
Swish{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_byte =
|
||||
sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
|
||||
sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
|
||||
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
num_byte += sizeof(SaveMeanInvStdDataType) * N * G * 2;
|
||||
#endif
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
|
||||
xy_strides, // xStrides
|
||||
gamma_beta_strides, // gammaStrides
|
||||
gamma_beta_strides, // betaStrides
|
||||
xy_strides, // yStrides
|
||||
save_mean_inv_std_strides, // save_mean Strides
|
||||
save_mean_inv_std_strides, // save_inv_std Strides
|
||||
{1, 2, 4}, // reduceDims
|
||||
1e-6,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
#ifdef SAVE_MEAN_INV_STD
|
||||
save_mean_device_buf.GetDeviceBuffer(),
|
||||
save_inv_std_device_buf.GetDeviceBuffer(),
|
||||
#else
|
||||
nullptr,
|
||||
nullptr,
|
||||
#endif
|
||||
Swish{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
11
client_example/19_pool/CMakeLists.txt
Normal file
11
client_example/19_pool/CMakeLists.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
# One client executable per pooling example: client_<name> is built from <name>.cpp and linked
# against the "other operations" instance library.
foreach(pool_example max_pool2d_fwd max_pool2d_bwd avg_pool3d_fwd avg_pool3d_bwd)
    add_executable(client_${pool_example} ${pool_example}.cpp)
    target_link_libraries(client_${pool_example} PRIVATE composable_kernel::device_other_operations)
endforeach()
|
||||
191
client_example/19_pool/avg_pool3d_bwd.cpp
Normal file
191
client_example/19_pool/avg_pool3d_bwd.cpp
Normal file
@@ -0,0 +1,191 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
|
||||
|
||||
// Element types of the dout and din device buffers allocated in main().
using DOutDataType = ck::half_t;
|
||||
using DInDataType = ck::half_t;
|
||||
|
||||
// Layout tags passed to DeviceAvgPoolBwd; both tensors use NDHWC.
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
using DInLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size)
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
std::size_t mMemSize_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 2;
|
||||
ck::index_t C = 32;
|
||||
ck::index_t Z = 2;
|
||||
ck::index_t Y = 2;
|
||||
ck::index_t X = 2;
|
||||
ck::index_t Di = 30;
|
||||
ck::index_t Hi = 30;
|
||||
ck::index_t Wi = 30;
|
||||
ck::index_t window_stride_d = 2;
|
||||
ck::index_t window_stride_h = 2;
|
||||
ck::index_t window_stride_w = 2;
|
||||
ck::index_t window_dilation_d = 1;
|
||||
ck::index_t window_dilation_h = 1;
|
||||
ck::index_t window_dilation_w = 1;
|
||||
ck::index_t in_left_pad_d = 1;
|
||||
ck::index_t in_left_pad_h = 1;
|
||||
ck::index_t in_left_pad_w = 1;
|
||||
ck::index_t in_right_pad_d = 1;
|
||||
ck::index_t in_right_pad_h = 1;
|
||||
ck::index_t in_right_pad_w = 1;
|
||||
|
||||
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
|
||||
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
|
||||
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
|
||||
ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
|
||||
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
|
||||
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
|
||||
|
||||
// Pool API only support the order of NCDHW
|
||||
std::vector<ck::index_t> in_length = {N, C, Di, Hi, Wi};
|
||||
std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo};
|
||||
std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
|
||||
std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
|
||||
std::vector<ck::index_t> window_dilations{
|
||||
window_dilation_d, window_dilation_h, window_dilation_w};
|
||||
std::vector<ck::index_t> input_left_pads = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
|
||||
std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
|
||||
|
||||
std::size_t in_tensor_size = N * C * Di * Hi * Wi;
|
||||
std::size_t out_tensor_size = N * C * Do * Ho * Wo;
|
||||
|
||||
// tensor layout = NDHWC
|
||||
std::vector<ck::index_t> in_tensor_stride = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
|
||||
std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
|
||||
|
||||
SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
|
||||
SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::
|
||||
DeviceAvgPoolBwd<3, DOutDataType, DInDataType, DOutLayout, DInLayout>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
|
||||
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
|
||||
out_length,
|
||||
in_length,
|
||||
out_tensor_stride,
|
||||
in_tensor_stride,
|
||||
window_spatial_lengths,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
din_device_buf.SetZero();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes =
|
||||
in_tensor_size * sizeof(DInDataType) + out_tensor_size * sizeof(DOutDataType);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
|
||||
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
|
||||
out_length,
|
||||
in_length,
|
||||
out_tensor_stride,
|
||||
in_tensor_stride,
|
||||
window_spatial_lengths,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
din_device_buf.SetZero();
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
213
client_example/19_pool/avg_pool3d_fwd.cpp
Normal file
213
client_example/19_pool/avg_pool3d_fwd.cpp
Normal file
@@ -0,0 +1,213 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
|
||||
|
||||
// Element types of the input and output device buffers allocated in main().
using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
// Element type of the optional index output; only counted in the byte total when OutputIndex
// is true.
using IndexDataType = int32_t;
|
||||
|
||||
// Layout tags passed to DevicePoolFwd; both tensors use NDHWC.
using InLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
|
||||
// Rank of the in/out tensors (five lengths are passed) and of the pooling window (three).
constexpr ck::index_t InOutRank = 5;
|
||||
constexpr ck::index_t WindowRank = 3;
|
||||
// Disabled alternative configuration: max pooling instead of average pooling.
#if 0
|
||||
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
|
||||
constexpr bool OutputIndex = false;
|
||||
#else
|
||||
// Active configuration: average pooling without an index output.
constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
|
||||
constexpr bool OutputIndex = false;
|
||||
#endif
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 2;
|
||||
ck::index_t C = 32;
|
||||
ck::index_t Z = 2;
|
||||
ck::index_t Y = 2;
|
||||
ck::index_t X = 2;
|
||||
ck::index_t Di = 30;
|
||||
ck::index_t Hi = 30;
|
||||
ck::index_t Wi = 30;
|
||||
ck::index_t window_stride_d = 2;
|
||||
ck::index_t window_stride_h = 2;
|
||||
ck::index_t window_stride_w = 2;
|
||||
ck::index_t window_dilation_d = 1;
|
||||
ck::index_t window_dilation_h = 1;
|
||||
ck::index_t window_dilation_w = 1;
|
||||
ck::index_t in_left_pad_d = 1;
|
||||
ck::index_t in_left_pad_h = 1;
|
||||
ck::index_t in_left_pad_w = 1;
|
||||
ck::index_t in_right_pad_d = 1;
|
||||
ck::index_t in_right_pad_h = 1;
|
||||
ck::index_t in_right_pad_w = 1;
|
||||
|
||||
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
|
||||
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
|
||||
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
|
||||
ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
|
||||
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
|
||||
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
|
||||
|
||||
// Pool API only support the order of NCDHW
|
||||
std::vector<ck::index_t> in_length = {N, C, Di, Hi, Wi};
|
||||
std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo};
|
||||
std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
|
||||
std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
|
||||
std::vector<ck::index_t> window_dilations{
|
||||
window_dilation_d, window_dilation_h, window_dilation_w};
|
||||
std::vector<ck::index_t> input_left_pads = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
|
||||
std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
|
||||
|
||||
std::size_t in_tensor_size = N * C * Di * Hi * Wi;
|
||||
std::size_t out_tensor_size = N * C * Do * Ho * Wo;
|
||||
|
||||
// tensor layout = NDHWC
|
||||
std::vector<ck::index_t> in_tensor_stride = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
|
||||
std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
|
||||
|
||||
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
|
||||
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
|
||||
WindowRank,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
IndexDataType,
|
||||
InLayout,
|
||||
OutLayout,
|
||||
ReduceOpId,
|
||||
OutputIndex>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
nullptr,
|
||||
in_length,
|
||||
window_spatial_lengths,
|
||||
out_length,
|
||||
in_tensor_stride,
|
||||
out_tensor_stride,
|
||||
out_tensor_stride,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
{2, 3, 4});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes =
|
||||
in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
|
||||
|
||||
if constexpr(OutputIndex)
|
||||
num_bytes += out_tensor_size * sizeof(IndexDataType);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
nullptr,
|
||||
in_length,
|
||||
window_spatial_lengths,
|
||||
out_length,
|
||||
in_tensor_stride,
|
||||
out_tensor_stride,
|
||||
out_tensor_stride,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
{2, 3, 4});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
280
client_example/19_pool/max_pool2d_bwd.cpp
Normal file
280
client_example/19_pool/max_pool2d_bwd.cpp
Normal file
@@ -0,0 +1,280 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
|
||||
|
||||
// Element types of the device buffers allocated in main(): forward input/output, the
// backward dout/din tensors, and the index tensor produced by the forward max pool.
using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
using DOutDataType = ck::half_t;
|
||||
using DInDataType = ck::half_t;
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
// We use pool3d to implement pool2d in this example
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
|
||||
// Ranks after the 2D problem has been expanded to 3D: five tensor dims, three window dims.
constexpr ck::index_t InOutRank = 5;
|
||||
constexpr ck::index_t WindowRank = 3;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
|
||||
std::vector<ck::index_t>& window_lengths,
|
||||
std::vector<ck::index_t>& output_lengths,
|
||||
std::vector<ck::index_t>& input_stride,
|
||||
std::vector<ck::index_t>& output_stride,
|
||||
std::vector<ck::index_t>& indices_stride,
|
||||
std::vector<ck::index_t>& window_strides,
|
||||
std::vector<ck::index_t>& window_dilations,
|
||||
std::vector<ck::index_t>& input_left_pads,
|
||||
std::vector<ck::index_t>& input_right_pads,
|
||||
std::vector<ck::index_t>& pooling_dims)
|
||||
{
|
||||
// NCHW to NCDHW
|
||||
input_lengths.insert(input_lengths.begin() + 2, 1);
|
||||
output_lengths.insert(output_lengths.begin() + 2, 1);
|
||||
input_stride.insert(input_stride.begin() + 2, 0);
|
||||
output_stride.insert(output_stride.begin() + 2, 0);
|
||||
indices_stride.insert(indices_stride.begin() + 2, 0);
|
||||
|
||||
// YX to ZYX
|
||||
window_lengths.insert(window_lengths.begin(), 1);
|
||||
window_strides.insert(window_strides.begin(), 0);
|
||||
window_dilations.insert(window_dilations.begin(), 0);
|
||||
input_left_pads.insert(input_left_pads.begin(), 0);
|
||||
input_right_pads.insert(input_right_pads.begin(), 0);
|
||||
|
||||
pooling_dims = {2, 3, 4};
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 2;
|
||||
ck::index_t C = 32;
|
||||
ck::index_t Y = 2;
|
||||
ck::index_t X = 2;
|
||||
ck::index_t Hi = 30;
|
||||
ck::index_t Wi = 30;
|
||||
ck::index_t window_stride_h = 2;
|
||||
ck::index_t window_stride_w = 2;
|
||||
ck::index_t window_dilation_h = 1;
|
||||
ck::index_t window_dilation_w = 1;
|
||||
ck::index_t in_left_pad_h = 1;
|
||||
ck::index_t in_left_pad_w = 1;
|
||||
ck::index_t in_right_pad_h = 1;
|
||||
ck::index_t in_right_pad_w = 1;
|
||||
|
||||
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
|
||||
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
|
||||
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
|
||||
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
|
||||
|
||||
// Pool API only support the order of NCHW
|
||||
std::vector<ck::index_t> in_length = {N, C, Hi, Wi};
|
||||
std::vector<ck::index_t> out_length = {N, C, Ho, Wo};
|
||||
std::vector<ck::index_t> window_spatial_lengths = {Y, X};
|
||||
std::vector<ck::index_t> window_strides = {window_stride_h, window_stride_w};
|
||||
std::vector<ck::index_t> window_dilations = {window_dilation_h, window_dilation_w};
|
||||
std::vector<ck::index_t> input_left_pads = {in_left_pad_h, in_left_pad_w};
|
||||
std::vector<ck::index_t> input_right_pads = {in_right_pad_h, in_right_pad_w};
|
||||
std::vector<ck::index_t> pooling_dims = {2, 3};
|
||||
|
||||
std::size_t in_tensor_size = N * C * Hi * Wi;
|
||||
std::size_t out_tensor_size = N * C * Ho * Wo;
|
||||
|
||||
// tensor layout = NHWC
|
||||
std::vector<ck::index_t> in_tensor_stride = {C * Hi * Wi, 1, Wi * C, C};
|
||||
std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
|
||||
|
||||
TransformPool2dparamToPool3d(in_length,
|
||||
window_spatial_lengths,
|
||||
out_length,
|
||||
in_tensor_stride,
|
||||
out_tensor_stride,
|
||||
out_tensor_stride,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
pooling_dims);
|
||||
|
||||
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
|
||||
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
|
||||
SimpleDeviceMem indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
|
||||
SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
|
||||
SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);
|
||||
|
||||
// Generate index data from max pool forward
|
||||
{
|
||||
using MaxPoolFwdDeviceOp =
|
||||
ck::tensor_operation::device::DevicePoolFwd<InOutRank,
|
||||
WindowRank,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
IndexDataType,
|
||||
InLayout,
|
||||
OutLayout,
|
||||
ck::ReduceTensorOp::MAX,
|
||||
true>;
|
||||
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
MaxPoolFwdDeviceOp>::GetInstances();
|
||||
|
||||
auto& op_ptr = op_ptrs[0];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
|
||||
in_length,
|
||||
window_spatial_lengths,
|
||||
out_length,
|
||||
in_tensor_stride,
|
||||
out_tensor_stride,
|
||||
out_tensor_stride,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
pooling_dims);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
}
|
||||
|
||||
// Run MaxPool bwd
|
||||
using MaxPoolBwdDeviceOp =
|
||||
ck::tensor_operation::device::DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>;
|
||||
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
MaxPoolBwdDeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<InDataType*>(dout_device_buf.GetDeviceBuffer()),
|
||||
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
|
||||
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
|
||||
out_tensor_size,
|
||||
in_tensor_size,
|
||||
window_spatial_lengths,
|
||||
window_strides,
|
||||
window_dilations);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes = in_tensor_size * sizeof(DInDataType) +
|
||||
out_tensor_size * sizeof(IndexDataType) +
|
||||
out_tensor_size * sizeof(DOutDataType);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << "GB / s,"
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<InDataType*>(dout_device_buf.GetDeviceBuffer()),
|
||||
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
|
||||
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
|
||||
out_tensor_size,
|
||||
in_tensor_size,
|
||||
window_spatial_lengths,
|
||||
window_strides,
|
||||
window_dilations);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
SimpleDeviceMem workspace(workspace_sz);
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
248
client_example/19_pool/max_pool2d_fwd.cpp
Normal file
248
client_example/19_pool/max_pool2d_fwd.cpp
Normal file
@@ -0,0 +1,248 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
// We use pool3d to implement pool2d in this example
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWC;
|
||||
|
||||
constexpr ck::index_t InOutRank = 5;
|
||||
constexpr ck::index_t WindowRank = 3;
|
||||
#if 1
|
||||
constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
|
||||
constexpr bool OutputIndex = true;
|
||||
#else
|
||||
constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
|
||||
constexpr bool OutputIndex = false;
|
||||
#endif
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
|
||||
std::vector<ck::index_t>& window_lengths,
|
||||
std::vector<ck::index_t>& output_lengths,
|
||||
std::vector<ck::index_t>& input_stride,
|
||||
std::vector<ck::index_t>& output_stride,
|
||||
std::vector<ck::index_t>& indices_stride,
|
||||
std::vector<ck::index_t>& window_strides,
|
||||
std::vector<ck::index_t>& window_dilations,
|
||||
std::vector<ck::index_t>& input_left_pads,
|
||||
std::vector<ck::index_t>& input_right_pads,
|
||||
std::vector<ck::index_t>& pooling_dims)
|
||||
{
|
||||
// NCHW to NCDHW
|
||||
input_lengths.insert(input_lengths.begin() + 2, 1);
|
||||
output_lengths.insert(output_lengths.begin() + 2, 1);
|
||||
input_stride.insert(input_stride.begin() + 2, 0);
|
||||
output_stride.insert(output_stride.begin() + 2, 0);
|
||||
indices_stride.insert(indices_stride.begin() + 2, 0);
|
||||
|
||||
// YX to ZYX
|
||||
window_lengths.insert(window_lengths.begin(), 1);
|
||||
window_strides.insert(window_strides.begin(), 0);
|
||||
window_dilations.insert(window_dilations.begin(), 0);
|
||||
input_left_pads.insert(input_left_pads.begin(), 0);
|
||||
input_right_pads.insert(input_right_pads.begin(), 0);
|
||||
|
||||
pooling_dims = {2, 3, 4};
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t N = 2;
|
||||
ck::index_t C = 32;
|
||||
ck::index_t Y = 2;
|
||||
ck::index_t X = 2;
|
||||
ck::index_t Hi = 30;
|
||||
ck::index_t Wi = 30;
|
||||
ck::index_t window_stride_h = 2;
|
||||
ck::index_t window_stride_w = 2;
|
||||
ck::index_t window_dilation_h = 1;
|
||||
ck::index_t window_dilation_w = 1;
|
||||
ck::index_t in_left_pad_h = 1;
|
||||
ck::index_t in_left_pad_w = 1;
|
||||
ck::index_t in_right_pad_h = 1;
|
||||
ck::index_t in_right_pad_w = 1;
|
||||
|
||||
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
|
||||
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
|
||||
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
|
||||
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
|
||||
|
||||
// Pool API only support the order of NCHW
|
||||
std::vector<ck::index_t> in_length = {N, C, Hi, Wi};
|
||||
std::vector<ck::index_t> out_length = {N, C, Ho, Wo};
|
||||
std::vector<ck::index_t> window_spatial_lengths = {Y, X};
|
||||
std::vector<ck::index_t> window_strides = {window_stride_h, window_stride_w};
|
||||
std::vector<ck::index_t> window_dilations = {window_dilation_h, window_dilation_w};
|
||||
std::vector<ck::index_t> input_left_pads = {in_left_pad_h, in_left_pad_w};
|
||||
std::vector<ck::index_t> input_right_pads = {in_right_pad_h, in_right_pad_w};
|
||||
std::vector<ck::index_t> pooling_dims = {2, 3};
|
||||
|
||||
std::size_t in_tensor_size = N * C * Hi * Wi;
|
||||
std::size_t out_tensor_size = N * C * Ho * Wo;
|
||||
|
||||
// tensor layout = NHWC
|
||||
std::vector<ck::index_t> in_tensor_stride = {C * Hi * Wi, 1, Wi * C, C};
|
||||
std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
|
||||
|
||||
TransformPool2dparamToPool3d(in_length,
|
||||
window_spatial_lengths,
|
||||
out_length,
|
||||
in_tensor_stride,
|
||||
out_tensor_stride,
|
||||
out_tensor_stride,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
pooling_dims);
|
||||
|
||||
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
|
||||
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
|
||||
SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
|
||||
WindowRank,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
IndexDataType,
|
||||
InLayout,
|
||||
OutLayout,
|
||||
ReduceOpId,
|
||||
OutputIndex>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
|
||||
in_length,
|
||||
window_spatial_lengths,
|
||||
out_length,
|
||||
in_tensor_stride,
|
||||
out_tensor_stride,
|
||||
out_tensor_stride,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
pooling_dims);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes =
|
||||
in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
|
||||
|
||||
if constexpr(OutputIndex)
|
||||
num_bytes += out_tensor_size * sizeof(IndexDataType);
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
|
||||
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
|
||||
in_length,
|
||||
window_spatial_lengths,
|
||||
out_length,
|
||||
in_tensor_stride,
|
||||
out_tensor_stride,
|
||||
out_tensor_stride,
|
||||
window_strides,
|
||||
window_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
pooling_dims);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
4
client_example/20_splitk_gemm/CMakeLists.txt
Normal file
4
client_example/20_splitk_gemm/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Build the client_splitK_gemm example only when DTYPES matches both "fp8" and "fp16",
# or when DTYPES is not defined and GPU_TARGETS matches "gfx94".
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
|
||||
add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
|
||||
target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_gemm_operations)
|
||||
endif()
|
||||
226
client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp
Normal file
226
client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp
Normal file
@@ -0,0 +1,226 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
|
||||
|
||||
using F8 = ck::f8_t;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CElementOp = PassThrough;
|
||||
|
||||
using ADataType = F8;
|
||||
using BDataType = F16;
|
||||
using CDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using CLayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = 4096;
|
||||
ck::index_t StrideB = 4096;
|
||||
ck::index_t StrideC = 4096;
|
||||
|
||||
ck::index_t KBatch = 1;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 8)
|
||||
{
|
||||
M = std::stoi(argv[1]);
|
||||
N = std::stoi(argv[2]);
|
||||
K = std::stoi(argv[3]);
|
||||
|
||||
StrideA = std::stoi(argv[4]);
|
||||
StrideB = std::stoi(argv[5]);
|
||||
StrideC = std::stoi(argv[6]);
|
||||
|
||||
KBatch = std::stoi(argv[7]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideC, KBatch\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto c_element_op = CElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
StrideC,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
c_element_op,
|
||||
KBatch);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(found)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
c_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
StrideC,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
c_element_op,
|
||||
KBatch);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
4
client_example/21_grouped_gemm_bias/CMakeLists.txt
Normal file
4
client_example/21_grouped_gemm_bias/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Build the grouped GEMM (fixed N/K) + bias fp16 client example only when GPU_TARGETS
# matches "gfx9".
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_gemm_fixed_nk_bias_fp16 grouped_gemm_fixed_nk_bias_fp16.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
endif()
|
||||
@@ -0,0 +1,243 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_bias.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using D0DataType = F32;
|
||||
using DsDataType = ck::Tuple<D0DataType>;
|
||||
using EDataType = F32;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Row;
|
||||
using D0Layout = Row;
|
||||
using DsLayout = ck::Tuple<D0Layout>;
|
||||
using ELayout = Row;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = Add;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
|
||||
|
||||
int sum_of_m = 0;
|
||||
|
||||
const int group_count = 16;
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
Ms.push_back(256 + 256 * i);
|
||||
Ns.push_back(128 + 128 * i);
|
||||
Ks.push_back(128 + 64 * i);
|
||||
|
||||
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
|
||||
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
|
||||
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
|
||||
|
||||
sum_of_m += Ms[i];
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, d0_dev_bufs, e_dev_bufs;
|
||||
|
||||
a_dev_bufs.reserve(group_count);
|
||||
b_dev_bufs.reserve(group_count);
|
||||
d0_dev_bufs.reserve(group_count);
|
||||
e_dev_bufs.reserve(group_count);
|
||||
|
||||
std::vector<void*> p_e;
|
||||
|
||||
p_e.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
|
||||
|
||||
gemm_descs.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
|
||||
grouped_gemm_kernel_args_;
|
||||
grouped_gemm_kernel_args_.reserve(group_count);
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
a_dev_bufs.emplace_back(sizeof(ADataType) *
|
||||
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
|
||||
b_dev_bufs.emplace_back(sizeof(BDataType) *
|
||||
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
|
||||
d0_dev_bufs.emplace_back(sizeof(D0DataType) *
|
||||
f_matrix_space_size(Ms[i], Ns[i], 0, D0Layout{}));
|
||||
e_dev_bufs.emplace_back(sizeof(EDataType) *
|
||||
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
|
||||
|
||||
gemm_descs.push_back({sum_of_m, Ns[i], Ks[i], 1, StrideBs[i], 1, {0}});
|
||||
|
||||
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
|
||||
|
||||
grouped_gemm_kernel_args_.push_back(
|
||||
{a_dev_bufs[i].GetDeviceBuffer(),
|
||||
b_dev_bufs[i].GetDeviceBuffer(),
|
||||
std::array<const void*, 1>{d0_dev_bufs[i].GetDeviceBuffer()},
|
||||
e_dev_bufs[i].GetDeviceBuffer(),
|
||||
Ms[i],
|
||||
Ns[i],
|
||||
Ks[i],
|
||||
StrideAs[i],
|
||||
StrideBs[i],
|
||||
std::array<ck::index_t, 1>{0},
|
||||
StrideEs[i]});
|
||||
}
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
|
||||
BLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::vector<const void*> p_a = {}, p_b = {};
|
||||
std::vector<std::array<const void*, 1>> p_ds = {};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
SimpleDeviceMem grouped_gemm_kernel_args_dev(
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
|
||||
|
||||
SimpleDeviceMem grouped_gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
|
||||
grouped_gemm_kernel_args_.data(),
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(),
|
||||
grouped_gemm_workspace_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetDeviceKernelArgs(argument_ptr.get(),
|
||||
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetKBatch(argument_ptr.get(), 2);
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = 0, num_btype = 0;
|
||||
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
|
||||
{
|
||||
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
|
||||
|
||||
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
|
||||
sizeof(EDataType) * Ms[j] * Ns[j];
|
||||
}
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
13
client_example/22_grouped_gemm/CMakeLists.txt
Normal file
13
client_example/22_grouped_gemm/CMakeLists.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build the grouped GEMM (fixed N/K) client examples only when GPU_TARGETS matches "gfx9".
# One executable per source file (fp16, fp8, i8, bf16), each linked against
# composable_kernel::device_gemm_operations.
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(client_grouped_gemm_fixed_nk_fp16 grouped_gemm_fixed_nk_fp16.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_fp16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_grouped_gemm_fixed_nk_fp8 grouped_gemm_fixed_nk_fp8.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_fp8 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_grouped_gemm_fixed_nk_i8 grouped_gemm_fixed_nk_i8.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_i8 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_grouped_gemm_fixed_nk_bf16 grouped_gemm_fixed_nk_bf16.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_bf16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
endif()
|
||||
237
client_example/22_grouped_gemm/grouped_gemm_fixed_nk_bf16.cpp
Normal file
237
client_example/22_grouped_gemm/grouped_gemm_fixed_nk_bf16.cpp
Normal file
@@ -0,0 +1,237 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp"
|
||||
|
||||
using I8 = int8_t;
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using ADataType = BF16;
|
||||
using BDataType = I8;
|
||||
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = BF16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Row;
|
||||
using DsLayout = ck::Tuple<>;
|
||||
using ELayout = Row;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
|
||||
|
||||
int sum_of_m = 0;
|
||||
|
||||
const int group_count = 16;
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
Ms.push_back(256 + 256 * i);
|
||||
Ns.push_back(128 + 128 * i);
|
||||
Ks.push_back(128 + 64 * i);
|
||||
|
||||
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
|
||||
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
|
||||
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
|
||||
|
||||
sum_of_m += Ms[i];
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, e_dev_bufs;
|
||||
|
||||
a_dev_bufs.reserve(group_count);
|
||||
b_dev_bufs.reserve(group_count);
|
||||
e_dev_bufs.reserve(group_count);
|
||||
|
||||
std::vector<void*> p_e;
|
||||
|
||||
p_e.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
|
||||
|
||||
gemm_descs.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
|
||||
grouped_gemm_kernel_args_;
|
||||
grouped_gemm_kernel_args_.reserve(group_count);
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
a_dev_bufs.emplace_back(sizeof(ADataType) *
|
||||
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
|
||||
b_dev_bufs.emplace_back(sizeof(BDataType) *
|
||||
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
|
||||
e_dev_bufs.emplace_back(sizeof(EDataType) *
|
||||
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
|
||||
|
||||
gemm_descs.push_back({sum_of_m, Ns[i], Ks[i], 1, StrideBs[i], 1, {0}});
|
||||
|
||||
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
|
||||
|
||||
grouped_gemm_kernel_args_.push_back({a_dev_bufs[i].GetDeviceBuffer(),
|
||||
b_dev_bufs[i].GetDeviceBuffer(),
|
||||
{},
|
||||
e_dev_bufs[i].GetDeviceBuffer(),
|
||||
Ms[i],
|
||||
Ns[i],
|
||||
Ks[i],
|
||||
StrideAs[i],
|
||||
StrideBs[i],
|
||||
{},
|
||||
StrideEs[i]});
|
||||
}
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
|
||||
BLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::vector<const void*> p_a = {}, p_b = {};
|
||||
std::vector<std::array<const void*, 0>> p_ds = {};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
SimpleDeviceMem grouped_gemm_kernel_args_dev(
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
|
||||
|
||||
SimpleDeviceMem grouped_gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
|
||||
grouped_gemm_kernel_args_.data(),
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(),
|
||||
grouped_gemm_workspace_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetDeviceKernelArgs(argument_ptr.get(),
|
||||
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetKBatch(argument_ptr.get(), 1);
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = 0, num_btype = 0;
|
||||
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
|
||||
{
|
||||
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
|
||||
|
||||
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
|
||||
sizeof(EDataType) * Ms[j] * Ns[j];
|
||||
}
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
236
client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp16.cpp
Normal file
236
client_example/22_grouped_gemm/grouped_gemm_fixed_nk_fp16.cpp
Normal file
@@ -0,0 +1,236 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp"
|
||||
|
||||
// Short names for the scalar element types used by this example.
using F16 = ck::half_t;
|
||||
// F32 is declared but not referenced anywhere else in this file.
using F32 = float;
|
||||
|
||||
// Matrix layout tags. Col is declared but not referenced anywhere else in this file.
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
// Element-wise operation type used for all three operator slots below.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
// Data types passed to DeviceGroupedGemmFixedNK in main(): fp16 for A, B and E.
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
// Empty tuple: this example supplies no additional D tensors.
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = F16;
|
||||
|
||||
// Layouts passed to DeviceGroupedGemmFixedNK in main(): A, B and E are all row-major.
using ALayout = Row;
|
||||
using BLayout = Row;
|
||||
using DsLayout = ck::Tuple<>;
|
||||
using ELayout = Row;
|
||||
|
||||
// Element-wise operators for A, B and the C/D/E stage; all are PassThrough here.
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
|
||||
|
||||
int sum_of_m = 0;
|
||||
|
||||
const int group_count = 16;
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
Ms.push_back(256 + 256 * i);
|
||||
Ns.push_back(128 + 128 * i);
|
||||
Ks.push_back(128 + 64 * i);
|
||||
|
||||
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
|
||||
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
|
||||
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
|
||||
|
||||
sum_of_m += Ms[i];
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, e_dev_bufs;
|
||||
|
||||
a_dev_bufs.reserve(group_count);
|
||||
b_dev_bufs.reserve(group_count);
|
||||
e_dev_bufs.reserve(group_count);
|
||||
|
||||
std::vector<void*> p_e;
|
||||
|
||||
p_e.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
|
||||
|
||||
gemm_descs.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
|
||||
grouped_gemm_kernel_args_;
|
||||
grouped_gemm_kernel_args_.reserve(group_count);
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
a_dev_bufs.emplace_back(sizeof(ADataType) *
|
||||
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
|
||||
b_dev_bufs.emplace_back(sizeof(BDataType) *
|
||||
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
|
||||
e_dev_bufs.emplace_back(sizeof(EDataType) *
|
||||
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
|
||||
|
||||
gemm_descs.push_back({sum_of_m, Ns[i], Ks[i], 1, StrideBs[i], 1, {0}});
|
||||
|
||||
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
|
||||
|
||||
grouped_gemm_kernel_args_.push_back({a_dev_bufs[i].GetDeviceBuffer(),
|
||||
b_dev_bufs[i].GetDeviceBuffer(),
|
||||
{},
|
||||
e_dev_bufs[i].GetDeviceBuffer(),
|
||||
Ms[i],
|
||||
Ns[i],
|
||||
Ks[i],
|
||||
StrideAs[i],
|
||||
StrideBs[i],
|
||||
{},
|
||||
StrideEs[i]});
|
||||
}
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
|
||||
BLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::vector<const void*> p_a = {}, p_b = {};
|
||||
std::vector<std::array<const void*, 0>> p_ds = {};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
SimpleDeviceMem grouped_gemm_kernel_args_dev(
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
|
||||
|
||||
SimpleDeviceMem grouped_gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
|
||||
grouped_gemm_kernel_args_.data(),
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(),
|
||||
grouped_gemm_workspace_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetDeviceKernelArgs(argument_ptr.get(),
|
||||
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetKBatch(argument_ptr.get(), 32);
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = 0, num_btype = 0;
|
||||
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
|
||||
{
|
||||
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
|
||||
|
||||
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
|
||||
sizeof(EDataType) * Ms[j] * Ns[j];
|
||||
}
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user