Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-12 17:26:00 +00:00)
Commit: sync 22
@@ -1,11 +0,0 @@
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)

add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 PRIVATE composable_kernel::device_conv_operations)

add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 PRIVATE composable_kernel::device_conv_operations)

add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 PRIVATE composable_kernel::device_conv_operations)
@@ -1,216 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include <array>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using InLayout   = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout  = ck::tensor_layout::convolution::GKZYXC;
using OutLayout  = ck::tensor_layout::convolution::NDHWGK;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;

static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G  = 32;
static constexpr ck::index_t N  = 64; // batch size
static constexpr ck::index_t K  = 64; // output channel
static constexpr ck::index_t C  = 32; // input channel (per group)
static constexpr ck::index_t Z  = 3;  // filter D
static constexpr ck::index_t Y  = 3;  // filter H
static constexpr ck::index_t X  = 3;  // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int execute_conv_fwd_scaleadd_scaleadd_relu()
{
    // We have NDHWGC/GKZYXC/NDHWGK (x, weight, y) in memory space.
    // However, CK's API only accepts lengths and strides in GNCDHW/GKCZYX/GNKDHW order.
    // Hence, we need to adjust the order of the strides.
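    // For example, for the NDHWGC input the innermost dimension is C, so in
    // GNCDHW order the stride of G is C, of N is Di * Hi * Wi * G * C, of C is 1,
    // and of (D, H, W) it is (Hi * Wi * G * C, Wi * G * C, G * C), matching the
    // in_strides array below.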
    std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
    std::array<ck::index_t, 6> in_strides{
        C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
    std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
    std::array<ck::index_t, 6> wei_strides{
        K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
    std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
    std::array<ck::index_t, 6> out_strides{
        K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
    // Logically broadcast the bias (its lengths must be passed in the same GNKDHW
    // format as the output).
    std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
    std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
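    // A stride of 0 in the N, D, H and W dimensions makes every spatial position
    // read the same bias element: offset(g, n, k, d, h, w) = g * K + k.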

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};

    SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
    SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
    SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
        NumDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<OutLayout, BiasLayout>,
        OutLayout,
        InDataType,
        WeiDataType,
        ck::Tuple<std::tuple_element_t<0, DDataTypes>, std::tuple_element_t<1, DDataTypes>>,
        OutDataType,
        PassThrough,
        PassThrough,
        ScaleAddScaleAddRelu>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    int best_op_id        = -1;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];
        auto argument_ptr =
            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                        wei.GetDeviceBuffer(),
                                        {d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
                                        out.GetDeviceBuffer(),
                                        in_lengths,
                                        in_strides,
                                        wei_lengths,
                                        wei_strides,
                                        {out_lengths, bias_lengths},
                                        {out_strides, bias_strides},
                                        out_lengths,
                                        out_strides,
                                        filter_strides,
                                        filter_dilations,
                                        input_left_pads,
                                        input_right_pads,
                                        PassThrough{},
                                        PassThrough{},
                                        ScaleAddScaleAddRelu{2.f, 2.f});
        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            // MACs of the 3D convolution plus two epilogue ops per output element
            std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Z * Y * X +
                               std::size_t(2) * N * Do * Ho * Wo * G * K;
            std::size_t num_bytes =
                sizeof(InDataType) * N * Di * Hi * Wi * G * C +
                sizeof(WeiDataType) * G * K * Z * Y * X * C +
                (sizeof(OutDataType) + sizeof(std::tuple_element_t<0, DDataTypes>) +
                 sizeof(std::tuple_element_t<1, DDataTypes>)) *
                    N * Do * Ho * Wo * G * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
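            // avg_time is in ms, so the flop / 1e9 / avg_time and
            // num_bytes / 1e6 / avg_time expressions above yield TFLOP/s and GB/s.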

            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                best_op_id      = i;
                best_op_name    = op_name;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
                best_tflops     = tflops;
            }
        }
        else
        {
            std::cerr << op_name << " does not support this problem" << std::endl;
        }
    }

    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;
        auto argument_ptr =
            op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                        wei.GetDeviceBuffer(),
                                        {d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
                                        out.GetDeviceBuffer(),
                                        in_lengths,
                                        in_strides,
                                        wei_lengths,
                                        wei_strides,
                                        {out_lengths, bias_lengths},
                                        {out_strides, bias_strides},
                                        out_lengths,
                                        out_strides,
                                        filter_strides,
                                        filter_dilations,
                                        input_left_pads,
                                        input_right_pads,
                                        PassThrough{},
                                        PassThrough{},
                                        ScaleAddScaleAddRelu{2.f, 2.f});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }
    return 0;
}
@@ -1,18 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
// Use std::tuple instead of ck::Tuple to avoid a clang
// "implicit instantiation of undefined template" error.
using DDataTypes = std::tuple<ck::bhalf_t, ck::bhalf_t>;

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
@@ -1,18 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std::tuple instead of ck::Tuple to avoid a clang
// "implicit instantiation of undefined template" error.
using DDataTypes = std::tuple<ck::half_t, ck::half_t>;

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
@@ -1,18 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = float;
using WeiDataType = float;
using OutDataType = float;
// Use std::tuple instead of ck::Tuple to avoid a clang
// "implicit instantiation of undefined template" error.
using DDataTypes = std::tuple<float, float>;

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
@@ -1,18 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = int8_t;
using WeiDataType = int8_t;
using OutDataType = int8_t;
// Use std::tuple instead of ck::Tuple to avoid a clang
// "implicit instantiation of undefined template" error.
using DDataTypes = std::tuple<float, float>;
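// Note: unlike the fp32/fp16/bf16 variants, the residual (D0) and bias (D1)
// tensors stay in float here instead of matching the int8 output type.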

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
example/64_fpAintB_gemm/CMakeLists.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
if(GPU_TARGETS MATCHES "gfx11")
    add_custom_target(example_fpAintB_gemm_wmma)
    add_example_executable(example_fp16int8_gemm_wmma fp16int8_gemm_wmma.cpp)
    add_dependencies(example_fpAintB_gemm_wmma example_fp16int8_gemm_wmma)
endif()
example/64_fpAintB_gemm/common.hpp (new file, 123 lines)
@@ -0,0 +1,123 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <numeric>
#include <string>
#include <thread>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp"

struct ProblemSize final
{
    ck::index_t M = 3840;
    ck::index_t N = 4096;
    ck::index_t K = 4096;

    ck::index_t StrideA = 4096;
    ck::index_t StrideB = 4096;
    ck::index_t StrideC = 4096;
};

struct ExecutionConfig final
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
};

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

template <typename IntType>
struct UnsignedWeightPreprocessor
{
};

template <>
struct UnsignedWeightPreprocessor<int8_t>
{
    using UnsignedWeight = Tensor<uint8_t>;
    using SignedWeight   = Tensor<int8_t>;

    static UnsignedWeight convert(SignedWeight const& Input)
    {
        UnsignedWeight Output = Input.template CopyAsType<uint8_t>();

        auto f_kn = [&](auto k, auto n) {
            const uint8_t adder = 128;
            int8_t v_signed_weight;
            uint8_t v_unsigned_weight;

            ck::tensor_operation::element_wise::PassThrough{}(v_signed_weight, Input(k, n));
            v_unsigned_weight = ck::type_convert<uint8_t>(v_signed_weight) + adder;
            Output(k, n)      = v_unsigned_weight;
        };

        make_ParallelTensorFunctor(f_kn, Input.mDesc.GetLengths()[0], Input.mDesc.GetLengths()[1])(
            std::thread::hardware_concurrency());

        return Output;
    }

    UnsignedWeight operator()(SignedWeight const& Input) { return convert(Input); }
};
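
// The offset trick in numbers: an int8 weight s is stored as uint8 u = s + 128
// (the uint8 conversion plus the adder wraps modulo 256, so e.g. s = -5 becomes
// u = 123), and dequantization can later recover s * scale as (u - 128) * scale.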

inline bool
parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
{
    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 4)
    {
        config.do_verification = std::stoi(argv[1]);
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);
    }
    else if(argc == 10)
    {
        config.do_verification = std::stoi(argv[1]);
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);

        problem_size.M = std::stoi(argv[4]);
        problem_size.N = std::stoi(argv[5]);
        problem_size.K = std::stoi(argv[6]);

        problem_size.StrideA = std::stoi(argv[7]);
        problem_size.StrideB = std::stoi(argv[8]);
        problem_size.StrideC = std::stoi(argv[9]);
    }
    else
    {
        std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
                  << std::endl
                  << "arg3: time kernel (0=no, 1=yes)" << std::endl
                  << "arg4 to 9: M (256x), N (128x), K (32x), StrideA, StrideB, StrideC"
                  << std::endl;
        return false;
    }

    return true;
}
example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp (new file, 93 lines)
@@ -0,0 +1,93 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp"

// Implementation follows the paper:
// Kim, Young Jin, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. “Who Says Elephants Can’t Run:
// Bringing Large Scale MoE Models into Cloud Scale Production.” arXiv, November 17, 2022.
// https://doi.org/10.48550/arXiv.2211.10017.
// The weight (matrix B) is assumed to have been preprocessed with an additive
// offset so that it is unsigned.

// The DeviceOp computes CDataType = ADataType * Dequant(BDataType) * ScaleDataType.
// The HostRef computes CDataType = ADataType * Dequant(QuantDataType) * ScaleDataType.
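//
// In equation form, with u = s + 128 produced by UnsignedWeightPreprocessor in
// common.hpp, both paths evaluate
//   C(m, n) = sum_k A(m, k) * B_dequant(k, n) * Scale(n),
// where the device op dequantizes the unsigned weight u and the host reference
// dequantizes the original signed weight s, so the two results should agree.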

// TODO: The current implementation consumes more VGPRs than expected.

using ADataType        = ck::half_t;
using QuantDataType    = int8_t;
using BDataType        = uint8_t;
using ScaleDataType    = ck::half_t;
using AccDataType      = float;
using CShuffleDataType = float;
using CDataType        = ck::half_t;

using ALayout = Row;
using BLayout = Col;
using CLayout = Row;

using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_CShuffle
    < ALayout,
      BLayout,
      CLayout,
      ADataType,
      BDataType,
      ScaleDataType,
      CDataType,
      AccDataType,
      CShuffleDataType,
      AElementOp,
      BElementOp,
      CElementOp,
      GemmDefault,
      1,              // Prefetch stage
      128,            // BlockSize
      64,             // MPerBlock
      128,            // NPerBlock
      64,             // KPerBlock
      8,              // K1
      16,             // MPerWmma
      16,             // NPerWmma
      2,              // M-Repeat // M-PerWmma / M-Repeat = M-Wave
      4,              // N-Repeat // N-PerWmma / N-Repeat = N-Wave
      S<4, 32, 1>,
      S<1, 0, 2>,
      S<1, 0, 2>,
      2,
      8,
      8,
      true,
      S<4, 32, 1>,
      S<1, 0, 2>,
      S<1, 0, 2>,
      2,
      8,
      8,
      true,
      1,              // C shuffle (M Repeat) Per store
      1,              // C shuffle (N Repeat) Per store
      S<1, 32, 1, 4>,
      8>;
// clang-format on

using ReferenceGemmInstance = ck::tensor_operation::host::ReferencefpAintBGemm<ADataType,
                                                                               QuantDataType,
                                                                               ScaleDataType,
                                                                               CDataType,
                                                                               AccDataType,
                                                                               AElementOp,
                                                                               BElementOp,
                                                                               CElementOp>;

#include "run_gemm_example.inc"

int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
example/64_fpAintB_gemm/run_gemm_example.inc (new file, 172 lines)
@@ -0,0 +1,172 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif

    using namespace ck::literals;

    auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<QuantDataType> quant_b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    // The scale tensor is logically [1, N]; a row stride of 0 broadcasts it over K.
    Tensor<ScaleDataType> scale_k_n(f_host_tensor_descriptor(K, N, 0, Row{}));

    switch(config.init_method)
    {
    case 0: break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-1.f, 1.f}(quant_b_k_n);
        ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-1.f, 1.f}(scale_k_n);
        break;
    case 2:
        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistribution<QuantDataType>{-1.f, 1.f}(quant_b_k_n);
        ck::utils::FillUniformDistribution<ScaleDataType>{-1.f, 1.f}(scale_k_n);
        break;
    default:
        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistribution<QuantDataType>{-1.f, 1.f}(quant_b_k_n);
        ck::utils::FillUniformDistribution<ScaleDataType>{-1.f, 1.f}(scale_k_n);
    }

    // Shift the signed int8 weights into the unsigned range for the device kernel;
    // the host reference keeps consuming the original signed weights.
    UnsignedWeightPreprocessor<QuantDataType> preprocessor;
    Tensor<BDataType> b_k_n = preprocessor(quant_b_k_n);

    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "scale_k_n: " << scale_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;

#ifdef BUILD_INT4_EXAMPLE
    DeviceMem a_m_k_device_buf(sizeof(KernelADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(KernelBDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(KernelCDataType) *
                               c_m_n_device_result.mDesc.GetElementSpaceSize());

    const Tensor<KernelADataType> a_m_k_converted(a_m_k);
    const Tensor<KernelBDataType> b_k_n_converted(b_k_n);

    a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data());
#else
    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem scale_k_n_device_buf(sizeof(ScaleDataType) * scale_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
    scale_k_n_device_buf.ToDevice(scale_k_n.mData.data());
#endif

    auto a_element_op = AElementOp{};
    auto b_element_op = BElementOp{};
    auto c_element_op = CElementOp{};

    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(
#ifdef BUILD_INT4_EXAMPLE
        static_cast<KernelADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
        static_cast<KernelBDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
        static_cast<KernelCDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
#else
        static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
        static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
        static_cast<ScaleDataType*>(scale_k_n_device_buf.GetDeviceBuffer()),
        static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
#endif
        M,
        N,
        K,
        StrideA,
        StrideB,
        StrideC,
        a_element_op,
        b_element_op,
        c_element_op);

    if(!gemm.IsSupportedArgument(argument))
    {
        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;

        return true;
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

    std::size_t flop = 2_uz * M * N * K;
    std::size_t num_btype =
        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;
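
    // num_btype counts one read of A and B and one write of C; the (much smaller)
    // scale tensor traffic is not included. ave_time is in ms, so the divisors
    // above yield TFLOP/s and GB/s.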

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;

    if(config.do_verification)
    {
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

        auto ref_argument = ref_gemm.MakeArgument(a_m_k,
                                                  quant_b_k_n,
                                                  scale_k_n,
                                                  c_m_n_host_result,
                                                  a_element_op,
                                                  b_element_op,
                                                  c_element_op);

        ref_invoker.Run(ref_argument);

#ifdef BUILD_INT4_EXAMPLE
        Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);

        c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.mData.data());

        c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>();

        return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result);
#else
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

        return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
#endif
    }

    return true;
}

// run_gemm_example returns true on success (note that run_gemm also returns true
// when the instance does not support the problem); main negates the result so
// that success maps to exit code 0.
bool run_gemm_example(int argc, char* argv[])
{
    ProblemSize problem_size;
    ExecutionConfig config;

    // A parse failure prints the usage message and short-circuits run_gemm.
    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
test/wrapper/CMakeLists.txt (new file, 21 lines)
@@ -0,0 +1,21 @@
add_custom_target(test_wrapper)

add_gtest_executable(test_wrapper_layout test_wrapper_layout.cpp)
target_link_libraries(test_wrapper_layout PRIVATE utility)
add_dependencies(test_wrapper test_wrapper_layout)
add_gtest_executable(test_wrapper_tensor test_wrapper_tensor.cpp)
target_link_libraries(test_wrapper_tensor PRIVATE utility)
add_dependencies(test_wrapper test_wrapper_tensor)
add_gtest_executable(test_wrapper_copy test_wrapper_copy.cpp)
target_link_libraries(test_wrapper_copy PRIVATE utility)
add_dependencies(test_wrapper test_wrapper_copy)
add_gtest_executable(test_wrapper_partition test_wrapper_partition.cpp)
target_link_libraries(test_wrapper_partition PRIVATE utility)
add_dependencies(test_wrapper test_wrapper_partition)
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
   GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
   GPU_TARGETS MATCHES "gfx942")
    add_gtest_executable(test_wrapper_gemm test_wrapper_gemm.cpp)
    target_link_libraries(test_wrapper_gemm PRIVATE utility)
    add_dependencies(test_wrapper test_wrapper_gemm)
endif()
test/wrapper/test_wrapper_copy.cpp (new file, 135 lines)
@@ -0,0 +1,135 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>

#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"

// Test copy from Global to Global through LDS and VGPR
template <typename InputTensor,
          typename OutputTensor,
          typename BlockShape,
          typename ThreadLayout,
          bool UseOptimizedCopy>
__global__ void TestCopyDevice(const InputTensor input_tensor,
                               OutputTensor output_tensor,
                               const BlockShape tile_shape,
                               const ThreadLayout thread_layout)
{
    __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
    const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        p_shared, ck::wrapper::make_layout(tile_shape));

    const auto block_idxs =
        ck::make_tuple(static_cast<ck::index_t>(blockIdx.x), static_cast<ck::index_t>(blockIdx.y));

    // Get local tiles for global memory
    const auto input_local_tile =
        ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs);
    const auto output_local_tile =
        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs);

    // Get partition per thread
    const auto input_local_partition =
        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
    auto lds_local_partition =
        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
    auto output_local_partition =
        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);

    // Allocate VGPR
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            ck::wrapper::make_layout(shape(lds_local_partition)));

    // Perform copy
    if constexpr(UseOptimizedCopy)
    {
        using DimAccessOrder                    = ck::Tuple<ck::Number<1>, ck::Number<0>>;
        constexpr ck::index_t vector_dim        = 0;
        constexpr ck::index_t scalar_per_vector = 2;
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
                                                                         lds_local_partition);
        // TODO: Enable optimized copy for static buffers
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
                                                                         tensor_vgpr);
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
                                                                         output_local_partition);
    }
    else
    {
        ck::wrapper::copy(input_local_partition, lds_local_partition);
        ck::wrapper::copy(lds_local_partition, tensor_vgpr);
        ck::wrapper::copy(tensor_vgpr, output_local_partition);
    }
}

template <bool UseOptimizedCopy>
void PerformCopyGlobalToGlobalViaLDS()
{
    const auto shape =
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{});
    const auto strides =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);
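    // With this nested layout a coordinate ((i, j), k) maps to the offset
    // i * 1 + j * 2 + k * 4, so the 2 * 2 * 256 = 1024 elements cover the buffer
    // densely.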

    // 0, 1, 2, ..., size(shape) - 1
    std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
    std::iota(input_data.begin(), input_data.end(), 0);

    // Global memory buffers
    DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
    DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));

    in_buf.ToDevice(input_data.data());
    out_buf.SetZero();

    // Create tensors for global memory
    const auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const ck::index_t*>(in_buf.GetDeviceBuffer()), layout);
    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);

    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<1>{}, ck::Number<32>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});

    const ck::index_t grid_size_x = ck::math::integer_divide_ceil(
        ck::wrapper::size<0>(input_tensor_global), ck::wrapper::size<0>(tile_shape));
    const ck::index_t grid_size_y = ck::math::integer_divide_ceil(
        ck::wrapper::size<1>(input_tensor_global), ck::wrapper::size<1>(tile_shape));

    const auto kernel = TestCopyDevice<decltype(input_tensor_global),
                                       decltype(output_tensor_global),
                                       decltype(tile_shape),
                                       decltype(thread_layout),
                                       UseOptimizedCopy>;
    launch_and_time_kernel(StreamConfig{},
                           kernel,
                           dim3(grid_size_x, grid_size_y, 1),
                           dim3(ck::wrapper::size(thread_layout)),
                           0,
                           input_tensor_global,
                           output_tensor_global,
                           tile_shape,
                           thread_layout);

    // Verify results
    std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
    out_buf.FromDevice(output_data.data());
    EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
}

TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS<false>(); }
TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS<true>(); }
test/wrapper/test_wrapper_gemm.cpp (new file, 376 lines)
@@ -0,0 +1,376 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <numeric>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>

#include "ck/library/utility/host_tensor.hpp"

#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"

template <typename DataType>
void CheckResult(const std::vector<DataType>& a_data,
                 const std::vector<DataType>& b_data,
                 std::vector<DataType>& c_m_n_device_result,
                 const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using ReferenceGemmInstance = ck::tensor_operation::host::
        ReferenceGemm<DataType, DataType, DataType, float, PassThrough, PassThrough, PassThrough>;

    Tensor<DataType> a_m_k(HostTensorDescriptor({M, K}));
    Tensor<DataType> b_k_n(HostTensorDescriptor({K, N}, {1, K}));
    Tensor<DataType> c_m_n_host_result(HostTensorDescriptor({M, N}));

    a_m_k.mData = a_data;
    b_k_n.mData = b_data;

    auto ref_op       = ReferenceGemmInstance{};
    auto ref_invoker  = ref_op.MakeInvoker();
    auto ref_argument = ref_op.MakeArgument(
        a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});

    ref_invoker.Run(ref_argument);
    EXPECT_TRUE(ck::utils::check_err(c_m_n_device_result, c_m_n_host_result.mData));
}

template <bool DoPad, typename Layout, typename PaddingDims>
__device__ auto ApplyPadding(const Layout& layout, const PaddingDims& padding_dims)
{
    if constexpr(DoPad)
    {
        return ck::wrapper::pad(layout, padding_dims);
    }
    else
    {
        return layout;
    }
}

template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout,
          bool DoPadding>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
                                                        const void* p_b,
                                                        void* p_c,
                                                        const ck::index_t M,
                                                        const ck::index_t N,
                                                        const ck::index_t K,
                                                        const BlockShape tile_shape,
                                                        const ThreadLayout thread_layout)
{
    constexpr auto MPerBlock  = ck::wrapper::size<0>(tile_shape);
    constexpr auto NPerBlock  = ck::wrapper::size<1>(tile_shape);
    constexpr auto KPerBlock  = ck::wrapper::size<2>(tile_shape);
    constexpr auto K1         = GemmTraits::K1;
    constexpr auto K0PerBlock = KPerBlock / K1;
    const auto K0             = ck::math::integer_divide_ceil(K, K1);

    const auto tile_shape_k0_m_n_k1 = ck::make_tuple(K0PerBlock, MPerBlock, NPerBlock, K1);

    const auto a_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
    const auto b_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
    const auto c_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));

    auto a_padded_global_layout =
        ApplyPadding<DoPadding>(a_global_layout, ck::make_tuple(MPerBlock, KPerBlock));
    auto b_padded_global_layout =
        ApplyPadding<DoPadding>(b_global_layout, ck::make_tuple(NPerBlock, KPerBlock));
    auto c_padded_global_layout =
        ApplyPadding<DoPadding>(c_global_layout, ck::make_tuple(MPerBlock, NPerBlock));

    // Reshape from (M, K) to (K0, M, K1)
    const auto reshaped_dims_idxs =
        ck::make_tuple(ck::Number<1>{}, ck::make_tuple(ck::Number<0>{}, ck::Number<2>{}));
    auto a_padded_unmerged_global_layout =
        ck::wrapper::unmerge<1>(a_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
    auto b_padded_unmerged_global_layout =
        ck::wrapper::unmerge<1>(b_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
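    // After the unmerge the K axis is split as k = k0 * K1 + k1, so an element of
    // A is addressed as (k0, m, k1) and an element of B as (k0, n, k1).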

    auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_a), a_padded_unmerged_global_layout);
    auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_b), b_padded_unmerged_global_layout);
    auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<DataType*>(p_c), c_padded_global_layout);

    // Add extra M and N
    constexpr auto a_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(K0PerBlock, MPerBlock, K1),
        ck::make_tuple((MPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
    constexpr auto b_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(K0PerBlock, NPerBlock, K1),
        ck::make_tuple((NPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));

    __shared__ DataType lds_a[ck::wrapper::size(a_tile_layout) + NPerBlock];
    __shared__ DataType lds_b[ck::wrapper::size(b_tile_layout) + NPerBlock];
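    // The +1 in the K0 stride (and the extra elements in the LDS arrays) pad each
    // tile, a common trick to spread accesses across LDS banks.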

    auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_a), a_tile_layout);
    auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_b), b_tile_layout);

    const auto block_idxs = ck::make_tuple(ck::wrapper::slice(),
                                           static_cast<ck::index_t>(blockIdx.x),
                                           static_cast<ck::index_t>(blockIdx.y),
                                           ck::wrapper::slice());
    using DimAccessOrder             = ck::Tuple<ck::Number<1>, ck::Number<0>, ck::Number<2>>;
    constexpr ck::index_t vector_dim = 2;

    auto c_global_local_tile =
        ck::wrapper::make_local_tile(c_global_tensor,
                                     tile_shape_k0_m_n_k1,
                                     block_idxs,
                                     make_tuple(ck::wrapper::slice(K0PerBlock),
                                                ck::Number<1>{},
                                                ck::Number<1>{},
                                                ck::wrapper::slice(K1)));
    auto c_global_local_partition =
        ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
                                                               decltype(a_tile_layout),
                                                               decltype(b_tile_layout),
                                                               ck::wrapper::size(thread_layout),
                                                               GemmTraits>(c_global_local_tile);
    auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
                                                                  decltype(a_tile_layout),
                                                                  decltype(b_tile_layout),
                                                                  ck::wrapper::size(thread_layout),
                                                                  GemmTraits>();
    ck::wrapper::clear(c_vgpr_reg);

    auto a_lds_tensor_local_partition =
        ck::wrapper::make_local_partition(a_lds_tensor, thread_layout, threadIdx.x);
    auto b_lds_tensor_local_partition =
        ck::wrapper::make_local_partition(b_lds_tensor, thread_layout, threadIdx.x);

    auto make_global_partition = [&](auto tensor, auto projection, ck::index_t i) {
        const auto k_slice =
            ck::make_tuple(ck::wrapper::slice(i * K0PerBlock, (i + 1) * K0PerBlock),
                           ck::wrapper::slice(),
                           ck::wrapper::slice());
        auto local_tile = ck::wrapper::make_local_tile(
            tensor(k_slice), tile_shape_k0_m_n_k1, block_idxs, projection);
        return ck::wrapper::make_local_partition(local_tile, thread_layout, threadIdx.x);
    };

    auto a_global_local_partition = make_global_partition(
        a_global_tensor,
        make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
        0);
    auto b_global_local_partition = make_global_partition(
        b_global_tensor,
        make_tuple(ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
        0);

    // (row-major vgpr layout)
    auto a_vgpr_tensor =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
            ck::wrapper::make_layout(
                shape(a_global_local_partition),
                ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
                                   ck::wrapper::size<2>(a_global_local_partition),
                               ck::wrapper::size<2>(a_global_local_partition),
                               ck::Number<1>{})));
    auto b_vgpr_tensor =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
            ck::wrapper::make_layout(
                shape(b_global_local_partition),
                ck::make_tuple(ck::wrapper::size<1>(b_global_local_partition) *
                                   ck::wrapper::size<2>(b_global_local_partition),
                               ck::wrapper::size<2>(b_global_local_partition),
                               ck::Number<1>{})));

    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_global_local_partition,
                                                                     a_vgpr_tensor);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_global_local_partition,
                                                                     b_vgpr_tensor);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_vgpr_tensor,
                                                                     a_lds_tensor_local_partition);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_vgpr_tensor,
                                                                     b_lds_tensor_local_partition);

    const ck::index_t num_loop =
        __builtin_amdgcn_readfirstlane(ck::math::integer_divide_ceil(K, KPerBlock));
    if(num_loop > 1)
    {
        ck::index_t i = 0;
        do
        {
            auto a_global_local_partition_i = make_global_partition(
                a_global_tensor,
                make_tuple(
                    ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
                i + 1);
            auto b_global_local_partition_i = make_global_partition(
                b_global_tensor,
                make_tuple(
                    ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
                i + 1);

            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                a_global_local_partition_i, a_vgpr_tensor);

            ck::block_sync_lds();
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                b_global_local_partition_i, b_vgpr_tensor);

            ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
                a_lds_tensor, b_lds_tensor, c_vgpr_reg);

            ck::block_sync_lds();
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                a_vgpr_tensor, a_lds_tensor_local_partition);
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                b_vgpr_tensor, b_lds_tensor_local_partition);

            ++i;
        } while(i < (num_loop - 1));
    }
    ck::block_sync_lds();
    ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
        a_lds_tensor, b_lds_tensor, c_vgpr_reg);

    ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}

template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          bool DoPadding,
          typename BlockShape,
          typename ThreadLayout>
void PerformGemm(const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K,
                 const BlockShape& tile_shape,
                 const ThreadLayout& thread_layout)
{
    // Global memory buffers
    DeviceMem a_mem(M * K * sizeof(DataType));
    DeviceMem b_mem(K * N * sizeof(DataType));
    DeviceMem c_mem(M * N * sizeof(DataType));

    std::vector<DataType> a_data(M * K);
    std::vector<DataType> b_data(K * N);
    ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(a_data);
    ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(b_data);

    a_mem.ToDevice(a_data.data());
    b_mem.ToDevice(b_data.data());
    c_mem.SetZero();

    const ck::index_t grid_size_x =
        ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
    const ck::index_t grid_size_y =
        ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));

    const auto kernel =
        DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout, DoPadding>;
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size_x, grid_size_y, 1),
                                                  dim3(ck::wrapper::size(thread_layout)),
                                                  0,
                                                  a_mem.GetDeviceBuffer(),
                                                  b_mem.GetDeviceBuffer(),
                                                  c_mem.GetDeviceBuffer(),
                                                  M,
                                                  N,
                                                  K,
                                                  tile_shape,
                                                  thread_layout);
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;

    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s" << std::endl;

    std::vector<DataType> c_data(M * N);
    c_mem.FromDevice(c_data.data());
    CheckResult<DataType>(a_data, b_data, c_data, M, N, K);
}

TEST(TestGemm, Float)
{
    using DataType = float;
    // (dim1, dim2, dim0 thread layout)
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
                                 ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<16>{});
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 4, false>(
        512, 512, 128, tile_shape, thread_layout);
    // Irregular case
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 1, true>(
        129, 129, 67, tile_shape, thread_layout);
}

TEST(TestGemm, Int8)
{
    using DataType = int8_t;
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
                                 ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
    PerformGemm<DataType,
                ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1,
                16,
                false>(512, 512, 128, tile_shape, thread_layout);
    // Irregular case
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1, 1, true>(
        129, 129, 67, tile_shape, thread_layout);
}

TEST(TestGemm, Half)
{
    using DataType = ck::half_t;
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
                                 ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<32>{});
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 8, false>(
        512, 512, 128, tile_shape, thread_layout);
    // Irregular case
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 1, true>(
        129, 129, 67, tile_shape, thread_layout);
}

TEST(TestGemm, Float_2x4_4x2_XdlPerWave)
{
    using DataType = float;
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
                                 ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<16>{});
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_4K1, 4, false>(
        512, 512, 128, tile_shape, thread_layout);
}
test/wrapper/test_wrapper_layout.cpp (new file, 474 lines)
@@ -0,0 +1,474 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>

#include "ck/utility/common_header.hpp"

#include "ck/wrapper/layout.hpp"

#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"

class TestWrapperLayout : public ::testing::Test
{
    protected:
    static constexpr auto I0 = ck::Number<0>{};
    static constexpr auto I1 = ck::Number<1>{};

    template <typename Desc,
              typename Desc1d,
              typename LayoutRuntime,
              typename LayoutCompiletime,
              typename Idxs>
    void Run(Desc& desc,
             Desc1d& desc_1d,
             LayoutRuntime& layout_runtime,
             LayoutCompiletime& layout_compiletime,
             const std::vector<Idxs>& idxs)
    {
        // 1d check
        EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
        // Check that the compile-time and runtime layouts give consistent results
        EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));

        for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
        {
            const ck::index_t layout_runtime_offset_1d     = layout_runtime(ck::make_tuple(i));
            const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
            const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
            EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
            EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
        }
        // size(layout)-d check; skip the per-dimension check if access is hierarchical
        if constexpr(!IsNestedTuple(Idxs{}))
        {
            ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
                EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
                EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
                          ck::wrapper::size<d>(layout_compiletime));
            });
        }
        for(const auto idx : idxs)
        {
            const ck::index_t layout_runtime_offset     = layout_runtime(idx);
            const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
            const ck::index_t desc_offset =
                desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
            EXPECT_EQ(layout_runtime_offset, desc_offset);
            EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
        }
    }
};

TEST_F(TestWrapperLayout, 2d)
{
    // dims:(4, 3) strides:(1, 4)
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s1 = 1;
    constexpr ck::index_t s0 = 4;
    const auto desc =
        ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
                                         ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));
    const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
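    // make_layout without explicit strides defaults to a column-major layout, so
    // for dims (4, 3) the strides are (1, 4) and offset(h, w) = h * 1 + w * 4.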
|
||||
const auto layout_compiletime =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
|
||||
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs.emplace_back(h, w);
|
||||
}
|
||||
}
|
||||
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
|
||||
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 3d_nested)
{
    // dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s3 = 2;
    constexpr ck::index_t s2 = 4;
    constexpr ck::index_t s1 = 12;
    constexpr ck::index_t s0 = 48;
    const auto desc = ck::make_naive_tensor_descriptor(
        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));
    const auto desc_3d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
                       ck::make_pass_through_transform(d1),
                       ck::make_pass_through_transform(d0)),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
    const auto layout_runtime =
        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
                                 ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(
            ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
                       ck::Number<s1>{},
                       ck::Number<s0>{}));
    std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;

    for(ck::index_t d = 0; d < d2 * d3; d++)
    {
        for(ck::index_t h = 0; h < d1; h++)
        {
            for(ck::index_t w = 0; w < d0; w++)
            {
                idxs_3d.emplace_back(d, h, w);
            }
        }
    }
    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);

    // Check also 4d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;

    for(ck::index_t e = 0; e < d3; e++)
    {
        for(ck::index_t d = 0; d < d2; d++)
        {
            for(ck::index_t h = 0; h < d1; h++)
            {
                for(ck::index_t w = 0; w < d0; w++)
                {
                    idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}

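// Note (our reading of the mechanism): the same layout is exercised with both a
// flat 3d index (d, h, w), where d runs over the merged extent d3 * d2, and a
// nested 4d index ((e, d), h, w). UnrollNestedTuple flattens the nested form to
// (e, d, h, w) so the original 4d descriptor can be queried, while the wrapper
// layout resolves the nested tuple directly.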
TEST_F(TestWrapperLayout, 2d_nested)
{
    // dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s3 = 2;
    constexpr ck::index_t s2 = 4;
    constexpr ck::index_t s1 = 48;
    constexpr ck::index_t s0 = 12;
    const auto desc = ck::make_naive_tensor_descriptor(
        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));
    const auto desc_2d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
    const auto layout_runtime =
        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
                                 ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
                       ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
                       ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;

    for(ck::index_t h = 0; h < d2 * d3; h++)
    {
        for(ck::index_t w = 0; w < d0 * d1; w++)
        {
            idxs_2d.emplace_back(h, w);
        }
    }
    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
    // Check also 4d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
        idxs_4d;

    for(ck::index_t e = 0; e < d3; e++)
    {
        for(ck::index_t d = 0; d < d2; d++)
        {
            for(ck::index_t h = 0; h < d1; h++)
            {
                for(ck::index_t w = 0; w < d0; w++)
                {
                    idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w));
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}

TEST_F(TestWrapperLayout, 3d_double_nested)
{
    // dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s4 = 2;
    constexpr ck::index_t s3 = 4;
    constexpr ck::index_t s2 = 8;
    constexpr ck::index_t s1 = 96;
    constexpr ck::index_t s0 = 24;
    const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
                                                                      ck::Number<d3>{},
                                                                      ck::Number<d2>{},
                                                                      ck::Number<d1>{},
                                                                      ck::Number<d0>{}),
                                                       ck::make_tuple(ck::Number<s4>{},
                                                                      ck::Number<s3>{},
                                                                      ck::Number<s2>{},
                                                                      ck::Number<s1>{},
                                                                      ck::Number<s0>{}));
    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
        ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));
    const auto desc_3d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
                       ck::make_pass_through_transform(d2),
                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
    const auto desc_2d = transform_tensor_descriptor(
        desc_3d,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
                       ck::make_pass_through_transform(d1 * d0)),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
    const auto layout_runtime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
        ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0)));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(
            ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
            ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
        ck::make_tuple(
            ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
            ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;

    for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
    {
        for(ck::index_t w = 0; w < d0 * d1; w++)
        {
            idxs_2d.emplace_back(h, w);
        }
    }
    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
    // Check also 3d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;

    for(ck::index_t d = 0; d < d3 * d4; d++)
    {
        for(ck::index_t h = 0; h < d2; h++)
        {
            for(ck::index_t w = 0; w < d1 * d0; w++)
            {
                idxs_3d.emplace_back(ck::make_tuple(d, h), w);
            }
        }
    }
    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
    // Check also 5d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
                          ck::Tuple<ck::index_t, ck::index_t>>>
        idxs_5d;

    for(ck::index_t f = 0; f < d4; f++)
    {
        for(ck::index_t e = 0; e < d3; e++)
        {
            for(ck::index_t d = 0; d < d2; d++)
            {
                for(ck::index_t h = 0; h < d1; h++)
                {
                    for(ck::index_t w = 0; w < d0; w++)
                    {
                        idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
                                             ck::make_tuple(h, w));
                    }
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
}

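// Worked example (illustration only): with strides (((s4, s3), s2), (s1, s0)) =
// (((2, 4), 8), (96, 24)), the offset of the 5d index (((f, e), d), (h, w)) is
// f * 2 + e * 4 + d * 8 + h * 96 + w * 24; e.g. (((1, 1), 2), (0, 1)) maps to
// 2 + 4 + 16 + 0 + 24 = 46.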
TEST(TestLayoutHelpers, SizeAndGet)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    const auto layout_runtime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));

    // Size of layout
    EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
    EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);

    // Size of dims
    EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
    EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
    EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
    EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);

    // Access through new layout (using get with layout object)
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);

    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
              d4);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
              d3);

    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
}

TEST(TestLayoutHelpers, DepthAndRank)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    const auto layout_runtime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));

    EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
    EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
    EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
    // Check for integer
    EXPECT_EQ(ck::wrapper::depth(d0), 0);

    EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
    EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);
    EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
    // Check for integer
    EXPECT_EQ(ck::wrapper::rank(d0), 1);
}

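// Summary of the convention the assertions above encode: depth counts nesting
// levels (a plain integer has depth 0, a flat tuple depth 1, and so on), while
// rank counts only the top-level entries (a plain integer has rank 1).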
TEST(TestLayoutHelpers, ShapeAndStrides)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s4 = 2;
    constexpr ck::index_t s3 = 4;
    constexpr ck::index_t s2 = 8;
    constexpr ck::index_t s1 = 96;
    constexpr ck::index_t s0 = 24;
    const auto shape_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
    const auto strides_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
        ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
    const auto shape_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto strides_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
    const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);
    const auto layout_compiletime =
        ck::wrapper::make_layout(shape_compiletime, strides_compiletime);

    constexpr bool check_compiletime_shape =
        std::is_same_v<decltype(shape_compiletime),
                       std::remove_reference_t<decltype(shape(layout_compiletime))>>;
    constexpr bool check_runtime_shape =
        std::is_same_v<decltype(shape_runtime),
                       std::remove_reference_t<decltype(shape(layout_runtime))>>;
    EXPECT_TRUE(check_compiletime_shape);
    EXPECT_TRUE(check_runtime_shape);
}

TEST(TestLayoutHelpers, Hierarchical)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    const auto runtime_shape =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto layout_runtime = ck::wrapper::make_layout(runtime_shape);
    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));

    EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);

    EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);

    EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);
    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);
    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);

    EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
}

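// Equivalence sketch (our reading, consistent with the assertions above and in
// SizeAndGet): the multi-index helpers drill into nested modes, so
// size<0, 0>(x) behaves like size(get<0>(get<0>(x))) and get<0, 0, 0>(x) like
// get<0>(get<0>(get<0>(x))).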

115  test/wrapper/test_wrapper_partition.cpp  Normal file
@@ -0,0 +1,115 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>

#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"

TEST(TestPartition, LocalPartition)
{
    const auto shape =
        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
    const auto strides =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);

    std::vector<ck::index_t> data(ck::wrapper::size(layout));
    std::iota(data.begin(), data.end(), 0);

    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    const auto thread_steps = ck::make_tuple(ck::Number<1>{}, ck::Number<8>{}, ck::Number<1>{});
    // row-major thread layout
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}, ck::Number<1>{}),
                                 ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}, ck::Number<1>{}));
    // 3d partition on 2d shape (calculate partition on 3d thread layout, and then skip first dim)
    const auto thread_projection =
        ck::make_tuple(ck::wrapper::slice(4), ck::Number<1>{}, ck::Number<1>{});
    constexpr ck::index_t projection_thread_length = ck::Number<4>{};

    for(ck::index_t thread_id = 0;
        thread_id < ck::wrapper::size(thread_layout) / projection_thread_length;
        thread_id++)
    {
        const auto packed_partition =
            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_projection);

        const auto expected_partition_size =
            ck::wrapper::size(tensor) /
            (ck::wrapper::size(thread_layout) / projection_thread_length);
        const auto expected_partition_first_val = thread_id * ck::wrapper::size<1>(thread_steps);
        const auto expected_partition_second_val = expected_partition_first_val + 1;
        EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
        EXPECT_EQ(packed_partition(0), expected_partition_first_val);
        EXPECT_EQ(packed_partition(1), expected_partition_second_val);
    }
}

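// Note on the projection above (our reading): ck::wrapper::slice(4) marks the
// first thread-layout dimension as projected away, so only the 8 x 1 tail of
// the 4 x 8 x 1 thread layout actually splits the tensor. Each of the 8
// surviving threads then owns size(tensor) / 8 elements starting at
// thread_id * 8, which is exactly what the loop asserts.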
TEST(TestPartition, LocalTile)
{
    const auto shape = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
    const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);

    std::vector<ck::index_t> data(ck::wrapper::size(layout));
    std::iota(data.begin(), data.end(), 0);

    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
    // 4d tile partitioning on 3d shape (calculate tile on 4d tile layout, and then skip last dim)
    const auto block_shape =
        ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{}, ck::Number<2>{});
    const auto block_projection =
        ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(2));

    const auto grid_shape =
        ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
                       ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
                       ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));
    std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t, ck::index_t>> block_idxs;
    for(int i = 0; i < ck::wrapper::size<0>(grid_shape); i++)
    {
        for(int j = 0; j < ck::wrapper::size<1>(grid_shape); j++)
        {
            for(int k = 0; k < ck::wrapper::size<2>(grid_shape); k++)
            {
                block_idxs.emplace_back(i, j, k, 0);
            }
        }
    }

    for(auto block_idx : block_idxs)
    {
        constexpr ck::index_t projection_block_dim = ck::Number<2>{};
        const auto packed_tile =
            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_projection);

        const auto expected_tile_size = ck::wrapper::size(block_shape) / projection_block_dim;
        auto expected_tile_first_val = ck::wrapper::size<2>(block_idx) *
                                       ck::wrapper::size<2>(block_shape) *
                                       ck::wrapper::size<2>(strides);
        expected_tile_first_val += ck::wrapper::size<1>(block_idx) *
                                   ck::wrapper::size<1>(block_shape) *
                                   ck::wrapper::size<1>(strides);
        expected_tile_first_val += ck::wrapper::size<0>(block_idx) *
                                   ck::wrapper::size<0>(block_shape) *
                                   ck::wrapper::size<0>(strides);

        const auto expected_tile_second_val = expected_tile_first_val + 1;
        EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
        EXPECT_EQ(packed_tile(0), expected_tile_first_val);
        EXPECT_EQ(packed_tile(1), expected_tile_second_val);
    }
}

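// Worked example (illustration only): each coordinate contributes
// block_idx_i * block_shape_i * stride_i, so for block_idx (1, 1, 1, 0) the
// expected first value is 1 * 2 * 1 + 1 * 4 * 16 + 1 * 2 * 64 = 2 + 64 + 128 =
// 194, matching the accumulation in the loop body above.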

209  test/wrapper/test_wrapper_tensor.cpp  Normal file
@@ -0,0 +1,209 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <array> // std::array is used by the host-side tests below
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>

#include "ck/library/utility/device_memory.hpp"

#include "ck/host_utility/kernel_launch.hpp"

#include "ck/utility/common_header.hpp"

#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"

// Compare data in tensor with offset from layout.
// Data and offset should match if physical memory has been initialized with
// sequentially increasing values from 0.
template <typename TensorType>
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
{
    const auto& layout = ck::wrapper::layout(tensor);
    for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
    {
        for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
        {
            for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
            {
                const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
                if(tensor(idx) != layout(idx))
                {
                    return false;
                }
            }
        }
    }
    return true;
}

template <typename TensorType>
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
{
    const auto& layout = ck::wrapper::layout(tensor);
    for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
    {
        if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
        {
            return false;
        }
    }
    return true;
}

template <ck::index_t nelems, typename TensorType>
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
{
    const auto& layout = ck::wrapper::layout(tensor);
    bool success = true;
    ck::static_for<0, nelems, 1>{}([&](auto w) {
        if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
        {
            success = false;
        }
    });
    return success;
}

template <typename TensorType>
__host__ __device__ void InitTensor(TensorType& tensor)
{
    for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
    {
        tensor(i) = i;
    }
}

template <ck::index_t nelems, typename TensorType>
__host__ __device__ void StaticInitTensor(TensorType& tensor)
{
    ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
}

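// Note (our reading): the Static* helpers use ck::static_for so every element
// access happens through a compile-time ck::Number index; the tests below use
// them for the register-backed (VGPR) tensor, where indices are expected to be
// known at compile time.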
// Tests
TEST(TestTensor, ReadWriteHostMemory)
{
    constexpr ck::index_t nelems = 8;

    std::array<ck::index_t, nelems> data;
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
    InitTensor(tensor);

    EXPECT_TRUE(TestTensorCheck1d(tensor));
    EXPECT_TRUE(TestTensorCheck3d(tensor));
}

__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
    constexpr ck::index_t nelems = 8;
    __shared__ ck::index_t p_shared[nelems];

    ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
    bool* casted_success_ptr = static_cast<bool*>(success);

    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    constexpr auto vgpr_layout =
        ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));

    auto tensor_global =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            vgpr_layout);

    InitTensor(tensor_global);
    InitTensor(tensor_lds);
    StaticInitTensor<nelems>(tensor_vgpr);

    *casted_success_ptr = TestTensorCheck1d(tensor_global);
    *casted_success_ptr &= TestTensorCheck3d(tensor_global);

    *casted_success_ptr &= TestTensorCheck1d(tensor_lds);
    *casted_success_ptr &= TestTensorCheck3d(tensor_lds);

    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
}

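// Note (our reading): the kernel runs the same init/check round trip over the
// three device memory spaces the wrapper distinguishes -- Global (the buffer
// passed as a kernel argument), Lds (the __shared__ array), and Vgpr (a
// register tensor created in-kernel) -- and folds every result into one
// success flag read back by the host test below.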
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
    constexpr ck::index_t nelems = 8;
    std::array<ck::index_t, nelems> host_data;

    DeviceMem data_buf(nelems * sizeof(ck::index_t));
    data_buf.ToDevice(&host_data[0]);
    DeviceMem success_buf(sizeof(bool));

    launch_and_time_kernel(StreamConfig{},
                           TestTensorReadWriteDevice,
                           dim3(1),
                           dim3(1),
                           0,
                           data_buf.GetDeviceBuffer(),
                           success_buf.GetDeviceBuffer());

    bool success;
    success_buf.FromDevice(&success);
    EXPECT_TRUE(success);
}

TEST(TestTensor, Slicing)
{
    constexpr ck::index_t nelems = 8;

    std::array<ck::index_t, nelems> data;
    const auto shape = ck::make_tuple(ck::make_tuple(2, 2), 2);
    const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4);
    const auto layout = ck::wrapper::make_layout(shape, strides);
    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
    InitTensor(tensor);

    auto tensor2x2x2 =
        tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));

    auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2));

    auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
    EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1)));
    EXPECT_EQ(ck::wrapper::rank(tensor1x1), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor1x1), 2);
    EXPECT_EQ(ck::wrapper::size(tensor1x1), 1);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x1));

    auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
    EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
    EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
    EXPECT_EQ(ck::wrapper::size(tensor2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor2));

    auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2));
    EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1);
    EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1);
    EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor2_v2));

    // negative indexing
    auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
    EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor1x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor1x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor1x2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x2));
}

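// Slicing semantics as the assertions read to us: ck::wrapper::slice(n) keeps a
// whole extent of n, slice(b, e) keeps the half-open range [b, e) (so
// slice(1, 2) has size 1), a bare slice() keeps the full dimension, and a
// negative bound such as slice(0, -2) is resolved against the dimension's
// extent (here yielding a single element). A scalar index drops that mode and
// reduces the tensor's rank.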