mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 09:16:52 +00:00
fix merge from upstream
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
@@ -6,4 +6,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
target_link_libraries(test_batched_gemm PRIVATE utility device_batched_gemm_instance)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
endforeach()
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
@@ -10,4 +10,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
set(target 1)
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
endforeach()
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
@@ -10,4 +10,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
set(target 1)
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
endforeach()
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
@@ -26,4 +26,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
endif()
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
endforeach()
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
@@ -6,4 +6,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
target_link_libraries(test_convnd_bwd_data PRIVATE utility device_conv1d_bwd_data_instance device_conv2d_bwd_data_instance device_conv3d_bwd_data_instance)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
endforeach()
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
list(APPEND gpu_list_xdl gfx908 gfx90a gfx940)
|
||||
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
|
||||
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
|
||||
list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103)
|
||||
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
@@ -17,4 +17,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
target_link_libraries(test_grouped_convnd_bwd_weight_interface PRIVATE utility)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
endforeach()
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test_permute_scale_impl.hpp"
|
||||
#include "profiler/profile_permute_scale_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
@@ -15,15 +15,32 @@ class TestPermute : public ::testing::Test
|
||||
using ADataType = std::tuple_element_t<0, Tuple>;
|
||||
using BDataType = std::tuple_element_t<1, Tuple>;
|
||||
|
||||
void Run()
|
||||
constexpr bool skip_case()
|
||||
{
|
||||
std::vector<std::vector<ck::index_t>> lengths = {
|
||||
{4, 2, 1, 8}, {1, 1, 1, 1}, {16, 8, 32, 64}, {32, 64, 128, 128}};
|
||||
|
||||
for(auto length : lengths)
|
||||
#ifndef CK_ENABLE_FP16
|
||||
if constexpr(ck::is_same_v<ADataType, F16> || ck::is_same_v<BDataType, F16>)
|
||||
{
|
||||
bool success =
|
||||
ck::test_permute_scale_impl<ADataType, BDataType, 4>(true, 2, false, false, length);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#ifndef CK_ENABLE_FP32
|
||||
if constexpr(ck::is_same_v<ADataType, F32> || ck::is_same_v<BDataType, F32>)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
template <ck::index_t NDims>
|
||||
void Run(std::vector<ck::index_t> lengths,
|
||||
std::vector<ck::index_t> input_strides,
|
||||
std::vector<ck::index_t> output_strides)
|
||||
{
|
||||
if(!skip_case())
|
||||
{
|
||||
bool success = ck::profiler::profile_permute_scale_impl<ADataType, BDataType, NDims>(
|
||||
true, 2, false, false, lengths, input_strides, output_strides);
|
||||
EXPECT_TRUE(success);
|
||||
}
|
||||
}
|
||||
@@ -32,5 +49,52 @@ class TestPermute : public ::testing::Test
|
||||
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
|
||||
|
||||
TYPED_TEST_SUITE(TestPermute, KernelTypes);
|
||||
TYPED_TEST(TestPermute, Test_FP16) { this->Run(); }
|
||||
TYPED_TEST(TestPermute, Test_FP32) { this->Run(); }
|
||||
// 1-D cases: unit stride on both sides, output stride of 2, and a single element.
TYPED_TEST(TestPermute, Test1D)
{
    constexpr ck::index_t kRank = 1;
    using Dims                  = std::vector<ck::index_t>;

    const Dims lengths{16};
    const Dims unit_stride{1};
    const Dims stride_two{2};
    const Dims single{1};

    this->template Run<kRank>(lengths, unit_stride, unit_stride);
    this->template Run<kRank>(lengths, unit_stride, stride_two);
    this->template Run<kRank>(single, single, single);
}
|
||||
|
||||
// 2-D cases: an 8 x 16 tensor converted between packed row-major strides
// {16, 1} and packed column-major strides {1, 8} in both directions, plus a
// single-element tensor.
TYPED_TEST(TestPermute, Test2D)
{
    constexpr ck::index_t kRank = 2;
    using Dims                  = std::vector<ck::index_t>;

    const Dims lengths{8, 16};
    const Dims row_major{16, 1};
    const Dims col_major{1, 8};
    const Dims ones{1, 1};

    this->template Run<kRank>(lengths, row_major, col_major);
    this->template Run<kRank>(lengths, col_major, row_major);
    this->template Run<kRank>(ones, ones, ones);
}
|
||||
|
||||
// 3-D cases: an 8 x 2 x 8 tensor converted between packed row-major strides
// {16, 8, 1} and packed column-major strides {1, 8, 16} in both directions,
// plus a single-element tensor.
TYPED_TEST(TestPermute, Test3D)
{
    constexpr ck::index_t kRank = 3;
    using Dims                  = std::vector<ck::index_t>;

    const Dims lengths{8, 2, 8};
    const Dims row_major{16, 8, 1};
    const Dims col_major{1, 8, 16};
    const Dims ones{1, 1, 1};

    this->template Run<kRank>(lengths, row_major, col_major);
    this->template Run<kRank>(lengths, col_major, row_major);
    this->template Run<kRank>(ones, ones, ones);
}
|
||||
|
||||
// 4-D cases: an 8 x 2 x 3 x 8 tensor converted between packed row-major
// strides {48, 24, 8, 1} and packed column-major strides {1, 8, 16, 48} in
// both directions, plus a single-element tensor.
TYPED_TEST(TestPermute, Test4D)
{
    constexpr ck::index_t kRank = 4;
    using Dims                  = std::vector<ck::index_t>;

    const Dims lengths{8, 2, 3, 8};
    const Dims row_major{48, 24, 8, 1};
    const Dims col_major{1, 8, 16, 48};
    const Dims ones{1, 1, 1, 1};

    this->template Run<kRank>(lengths, row_major, col_major);
    this->template Run<kRank>(lengths, col_major, row_major);
    this->template Run<kRank>(ones, ones, ones);
}
|
||||
|
||||
// 5-D cases: an 8 x 2 x 3 x 4 x 8 tensor converted between packed row-major
// strides {192, 96, 32, 8, 1} and packed column-major strides
// {1, 8, 16, 48, 192} in both directions, plus a single-element tensor.
TYPED_TEST(TestPermute, Test5D)
{
    constexpr ck::index_t kRank = 5;
    using Dims                  = std::vector<ck::index_t>;

    const Dims lengths{8, 2, 3, 4, 8};
    const Dims row_major{192, 96, 32, 8, 1};
    const Dims col_major{1, 8, 16, 48, 192};
    const Dims ones{1, 1, 1, 1, 1};

    this->template Run<kRank>(lengths, row_major, col_major);
    this->template Run<kRank>(lengths, col_major, row_major);
    this->template Run<kRank>(ones, ones, ones);
}
|
||||
|
||||
// 6-D cases: an 8 x 2 x 3 x 4 x 5 x 8 tensor converted between packed
// row-major strides {960, 480, 160, 40, 8, 1} and packed column-major strides
// {1, 8, 16, 48, 192, 960} in both directions, plus a single-element tensor.
TYPED_TEST(TestPermute, Test6D)
{
    constexpr ck::index_t kRank = 6;
    using Dims                  = std::vector<ck::index_t>;

    const Dims lengths{8, 2, 3, 4, 5, 8};
    const Dims row_major{960, 480, 160, 40, 8, 1};
    const Dims col_major{1, 8, 16, 48, 192, 960};
    const Dims ones{1, 1, 1, 1, 1, 1};

    this->template Run<kRank>(lengths, row_major, col_major);
    this->template Run<kRank>(lengths, col_major, row_major);
    this->template Run<kRank>(ones, ones, ones);
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
|
||||
@@ -1,14 +1,21 @@
|
||||
add_gtest_executable(test_layout test_layout.cpp)
|
||||
target_link_libraries(test_layout PRIVATE utility)
|
||||
add_gtest_executable(test_tensor test_tensor.cpp)
|
||||
target_link_libraries(test_tensor PRIVATE utility)
|
||||
add_gtest_executable(test_copy test_copy.cpp)
|
||||
target_link_libraries(test_copy PRIVATE utility)
|
||||
add_gtest_executable(test_partition test_partition.cpp)
|
||||
target_link_libraries(test_partition PRIVATE utility)
|
||||
add_custom_target(test_wrapper)
|
||||
|
||||
add_gtest_executable(test_wrapper_layout test_wrapper_layout.cpp)
|
||||
target_link_libraries(test_wrapper_layout PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_layout)
|
||||
add_gtest_executable(test_wrapper_tensor test_wrapper_tensor.cpp)
|
||||
target_link_libraries(test_wrapper_tensor PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_tensor)
|
||||
add_gtest_executable(test_wrapper_copy test_wrapper_copy.cpp)
|
||||
target_link_libraries(test_wrapper_copy PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_copy)
|
||||
add_gtest_executable(test_wrapper_partition test_wrapper_partition.cpp)
|
||||
target_link_libraries(test_wrapper_partition PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_partition)
|
||||
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
|
||||
GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
|
||||
GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950")
|
||||
add_gtest_executable(test_gemm test_gemm.cpp)
|
||||
target_link_libraries(test_gemm PRIVATE utility)
|
||||
GPU_TARGETS MATCHES "gfx942")
|
||||
add_gtest_executable(test_wrapper_gemm test_wrapper_gemm.cpp)
|
||||
target_link_libraries(test_wrapper_gemm PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_gemm)
|
||||
endif()
|
||||
|
||||
135
test/wrapper/test_wrapper_copy.cpp
Normal file
135
test/wrapper/test_wrapper_copy.cpp
Normal file
@@ -0,0 +1,135 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
|
||||
// Test copy from Global to Global through LDS and VGPR
|
||||
template <typename InputTensor,
|
||||
typename OutputTensor,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout,
|
||||
bool UseOptimizedCopy>
|
||||
__global__ void TestCopyDevice(const InputTensor input_tensor,
|
||||
OutputTensor output_tensor,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayout thread_layout)
|
||||
{
|
||||
__shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
|
||||
const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
p_shared, ck::wrapper::make_layout(tile_shape));
|
||||
|
||||
const auto block_idxs =
|
||||
ck::make_tuple(static_cast<ck::index_t>(blockIdx.x), static_cast<ck::index_t>(blockIdx.y));
|
||||
|
||||
// Get local tiles for global memory
|
||||
const auto input_local_tile =
|
||||
ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs);
|
||||
const auto output_local_tile =
|
||||
ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs);
|
||||
|
||||
// Get partition per thread
|
||||
const auto input_local_partition =
|
||||
ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
|
||||
auto lds_local_partition =
|
||||
ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
|
||||
auto output_local_partition =
|
||||
ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
|
||||
|
||||
// Allocate VGPR
|
||||
auto tensor_vgpr =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
|
||||
ck::wrapper::make_layout(shape(lds_local_partition)));
|
||||
|
||||
// Perform copy
|
||||
if constexpr(UseOptimizedCopy)
|
||||
{
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>>;
|
||||
constexpr ck::index_t vector_dim = 0;
|
||||
constexpr ck::index_t scalar_per_vector = 2;
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
|
||||
lds_local_partition);
|
||||
// TODO: Enable optimized copy for static buffers
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
|
||||
tensor_vgpr);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
|
||||
output_local_partition);
|
||||
}
|
||||
else
|
||||
{
|
||||
ck::wrapper::copy(input_local_partition, lds_local_partition);
|
||||
ck::wrapper::copy(lds_local_partition, tensor_vgpr);
|
||||
ck::wrapper::copy(tensor_vgpr, output_local_partition);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool UseOptimizedCopy>
|
||||
void PerformCopyGlobalToGlobalViaLDS()
|
||||
{
|
||||
const auto shape =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{});
|
||||
const auto strides =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
|
||||
const auto layout = ck::wrapper::make_layout(shape, strides);
|
||||
|
||||
// 0, 1, 2, ..., size(shape) - 1
|
||||
std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
|
||||
std::iota(input_data.begin(), input_data.end(), 0);
|
||||
|
||||
// Global memory buffers
|
||||
DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
|
||||
DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
|
||||
|
||||
in_buf.ToDevice(input_data.data());
|
||||
out_buf.SetZero();
|
||||
|
||||
// Create tensors for global memory
|
||||
const auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const ck::index_t*>(in_buf.GetDeviceBuffer()), layout);
|
||||
auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);
|
||||
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<1>{}, ck::Number<32>{}));
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});
|
||||
|
||||
const ck::index_t grid_size_x = ck::math::integer_divide_ceil(
|
||||
ck::wrapper::size<0>(input_tensor_global), ck::wrapper::size<0>(tile_shape));
|
||||
const ck::index_t grid_size_y = ck::math::integer_divide_ceil(
|
||||
ck::wrapper::size<1>(input_tensor_global), ck::wrapper::size<1>(tile_shape));
|
||||
|
||||
const auto kernel = TestCopyDevice<decltype(input_tensor_global),
|
||||
decltype(output_tensor_global),
|
||||
decltype(tile_shape),
|
||||
decltype(thread_layout),
|
||||
UseOptimizedCopy>;
|
||||
launch_and_time_kernel(StreamConfig{},
|
||||
kernel,
|
||||
dim3(grid_size_x, grid_size_y, 1),
|
||||
dim3(ck::wrapper::size(thread_layout)),
|
||||
0,
|
||||
input_tensor_global,
|
||||
output_tensor_global,
|
||||
tile_shape,
|
||||
thread_layout);
|
||||
|
||||
// Verify results
|
||||
std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
|
||||
out_buf.FromDevice(output_data.data());
|
||||
EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
|
||||
}
|
||||
|
||||
// Exercises the generic ck::wrapper::copy overload.
TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy)
{
    constexpr bool kUseOptimizedCopy = false;
    PerformCopyGlobalToGlobalViaLDS<kUseOptimizedCopy>();
}

// Exercises the ck::wrapper::copy overload with explicit vectorization settings.
TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy)
{
    constexpr bool kUseOptimizedCopy = true;
    PerformCopyGlobalToGlobalViaLDS<kUseOptimizedCopy>();
}
|
||||
376
test/wrapper/test_wrapper_gemm.cpp
Normal file
376
test/wrapper/test_wrapper_gemm.cpp
Normal file
@@ -0,0 +1,376 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
#include "ck/wrapper/operations/gemm.hpp"
|
||||
#include "ck/wrapper/utils/kernel_utils.hpp"
|
||||
|
||||
template <typename DataType>
|
||||
void CheckResult(const std::vector<DataType>& a_data,
|
||||
const std::vector<DataType>& b_data,
|
||||
std::vector<DataType>& c_m_n_device_result,
|
||||
const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K)
|
||||
{
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
ReferenceGemm<DataType, DataType, DataType, float, PassThrough, PassThrough, PassThrough>;
|
||||
|
||||
Tensor<DataType> a_m_k(HostTensorDescriptor({M, K}));
|
||||
Tensor<DataType> b_k_n(HostTensorDescriptor({K, N}, {1, K}));
|
||||
Tensor<DataType> c_m_n_host_result(HostTensorDescriptor({M, N}));
|
||||
|
||||
a_m_k.mData = a_data;
|
||||
b_k_n.mData = b_data;
|
||||
|
||||
auto ref_op = ReferenceGemmInstance{};
|
||||
auto ref_invoker = ref_op.MakeInvoker();
|
||||
auto ref_argument = ref_op.MakeArgument(
|
||||
a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
EXPECT_TRUE(ck::utils::check_err(c_m_n_device_result, c_m_n_host_result.mData));
|
||||
}
|
||||
|
||||
template <bool DoPad, typename Layout, typename PaddingDims>
|
||||
__device__ auto ApplyPadding(const Layout& layout, const PaddingDims& padding_dims)
|
||||
{
|
||||
if constexpr(DoPad)
|
||||
{
|
||||
return ck::wrapper::pad(layout, padding_dims);
|
||||
}
|
||||
else
|
||||
{
|
||||
return layout;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout,
|
||||
bool DoPadding>
|
||||
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
|
||||
const void* p_b,
|
||||
void* p_c,
|
||||
const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayout thread_layout)
|
||||
{
|
||||
constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
|
||||
constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
|
||||
constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);
|
||||
constexpr auto K1 = GemmTraits::K1;
|
||||
constexpr auto K0PerBlock = KPerBlock / K1;
|
||||
const auto K0 = ck::math::integer_divide_ceil(K, K1);
|
||||
|
||||
const auto tile_shape_k0_m_n_k1 = ck::make_tuple(K0PerBlock, MPerBlock, NPerBlock, K1);
|
||||
|
||||
const auto a_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
|
||||
const auto b_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
|
||||
const auto c_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
|
||||
|
||||
auto a_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(a_global_layout, ck::make_tuple(MPerBlock, KPerBlock));
|
||||
auto b_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(b_global_layout, ck::make_tuple(NPerBlock, KPerBlock));
|
||||
auto c_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(c_global_layout, ck::make_tuple(MPerBlock, NPerBlock));
|
||||
|
||||
// Reshape from M,K to K0,M,K1
|
||||
const auto reshaped_dims_idxs =
|
||||
ck::make_tuple(ck::Number<1>{}, ck::make_tuple(ck::Number<0>{}, ck::Number<2>{}));
|
||||
auto a_padded_unmerged_global_layout =
|
||||
ck::wrapper::unmerge<1>(a_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
|
||||
auto b_padded_unmerged_global_layout =
|
||||
ck::wrapper::unmerge<1>(b_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
|
||||
|
||||
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_a), a_padded_unmerged_global_layout);
|
||||
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_b), b_padded_unmerged_global_layout);
|
||||
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<DataType*>(p_c), c_padded_global_layout);
|
||||
|
||||
// Add extra M and N
|
||||
constexpr auto a_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(K0PerBlock, MPerBlock, K1),
|
||||
ck::make_tuple((MPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
|
||||
constexpr auto b_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(K0PerBlock, NPerBlock, K1),
|
||||
ck::make_tuple((NPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
|
||||
|
||||
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout) + NPerBlock];
|
||||
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout) + NPerBlock];
|
||||
|
||||
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_a), a_tile_layout);
|
||||
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_b), b_tile_layout);
|
||||
|
||||
const auto block_idxs = ck::make_tuple(ck::wrapper::slice(),
|
||||
static_cast<ck::index_t>(blockIdx.x),
|
||||
static_cast<ck::index_t>(blockIdx.y),
|
||||
ck::wrapper::slice());
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>, ck::Number<2>>;
|
||||
constexpr ck::index_t vector_dim = 2;
|
||||
|
||||
auto c_global_local_tile =
|
||||
ck::wrapper::make_local_tile(c_global_tensor,
|
||||
tile_shape_k0_m_n_k1,
|
||||
block_idxs,
|
||||
make_tuple(ck::wrapper::slice(K0PerBlock),
|
||||
ck::Number<1>{},
|
||||
ck::Number<1>{},
|
||||
ck::wrapper::slice(K1)));
|
||||
auto c_global_local_partition =
|
||||
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>(c_global_local_tile);
|
||||
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>();
|
||||
ck::wrapper::clear(c_vgpr_reg);
|
||||
|
||||
auto a_lds_tensor_local_partition =
|
||||
ck::wrapper::make_local_partition(a_lds_tensor, thread_layout, threadIdx.x);
|
||||
auto b_lds_tensor_local_partition =
|
||||
ck::wrapper::make_local_partition(b_lds_tensor, thread_layout, threadIdx.x);
|
||||
|
||||
auto make_global_partition = [&](auto tensor, auto projection, ck::index_t i) {
|
||||
const auto k_slice =
|
||||
ck::make_tuple(ck::wrapper::slice(i * K0PerBlock, (i + 1) * K0PerBlock),
|
||||
ck::wrapper::slice(),
|
||||
ck::wrapper::slice());
|
||||
auto local_tile = ck::wrapper::make_local_tile(
|
||||
tensor(k_slice), tile_shape_k0_m_n_k1, block_idxs, projection);
|
||||
return ck::wrapper::make_local_partition(local_tile, thread_layout, threadIdx.x);
|
||||
};
|
||||
|
||||
auto a_global_local_partition = make_global_partition(
|
||||
a_global_tensor,
|
||||
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
|
||||
0);
|
||||
auto b_global_local_partition = make_global_partition(
|
||||
b_global_tensor,
|
||||
make_tuple(ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
|
||||
0);
|
||||
|
||||
// (row-major vgpr layout)
|
||||
auto a_vgpr_tensor =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
|
||||
ck::wrapper::make_layout(
|
||||
shape(a_global_local_partition),
|
||||
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::Number<1>{})));
|
||||
auto b_vgpr_tensor =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
|
||||
ck::wrapper::make_layout(
|
||||
shape(b_global_local_partition),
|
||||
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::Number<1>{})));
|
||||
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_global_local_partition,
|
||||
a_vgpr_tensor);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_global_local_partition,
|
||||
b_vgpr_tensor);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_vgpr_tensor,
|
||||
a_lds_tensor_local_partition);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_vgpr_tensor,
|
||||
b_lds_tensor_local_partition);
|
||||
|
||||
const ck::index_t num_loop =
|
||||
__builtin_amdgcn_readfirstlane(ck::math::integer_divide_ceil(K, KPerBlock));
|
||||
if(num_loop > 1)
|
||||
{
|
||||
ck::index_t i = 0;
|
||||
do
|
||||
{
|
||||
auto a_global_local_partition_i = make_global_partition(
|
||||
a_global_tensor,
|
||||
make_tuple(
|
||||
ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
|
||||
i + 1);
|
||||
auto b_global_local_partition_i = make_global_partition(
|
||||
b_global_tensor,
|
||||
make_tuple(
|
||||
ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
|
||||
i + 1);
|
||||
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_global_local_partition_i, a_vgpr_tensor);
|
||||
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_global_local_partition_i, b_vgpr_tensor);
|
||||
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_vgpr_tensor, a_lds_tensor_local_partition);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_vgpr_tensor, b_lds_tensor_local_partition);
|
||||
|
||||
++i;
|
||||
} while(i < (num_loop - 1));
|
||||
}
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
|
||||
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
|
||||
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
bool DoPadding,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout>
|
||||
void PerformGemm(const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape& tile_shape,
|
||||
const ThreadLayout& thread_layout)
|
||||
{
|
||||
// Global memory buffers
|
||||
DeviceMem a_mem(M * K * sizeof(DataType));
|
||||
DeviceMem b_mem(K * N * sizeof(DataType));
|
||||
DeviceMem c_mem(M * N * sizeof(DataType));
|
||||
|
||||
std::vector<DataType> a_data(M * K);
|
||||
std::vector<DataType> b_data(K * N);
|
||||
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(a_data);
|
||||
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(b_data);
|
||||
|
||||
a_mem.ToDevice(a_data.data());
|
||||
b_mem.ToDevice(b_data.data());
|
||||
c_mem.SetZero();
|
||||
|
||||
const ck::index_t grid_size_x =
|
||||
ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
|
||||
const ck::index_t grid_size_y =
|
||||
ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
|
||||
|
||||
const auto kernel =
|
||||
DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout, DoPadding>;
|
||||
const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
|
||||
kernel,
|
||||
dim3(grid_size_x, grid_size_y, 1),
|
||||
dim3(ck::wrapper::size(thread_layout)),
|
||||
0,
|
||||
a_mem.GetDeviceBuffer(),
|
||||
b_mem.GetDeviceBuffer(),
|
||||
c_mem.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
tile_shape,
|
||||
thread_layout);
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << std::endl;
|
||||
|
||||
std::vector<DataType> c_data(M * N);
|
||||
c_mem.FromDevice(c_data.data());
|
||||
CheckResult<DataType>(a_data, b_data, c_data, M, N, K);
|
||||
}
|
||||
|
||||
// float GEMM with the 2x2-XDL-per-wave traits (K1 = 4) on a 128 x 128 x 16 tile.
TEST(TestGemm, Float)
{
    using DataType = float;
    using Traits   = ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1;

    // (MPerBlock, NPerBlock, KPerBlock)
    const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<16>{});
    // Thread layout of shape (4, 64, 1) with strides (1, 4, 1).
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
                                 ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));

    // Tile-aligned sizes: 4 scalars per vector, no padding.
    PerformGemm<DataType, Traits, 4, false>(512, 512, 128, tile_shape, thread_layout);
    // Irregular case: sizes not divisible by the tile, scalar access with padding.
    PerformGemm<DataType, Traits, 1, true>(129, 129, 67, tile_shape, thread_layout);
}
|
||||
|
||||
// int8 GEMM with the 2x2-XDL-per-wave traits (K1 = 16) on a 128 x 128 x 64 tile.
TEST(TestGemm, Int8)
{
    using DataType = int8_t;
    using Traits   = ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1;

    // (MPerBlock, NPerBlock, KPerBlock)
    const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
    // Thread layout of shape (4, 64, 1) with strides (1, 4, 1).
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
                                 ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));

    // Tile-aligned sizes: 16 scalars per vector, no padding.
    PerformGemm<DataType, Traits, 16, false>(512, 512, 128, tile_shape, thread_layout);
    // Irregular case: sizes not divisible by the tile, scalar access with padding.
    PerformGemm<DataType, Traits, 1, true>(129, 129, 67, tile_shape, thread_layout);
}
|
||||
|
||||
// Runs the half-precision GEMM twice with the 32x32 XDL, 2x2-XDL-per-wave, K1=8
// traits: first with sizes (512, 512, 128), then with sizes (129, 129, 67) that
// are not multiples of the 128/128/32 tile extents.
TEST(TestGemm, Half)
{
    using DataType = ck::half_t;
    using Traits   = ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1;

    const auto thread_shape   = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{});
    const auto thread_strides = ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{});
    const auto thread_layout  = ck::wrapper::make_layout(thread_shape, thread_strides);

    const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<32>{});

    // Regular case
    PerformGemm<DataType, Traits, 8, false>(512, 512, 128, tile_shape, thread_layout);

    // Irregular case
    PerformGemm<DataType, Traits, 1, true>(129, 129, 67, tile_shape, thread_layout);
}
|
||||
|
||||
// Runs the float GEMM once with the 32x32 XDL, 4x2-XDL-per-wave, K1=4 traits on a
// 256x128x16 tile shape and sizes (512, 512, 128).
TEST(TestGemm, Float_2x4_4x2_XdlPerWave)
{
    using DataType = float;
    using Traits   = ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_4K1;

    const auto thread_shape   = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{});
    const auto thread_strides = ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{});
    const auto thread_layout  = ck::wrapper::make_layout(thread_shape, thread_strides);

    const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<16>{});

    PerformGemm<DataType, Traits, 4, false>(512, 512, 128, tile_shape, thread_layout);
}
|
||||
474
test/wrapper/test_wrapper_layout.cpp
Normal file
474
test/wrapper/test_wrapper_layout.cpp
Normal file
@@ -0,0 +1,474 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
|
||||
class TestWrapperLayout : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
static constexpr auto I0 = ck::Number<0>{};
|
||||
static constexpr auto I1 = ck::Number<1>{};
|
||||
|
||||
template <typename Desc,
|
||||
typename Desc1d,
|
||||
typename LayoutRuntime,
|
||||
typename LayoutCompiletime,
|
||||
typename Idxs>
|
||||
void Run(Desc& desc,
|
||||
Desc1d& desc_1d,
|
||||
LayoutRuntime& layout_runtime,
|
||||
LayoutCompiletime& layout_compiletime,
|
||||
const std::vector<Idxs>& idxs)
|
||||
{
|
||||
// 1d check
|
||||
EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
|
||||
// Check layout compiletime and runtime result consistency
|
||||
EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
|
||||
|
||||
for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i));
|
||||
const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
|
||||
const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
|
||||
EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
|
||||
EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
|
||||
}
|
||||
// size(layout)-d check, don't check if access is hierarchical
|
||||
if constexpr(!IsNestedTuple(Idxs{}))
|
||||
{
|
||||
ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
|
||||
EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
|
||||
EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
|
||||
ck::wrapper::size<d>(layout_compiletime));
|
||||
});
|
||||
}
|
||||
for(const auto idx : idxs)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset = layout_runtime(idx);
|
||||
const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
|
||||
const ck::index_t desc_offset =
|
||||
desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
|
||||
EXPECT_EQ(layout_runtime_offset, desc_offset);
|
||||
EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Flat 2d layout: a stride-less runtime layout and an explicitly strided compile-time
// layout are compared with a naive descriptor over every (row, column) index.
TEST_F(TestWrapperLayout, 2d)
{
    // dims:(4, 3) strides:(1, 4)
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s1 = 1;
    constexpr ck::index_t s0 = 4;

    const auto shape_compiletime   = ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{});
    const auto strides_compiletime = ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{});

    const auto desc = ck::make_naive_tensor_descriptor(shape_compiletime, strides_compiletime);
    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));

    // The runtime layout is built from the shape alone (no strides passed).
    const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
    const auto layout_compiletime =
        ck::wrapper::make_layout(shape_compiletime, strides_compiletime);

    // Enumerate every (row, column) pair, rows outermost.
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> all_idxs;
    for(ck::index_t row = 0; row < d1; ++row)
    {
        for(ck::index_t col = 0; col < d0; ++col)
        {
            all_idxs.emplace_back(row, col);
        }
    }

    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, all_idxs);
}
|
||||
|
||||
// Layout whose first dimension is itself a nested (2, 3) dimension. The layouts are
// checked twice: once through flat 3d indices against a descriptor that merges the
// nested dimension, and once through hierarchical indices against the 4d descriptor.
TEST_F(TestWrapperLayout, 3d_nested)
{
    // dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s3 = 2;
    constexpr ck::index_t s2 = 4;
    constexpr ck::index_t s1 = 12;
    constexpr ck::index_t s0 = 48;
    const auto desc = ck::make_naive_tensor_descriptor(
        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));
    // Merge the nested (d3, d2) pair and pass the remaining two dimensions through.
    // Descriptor dimension 3 has length d0, so its pass-through must use d0. The
    // previous code passed d2, which only worked because d2 == d0 == 3 here.
    const auto desc_3d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
                       ck::make_pass_through_transform(d1),
                       ck::make_pass_through_transform(d0)),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
    const auto layout_runtime =
        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
                                 ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(
            ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
                       ck::Number<s1>{},
                       ck::Number<s0>{}));

    // Flat 3d iteration: the nested dimension is addressed with one merged index.
    std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;

    for(ck::index_t d = 0; d < d2 * d3; d++)
    {
        for(ck::index_t h = 0; h < d1; h++)
        {
            for(ck::index_t w = 0; w < d0; w++)
            {
                idxs_3d.emplace_back(d, h, w);
            }
        }
    }
    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);

    // Check also 4d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;

    for(ck::index_t e = 0; e < d3; e++)
    {
        for(ck::index_t d = 0; d < d2; d++)
        {
            for(ck::index_t h = 0; h < d1; h++)
            {
                for(ck::index_t w = 0; w < d0; w++)
                {
                    idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
|
||||
|
||||
// Layout made of two nested dimensions. The layouts are checked through flat 2d
// indices against a descriptor that merges each nested pair, and through
// hierarchical indices against the original 4d descriptor.
TEST_F(TestWrapperLayout, 2d_nested)
{
    // dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s3 = 2;
    constexpr ck::index_t s2 = 4;
    constexpr ck::index_t s1 = 48;
    constexpr ck::index_t s0 = 12;

    // Reference 4d descriptor with every nested dimension listed separately.
    const auto desc = ck::make_naive_tensor_descriptor(
        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));

    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));

    // Each nested pair collapsed into a single dimension.
    const auto desc_2d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));

    const auto shape_runtime = ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0));
    const auto strides_runtime = ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0));
    const auto layout_runtime  = ck::wrapper::make_layout(shape_runtime, strides_runtime);

    const auto shape_compiletime =
        ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
                       ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
    const auto strides_compiletime =
        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
                       ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
    const auto layout_compiletime =
        ck::wrapper::make_layout(shape_compiletime, strides_compiletime);

    // Flat 2d iteration over the merged dimensions.
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> flat_idxs;
    for(ck::index_t first = 0; first < d2 * d3; ++first)
    {
        for(ck::index_t second = 0; second < d0 * d1; ++second)
        {
            flat_idxs.emplace_back(first, second);
        }
    }
    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, flat_idxs);

    // Check also 4d iteration
    using NestedPair = ck::Tuple<ck::index_t, ck::index_t>;
    std::vector<ck::Tuple<NestedPair, NestedPair>> nested_idxs;
    for(ck::index_t i3 = 0; i3 < d3; ++i3)
    {
        for(ck::index_t i2 = 0; i2 < d2; ++i2)
        {
            for(ck::index_t i1 = 0; i1 < d1; ++i1)
            {
                for(ck::index_t i0 = 0; i0 < d0; ++i0)
                {
                    nested_idxs.emplace_back(ck::make_tuple(i3, i2), ck::make_tuple(i1, i0));
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, nested_idxs);
}
|
||||
|
||||
// Layout with two levels of nesting in its first dimension. The layouts are checked
// through flat 2d indices, partially nested 3d indices and fully nested 5d indices,
// each against the reference descriptor of matching rank.
TEST_F(TestWrapperLayout, 3d_double_nested)
{
    // dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s4 = 2;
    constexpr ck::index_t s3 = 4;
    constexpr ck::index_t s2 = 8;
    constexpr ck::index_t s1 = 96;
    constexpr ck::index_t s0 = 24;
    const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
                                                                      ck::Number<d3>{},
                                                                      ck::Number<d2>{},
                                                                      ck::Number<d1>{},
                                                                      ck::Number<d0>{}),
                                                       ck::make_tuple(ck::Number<s4>{},
                                                                      ck::Number<s3>{},
                                                                      ck::Number<s2>{},
                                                                      ck::Number<s1>{},
                                                                      ck::Number<s0>{}));
    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
        ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));
    // Innermost (d4, d3) pair and the (d1, d0) pair merged; d2 passed through.
    const auto desc_3d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
                       ck::make_pass_through_transform(d2),
                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
    // First two dimensions of desc_3d merged again to obtain the flat 2d view.
    const auto desc_2d = transform_tensor_descriptor(
        desc_3d,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
                       ck::make_pass_through_transform(d1 * d0)),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
    // The stride tuples must carry s4 for the innermost dimension. The previous code
    // passed the dimension length d4, which only worked because d4 == s4 == 2 here.
    const auto layout_runtime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
        ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0)));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(
            ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
            ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
        ck::make_tuple(
            ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
            ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));

    // Flat 2d iteration over the fully merged dimensions.
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;

    for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
    {
        for(ck::index_t w = 0; w < d0 * d1; w++)
        {
            idxs_2d.emplace_back(h, w);
        }
    }
    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);

    // Check also 3d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;

    for(ck::index_t d = 0; d < d3 * d4; d++)
    {
        for(ck::index_t h = 0; h < d2; h++)
        {
            for(ck::index_t w = 0; w < d1 * d0; w++)
            {
                idxs_3d.emplace_back(ck::make_tuple(d, h), w);
            }
        }
    }
    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);

    // Check also 5d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
                          ck::Tuple<ck::index_t, ck::index_t>>>
        idxs_5d;

    for(ck::index_t f = 0; f < d4; f++)
    {
        for(ck::index_t e = 0; e < d3; e++)
        {
            for(ck::index_t d = 0; d < d2; d++)
            {
                for(ck::index_t h = 0; h < d1; h++)
                {
                    for(ck::index_t w = 0; w < d0; w++)
                    {
                        idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
                                             ck::make_tuple(h, w));
                    }
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
}
|
||||
|
||||
// Exercises ck::wrapper::size and ck::wrapper::get on a doubly nested layout, for
// both the runtime-shaped and the compile-time-shaped variant.
TEST(TestLayoutHelpers, SizeAndGet)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;

    const auto shape_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto shape_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));

    const auto layout_runtime     = ck::wrapper::make_layout(shape_runtime);
    const auto layout_compiletime = ck::wrapper::make_layout(shape_compiletime);

    // Whole layout: product of all five lengths.
    EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
    EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);

    // Top-level dimensions.
    EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
    EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
    EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
    EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);

    // One level down: get<0> returns a sub-layout that can be queried with size again.
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);

    // Two levels down: the innermost (d4, d3) pair.
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
              d4);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
              d3);

    // Same query as the d2 check above, repeated.
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);

    // Second top-level dimension: the (d1, d0) pair.
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
}
|
||||
|
||||
// Exercises ck::wrapper::depth and ck::wrapper::rank on layouts, on a plain nested
// tuple and on a bare integer.
TEST(TestLayoutHelpers, DepthAndRank)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;

    const auto shape_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto shape_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));

    const auto layout_runtime     = ck::wrapper::make_layout(shape_runtime);
    const auto layout_compiletime = ck::wrapper::make_layout(shape_compiletime);

    // Depth: nesting levels of the shape.
    EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
    EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
    EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
    // A bare integer is expected to have depth 0.
    EXPECT_EQ(ck::wrapper::depth(d0), 0);

    // Rank: number of top-level dimensions.
    EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
    EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);
    EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
    // A bare integer is expected to have rank 1.
    EXPECT_EQ(ck::wrapper::rank(d0), 1);
}
|
||||
|
||||
// Checks that shape(layout) has exactly the type of the shape tuple the layout was
// built from, for both the compile-time and the runtime variant. Only the shape
// type is compared; the strides are used solely to construct the layouts.
TEST(TestLayoutHelpers, ShapeAndStrides)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s4 = 2;
    constexpr ck::index_t s3 = 4;
    constexpr ck::index_t s2 = 8;
    constexpr ck::index_t s1 = 96;
    constexpr ck::index_t s0 = 24;

    // Compile-time variant.
    const auto shape_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
    const auto strides_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
        ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
    const auto layout_compiletime =
        ck::wrapper::make_layout(shape_compiletime, strides_compiletime);

    // Runtime variant.
    const auto shape_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto strides_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
    const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);

    // Types reported by the layouts, with any reference qualifier stripped.
    using ReportedCompiletimeShape =
        std::remove_reference_t<decltype(shape(layout_compiletime))>;
    using ReportedRuntimeShape = std::remove_reference_t<decltype(shape(layout_runtime))>;

    constexpr bool check_compiletime_shape =
        std::is_same_v<decltype(shape_compiletime), ReportedCompiletimeShape>;
    constexpr bool check_runtime_shape =
        std::is_same_v<decltype(shape_runtime), ReportedRuntimeShape>;

    EXPECT_TRUE(check_compiletime_shape);
    EXPECT_TRUE(check_runtime_shape);
}
|
||||
|
||||
// Exercises the multi-index forms rank<0, 0>, depth<0, 0>, size<0, 0> and
// get<0, 0, 0>, which address a nested element, on a shape tuple and on layouts.
TEST(TestLayoutHelpers, Hierarchical)
{
    // dims:(((2, 2), 3), (4, 3))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;

    const auto runtime_shape =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto compiletime_shape = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));

    const auto layout_runtime     = ck::wrapper::make_layout(runtime_shape);
    const auto layout_compiletime = ck::wrapper::make_layout(compiletime_shape);

    // Element <0, 0> is the innermost (d4, d3) pair: two entries ...
    EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);

    // ... one nesting level ...
    EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);

    // ... and d4 * d3 elements in total.
    EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);
    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);
    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);

    // Going one index deeper reaches the d4 length itself.
    EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
}
|
||||
115
test/wrapper/test_wrapper_partition.cpp
Normal file
115
test/wrapper/test_wrapper_partition.cpp
Normal file
@@ -0,0 +1,115 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
|
||||
// Builds a host tensor filled with 0, 1, 2, ... and, for each thread id, checks the
// size and the first two values of the partition returned by make_local_partition.
TEST(TestPartition, LocalPartition)
{
    const auto shape =
        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
    const auto strides =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);

    // Element value equals its position in the buffer.
    std::vector<ck::index_t> data(ck::wrapper::size(layout));
    std::iota(data.begin(), data.end(), 0);

    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    const auto thread_steps = ck::make_tuple(ck::Number<1>{}, ck::Number<8>{}, ck::Number<1>{});

    // row-major thread layout
    const auto thread_layout_shape =
        ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}, ck::Number<1>{});
    const auto thread_layout_strides =
        ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}, ck::Number<1>{});
    const auto thread_layout =
        ck::wrapper::make_layout(thread_layout_shape, thread_layout_strides);

    // 3d partition on 2d shape (calculate partition on 3d thread layout, and then skip first dim)
    const auto thread_projection =
        ck::make_tuple(ck::wrapper::slice(4), ck::Number<1>{}, ck::Number<1>{});
    constexpr ck::index_t projection_thread_length = ck::Number<4>{};

    // Thread count left after dividing out the projected (sliced) dimension.
    const auto projected_thread_count =
        ck::wrapper::size(thread_layout) / projection_thread_length;
    // The tensor is expected to be split evenly between those threads.
    const auto expected_partition_size = ck::wrapper::size(tensor) / projected_thread_count;

    for(ck::index_t thread_id = 0; thread_id < projected_thread_count; thread_id++)
    {
        const auto packed_partition =
            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_projection);

        const auto expected_partition_first_val = thread_id * ck::wrapper::size<1>(thread_steps);
        const auto expected_partition_second_val = expected_partition_first_val + 1;

        EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
        EXPECT_EQ(packed_partition(0), expected_partition_first_val);
        EXPECT_EQ(packed_partition(1), expected_partition_second_val);
    }
}
|
||||
|
||||
// Builds a host tensor filled with 0, 1, 2, ... and, for each block index of the
// grid, checks the size and the first two values of the tile returned by
// make_local_tile.
TEST(TestPartition, LocalTile)
{
    const auto shape   = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
    const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
    const auto layout  = ck::wrapper::make_layout(shape, strides);

    // Element value equals its position in the buffer.
    std::vector<ck::index_t> data(ck::wrapper::size(layout));
    std::iota(data.begin(), data.end(), 0);

    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    // 4d tile partitioning on 3d shape (calculate tile on 4d tile layout, and then skip last dim)
    const auto block_shape =
        ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{}, ck::Number<2>{});
    const auto block_projection =
        ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(2));
    constexpr ck::index_t projection_block_dim = ck::Number<2>{};

    // Number of blocks along each of the three tensor dimensions.
    const auto grid_shape =
        ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
                       ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
                       ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));

    // Every block coordinate of the grid; the fourth (projected) entry is always 0.
    std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t, ck::index_t>> block_idxs;
    for(int b0 = 0; b0 < ck::wrapper::size<0>(grid_shape); b0++)
    {
        for(int b1 = 0; b1 < ck::wrapper::size<1>(grid_shape); b1++)
        {
            for(int b2 = 0; b2 < ck::wrapper::size<2>(grid_shape); b2++)
            {
                block_idxs.emplace_back(b0, b1, b2, 0);
            }
        }
    }

    const auto expected_tile_size = ck::wrapper::size(block_shape) / projection_block_dim;

    for(auto block_idx : block_idxs)
    {
        const auto packed_tile =
            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_projection);

        // Offset of the tile origin: block index * block length * stride, per dimension.
        const auto origin_dim2 = ck::wrapper::size<2>(block_idx) *
                                 ck::wrapper::size<2>(block_shape) *
                                 ck::wrapper::size<2>(strides);
        const auto origin_dim1 = ck::wrapper::size<1>(block_idx) *
                                 ck::wrapper::size<1>(block_shape) *
                                 ck::wrapper::size<1>(strides);
        const auto origin_dim0 = ck::wrapper::size<0>(block_idx) *
                                 ck::wrapper::size<0>(block_shape) *
                                 ck::wrapper::size<0>(strides);

        const auto expected_tile_first_val  = origin_dim2 + origin_dim1 + origin_dim0;
        const auto expected_tile_second_val = expected_tile_first_val + 1;

        EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
        EXPECT_EQ(packed_tile(0), expected_tile_first_val);
        EXPECT_EQ(packed_tile(1), expected_tile_second_val);
    }
}
|
||||
209
test/wrapper/test_wrapper_tensor.cpp
Normal file
209
test/wrapper/test_wrapper_tensor.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
|
||||
// Compare data in tensor with offset from layout.
|
||||
// Data and offset should match if physical memory has been initialized with
|
||||
// sequentially increasing values from 0.
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
|
||||
{
|
||||
const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
|
||||
if(tensor(idx) != layout(idx))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
|
||||
{
|
||||
if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
bool success = true;
|
||||
ck::static_for<0, nelems, 1>{}([&](auto w) {
|
||||
if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
|
||||
{
|
||||
success = false;
|
||||
}
|
||||
});
|
||||
return success;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ void InitTensor(TensorType& tensor)
|
||||
{
|
||||
for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
|
||||
{
|
||||
tensor(i) = i;
|
||||
}
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ void StaticInitTensor(TensorType& tensor)
|
||||
{
|
||||
|
||||
ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
|
||||
}
|
||||
|
||||
// Tests
|
||||
// Wraps a host array in a Generic-memory tensor, fills it through the tensor and
// verifies the content with both the linear and the nested index checks.
TEST(TestTensor, ReadWriteHostMemory)
{
    constexpr ck::index_t nelems = 8;

    // Backing storage; every element is written by InitTensor before being read.
    std::array<ck::index_t, nelems> data;

    const auto shape  = ck::make_tuple(ck::make_tuple(2, 2), 2);
    const auto layout = ck::wrapper::make_layout(shape);
    auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    InitTensor(tensor);

    EXPECT_TRUE(TestTensorCheck1d(tensor));
    EXPECT_TRUE(TestTensorCheck3d(tensor));
}
|
||||
|
||||
// Kernel that creates a Global, an Lds and a Vgpr tensor, initializes all three and
// runs the read-back checks on each of them.
//
// data    - buffer holding at least 8 ck::index_t values; backs the Global tensor
// success - points to a bool that receives true only if every check passed
__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
    constexpr ck::index_t nelems = 8;
    // Backing storage for the Lds tensor.
    __shared__ ck::index_t p_shared[nelems];

    ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
    bool* casted_success_ptr     = static_cast<bool*>(success);

    // Global and Lds tensors share a nested ((2, 2), 2) runtime layout.
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    // The register tensor uses a flat compile-time layout of nelems elements with stride 1.
    constexpr auto vgpr_layout =
        ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));

    auto tensor_global =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            vgpr_layout);

    InitTensor(tensor_global);
    InitTensor(tensor_lds);
    StaticInitTensor<nelems>(tensor_vgpr);

    // Accumulate all check results locally and publish the verdict once at the end.
    bool all_checks_passed = TestTensorCheck1d(tensor_global);
    all_checks_passed &= TestTensorCheck3d(tensor_global);

    all_checks_passed &= TestTensorCheck1d(tensor_lds);
    all_checks_passed &= TestTensorCheck3d(tensor_lds);

    all_checks_passed &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);

    *casted_success_ptr = all_checks_passed;
}
|
||||
|
||||
// Launches TestTensorReadWriteDevice with dim3(1), dim3(1) and expects the kernel to
// report success for its global, LDS and register tensor checks.
//
// The device-side success flag is seeded with `false` before the launch: if the
// kernel never runs or never writes the flag, the test fails deterministically
// instead of reading back an indeterminate bool.
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
    constexpr ck::index_t nelems = 8;
    // Value-initialized so the upload below does not copy indeterminate host memory.
    std::array<ck::index_t, nelems> host_data{};

    DeviceMem data_buf(nelems * sizeof(ck::index_t));
    data_buf.ToDevice(host_data.data());

    bool success = false;
    DeviceMem success_buf(sizeof(bool));
    // Give the device flag a defined initial value.
    success_buf.ToDevice(&success);

    launch_and_time_kernel(StreamConfig{},
                           TestTensorReadWriteDevice,
                           dim3(1),
                           dim3(1),
                           0,
                           data_buf.GetDeviceBuffer(),
                           success_buf.GetDeviceBuffer());

    success_buf.FromDevice(&success);
    EXPECT_TRUE(success);
}
|
||||
|
||||
// Slices a ((2, 2), 2) host tensor with explicit strides ((1, 2), 4) in several ways
// and, for each resulting view, checks its first element against the parent layout,
// its rank/depth/size, and its contents via TestTensorCheck1d.
TEST(TestTensor, Slicing)
{
    constexpr ck::index_t nelems = 8;
    std::array<ck::index_t, nelems> storage;

    const auto dims   = ck::make_tuple(ck::make_tuple(2, 2), 2);
    const auto steps  = ck::make_tuple(ck::make_tuple(1, 2), 4);
    const auto layout = ck::wrapper::make_layout(dims, steps);
    auto base =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(storage.data(), layout);
    InitTensor(base);

    // Slice on every dimension: the view is expected to keep all 8 elements.
    auto view_2x2x2 =
        base(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(view_2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(view_2x2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(view_2x2x2), 2);
    EXPECT_EQ(ck::wrapper::size(view_2x2x2), 8);
    EXPECT_TRUE(TestTensorCheck1d(view_2x2x2));

    // First nested coordinate pinned to 1, the other two sliced: 4 elements expected.
    auto view_2x2 = base(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(view_2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(view_2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(view_2x2), 2);
    EXPECT_EQ(ck::wrapper::size(view_2x2), 4);
    EXPECT_TRUE(TestTensorCheck1d(view_2x2));

    // Both sliced dimensions restricted to slice(1, 2): a single element expected.
    auto view_1x1 = base(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
    EXPECT_EQ(view_1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1)));
    EXPECT_EQ(rank(view_1x1), 2);
    EXPECT_EQ(depth(view_1x1), 2);
    EXPECT_EQ(size(view_1x1), 1);
    EXPECT_TRUE(TestTensorCheck1d(view_1x1));

    // Whole nested dimension pinned to (1, 1); only the last dimension is sliced.
    auto view_2 = base(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
    EXPECT_EQ(view_2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0)));
    EXPECT_EQ(ck::wrapper::rank(view_2), 1);
    EXPECT_EQ(ck::wrapper::depth(view_2), 1);
    EXPECT_EQ(ck::wrapper::size(view_2), 2);
    EXPECT_TRUE(TestTensorCheck1d(view_2));

    // Same idea, but the nested dimension is pinned with a single integer index.
    auto view_2_flat = base(2, ck::wrapper::slice(0, 2));
    EXPECT_EQ(view_2_flat(0), layout(ck::make_tuple(2, 0)));
    EXPECT_EQ(ck::wrapper::rank(view_2_flat), 1);
    EXPECT_EQ(ck::wrapper::depth(view_2_flat), 1);
    EXPECT_EQ(ck::wrapper::size(view_2_flat), 2);
    EXPECT_TRUE(TestTensorCheck1d(view_2_flat));

    // negative indexing
    auto view_1x2 = base(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
    EXPECT_EQ(view_1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(rank(view_1x2), 2);
    EXPECT_EQ(depth(view_1x2), 2);
    EXPECT_EQ(size(view_1x2), 2);
    EXPECT_TRUE(TestTensorCheck1d(view_1x2));
}
|
||||
Reference in New Issue
Block a user