mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
Add optimized copy to ck wrapper (#1126)
* Add optimized copy to ck wrapper * Example optimizations * Fixes * Move img2col test to client example * Refactor example * Fix docs * Fixes * Fix * Fixes * Fixes * Fixes * Fixes * Fixes --------- Co-authored-by: zjing14 <zhangjing14@gmail.com>
This commit is contained in:
@@ -21,49 +21,59 @@ template <typename InputTensor,
|
||||
typename OutputTensor,
|
||||
typename BlockShape,
|
||||
typename ThreadLayoutShape,
|
||||
typename LocalTileSteps,
|
||||
typename LocalPartitionSteps>
|
||||
bool UseOptimizedCopy>
|
||||
__global__ void TestCopyDevice(const InputTensor input_tensor,
|
||||
OutputTensor output_tensor,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayoutShape thread_layout,
|
||||
const LocalTileSteps block_steps,
|
||||
const LocalPartitionSteps thread_steps)
|
||||
const ThreadLayoutShape thread_layout)
|
||||
{
|
||||
__shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
|
||||
auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
p_shared, ck::wrapper::make_layout(tile_shape));
|
||||
|
||||
const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x);
|
||||
const auto block_idx = static_cast<ck::index_t>(blockIdx.x);
|
||||
|
||||
// Get local tiles for global memory
|
||||
const auto input_local_tile =
|
||||
ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps);
|
||||
const auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
|
||||
const auto output_local_tile =
|
||||
ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps);
|
||||
ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);
|
||||
|
||||
// Get partition per thread
|
||||
const auto input_local_partition = ck::wrapper::make_local_partition(
|
||||
input_local_tile, thread_layout, threadIdx.x, thread_steps);
|
||||
const auto input_local_partition =
|
||||
ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
|
||||
auto lds_local_partition =
|
||||
ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps);
|
||||
auto output_local_partition = ck::wrapper::make_local_partition(
|
||||
output_local_tile, thread_layout, threadIdx.x, thread_steps);
|
||||
ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
|
||||
auto output_local_partition =
|
||||
ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
|
||||
|
||||
// Allocate VGPR
|
||||
constexpr ck::index_t scalar_per_vector = 1;
|
||||
constexpr ck::index_t vgpr_size = ck::wrapper::size(lds_local_partition);
|
||||
auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
|
||||
vgpr_size,
|
||||
scalar_per_vector,
|
||||
ck::index_t>();
|
||||
auto tensor_vgpr =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
|
||||
layout(lds_local_partition));
|
||||
|
||||
// Perform copy
|
||||
ck::wrapper::copy(input_local_partition, lds_local_partition);
|
||||
ck::wrapper::copy(lds_local_partition, tensor_vgpr);
|
||||
ck::wrapper::copy(tensor_vgpr, output_local_partition);
|
||||
if constexpr(UseOptimizedCopy)
|
||||
{
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>>;
|
||||
constexpr ck::index_t vector_dim = 0;
|
||||
constexpr ck::index_t scalar_per_vector = 2;
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
|
||||
lds_local_partition);
|
||||
// TODO: Enable optimized copy for static buffers
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
|
||||
tensor_vgpr);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
|
||||
output_local_partition);
|
||||
}
|
||||
else
|
||||
{
|
||||
ck::wrapper::copy(input_local_partition, lds_local_partition);
|
||||
ck::wrapper::copy(lds_local_partition, tensor_vgpr);
|
||||
ck::wrapper::copy(tensor_vgpr, output_local_partition);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool UseOptimizedCopy>
|
||||
void PerformCopyGlobalToGlobalViaLDS()
|
||||
{
|
||||
const auto shape =
|
||||
@@ -89,15 +99,8 @@ void PerformCopyGlobalToGlobalViaLDS()
|
||||
auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);
|
||||
|
||||
const auto thread_layout =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{});
|
||||
const auto tile_shape =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{});
|
||||
|
||||
const auto thread_steps =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{});
|
||||
const auto block_steps =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{});
|
||||
const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{});
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});
|
||||
|
||||
const ck::index_t grid_size = ck::math::integer_divide_ceil(
|
||||
ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
|
||||
@@ -106,8 +109,7 @@ void PerformCopyGlobalToGlobalViaLDS()
|
||||
decltype(output_tensor_global),
|
||||
decltype(tile_shape),
|
||||
decltype(thread_layout),
|
||||
decltype(block_steps),
|
||||
decltype(thread_steps)>;
|
||||
UseOptimizedCopy>;
|
||||
launch_and_time_kernel(StreamConfig{},
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
@@ -116,9 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS()
|
||||
input_tensor_global,
|
||||
output_tensor_global,
|
||||
tile_shape,
|
||||
thread_layout,
|
||||
block_steps,
|
||||
thread_steps);
|
||||
thread_layout);
|
||||
|
||||
// Verify results
|
||||
std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
|
||||
@@ -126,4 +126,5 @@ void PerformCopyGlobalToGlobalViaLDS()
|
||||
EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
|
||||
}
|
||||
|
||||
TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); }
|
||||
TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS<false>(); }
|
||||
TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS<true>(); }
|
||||
|
||||
@@ -29,42 +29,29 @@ TEST(TestPartition, LocalPartition)
|
||||
const auto tensor =
|
||||
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
|
||||
|
||||
const auto thread_steps =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{});
|
||||
const auto thread_layout =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{});
|
||||
|
||||
for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
|
||||
{
|
||||
const auto raked_partition =
|
||||
ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
|
||||
|
||||
const auto expected_partition_size =
|
||||
ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
|
||||
EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size);
|
||||
EXPECT_EQ(raked_partition(0), thread_id);
|
||||
}
|
||||
const auto thread_steps = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});
|
||||
const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<1>{});
|
||||
|
||||
for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
|
||||
{
|
||||
const auto packed_partition =
|
||||
ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps);
|
||||
ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
|
||||
|
||||
const auto expected_partition_size =
|
||||
ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
|
||||
const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps);
|
||||
const auto expected_partition_first_val = thread_id * ck::wrapper::size<0>(thread_steps);
|
||||
const auto expected_partition_second_val = expected_partition_first_val + 1;
|
||||
EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
|
||||
EXPECT_EQ(packed_partition(0), expected_partition_first_val);
|
||||
EXPECT_EQ(packed_partition(1), expected_partition_second_val);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TestPartition, LocalTile)
|
||||
{
|
||||
const auto shape =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
|
||||
const auto strides =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
|
||||
const auto layout = ck::wrapper::make_layout(shape, strides);
|
||||
const auto shape = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
|
||||
const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
|
||||
const auto layout = ck::wrapper::make_layout(shape, strides);
|
||||
|
||||
std::vector<ck::index_t> data(ck::wrapper::size(layout));
|
||||
std::iota(data.begin(), data.end(), 0);
|
||||
@@ -72,48 +59,34 @@ TEST(TestPartition, LocalTile)
|
||||
const auto tensor =
|
||||
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
|
||||
|
||||
const auto block_steps =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
|
||||
const auto block_shape =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
|
||||
const auto block_layout =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
|
||||
const auto block_shape = ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{});
|
||||
const auto num_blocks =
|
||||
ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
|
||||
ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
|
||||
ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));
|
||||
std::vector<ck::index_t> block_idxs(ck::wrapper::size(num_blocks));
|
||||
std::iota(block_idxs.begin(), block_idxs.end(), 0);
|
||||
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> block_idxs;
|
||||
for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++)
|
||||
for(auto block_idx : block_idxs)
|
||||
{
|
||||
for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++)
|
||||
{
|
||||
for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++)
|
||||
{
|
||||
block_idxs.emplace_back(ck::make_tuple(x, y), z);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto& block_idx : block_idxs)
|
||||
{
|
||||
const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
|
||||
const auto packed_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
|
||||
|
||||
const auto expected_tile_size = ck::wrapper::size(block_shape);
|
||||
EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size);
|
||||
EXPECT_EQ(raked_tile(0), layout(block_idx));
|
||||
}
|
||||
auto expected_tile_first_val = (block_idx % ck::wrapper::size<2>(num_blocks)) *
|
||||
ck::wrapper::size<2>(block_shape) *
|
||||
ck::wrapper::size<2>(strides);
|
||||
block_idx /= ck::wrapper::size<2>(num_blocks);
|
||||
expected_tile_first_val += (block_idx % ck::wrapper::size<1>(num_blocks)) *
|
||||
ck::wrapper::size<1>(block_shape) *
|
||||
ck::wrapper::size<1>(strides);
|
||||
block_idx /= ck::wrapper::size<1>(num_blocks);
|
||||
expected_tile_first_val += (block_idx % ck::wrapper::size<0>(num_blocks)) *
|
||||
ck::wrapper::size<0>(block_shape) *
|
||||
ck::wrapper::size<0>(strides);
|
||||
|
||||
for(const auto& block_idx : block_idxs)
|
||||
{
|
||||
const auto packed_tile =
|
||||
ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps);
|
||||
|
||||
const auto expected_tile_size = ck::wrapper::size(block_shape);
|
||||
const auto expected_tile_first_val =
|
||||
ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) *
|
||||
ck::wrapper::size<0, 0>(strides) +
|
||||
ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) *
|
||||
ck::wrapper::size<0, 1>(strides) +
|
||||
ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) *
|
||||
ck::wrapper::size<1>(strides);
|
||||
const auto expected_tile_second_val = expected_tile_first_val + 1;
|
||||
EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
|
||||
EXPECT_EQ(packed_tile(0), expected_tile_first_val);
|
||||
EXPECT_EQ(packed_tile(1), expected_tile_second_val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
@@ -100,31 +100,26 @@ TEST(TestTensor, ReadWriteHostMemory)
|
||||
|
||||
__global__ void TestTensorReadWriteDevice(void* data, void* success)
|
||||
{
|
||||
constexpr ck::index_t nelems = 8;
|
||||
constexpr ck::index_t scalar_per_vector = 1;
|
||||
constexpr ck::index_t nelems = 8;
|
||||
__shared__ ck::index_t p_shared[nelems];
|
||||
|
||||
ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
|
||||
bool* casted_success_ptr = static_cast<bool*>(success);
|
||||
|
||||
const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
|
||||
constexpr auto vgpr_layout =
|
||||
ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));
|
||||
|
||||
auto tensor_global =
|
||||
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
|
||||
auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
|
||||
auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
|
||||
nelems,
|
||||
scalar_per_vector,
|
||||
ck::index_t>();
|
||||
auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
|
||||
nelems,
|
||||
scalar_per_vector,
|
||||
ck::index_t>();
|
||||
auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
|
||||
auto tensor_vgpr =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
|
||||
vgpr_layout);
|
||||
|
||||
InitTensor(tensor_global);
|
||||
InitTensor(tensor_lds);
|
||||
StaticInitTensor<nelems>(tensor_vgpr);
|
||||
StaticInitTensor<nelems>(tensor_sgpr);
|
||||
|
||||
*casted_success_ptr = TestTensorCheck1d(tensor_global);
|
||||
*casted_success_ptr &= TestTensorCheck3d(tensor_global);
|
||||
@@ -133,8 +128,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
|
||||
*casted_success_ptr &= TestTensorCheck3d(tensor_lds);
|
||||
|
||||
*casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
|
||||
|
||||
*casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);
|
||||
}
|
||||
|
||||
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
|
||||
|
||||
Reference in New Issue
Block a user