mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 10:59:55 +00:00
[CK_TILE] Support for elementwise kernel (#2246)
* Elementwise kernel implementation
Co-authored-by: Sami Aario <samaario@amd.com>
Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
Co-authored-by: yashagar <yashagar@amd.com>
* Elementwise with generalized nDims
* Adding the n-ary input tensor feature
* Generalize dimensions on top of inputs
* Add TFLOPS + remove std usage for tuples
* 1D basecase optimization
* Cleanup code + refactoring to a common interface
* Generalize to unary and add an example
* Cleanup, refactoring and commenting
* Suggestions for LWPCK-3170: elementwise kernel improvements
* Clang-format: remod.py
* Replace InputTensorType with XDataType as the type of input_tensors
* Add Tuple::apply and use it in ElementWiseKernel::operator to call operation with the exact number of arguments in xs
* Move examples to folder 19_elementwise
* Add missing copyright headers and fix some existing ones
* Replace an assert with throw std::runtime_error in elementwise example
* Avoid reading the output by using make_static_distributed_tensor for y_tile
* Removed two unused includes
* No need to move windows to the next block when each workgroup processes a single tile
* Only copy input tensors to the device
* Use get_warp_size to obtain warp size, and use ceiling division for grid size also for the unary example
* Adding output strides to the kernel, transposition example and update the other examples
* Changes made by remod.py
* Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view
* Move binary operations to include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
* Reuse generic reference binary/unary operation in examples + refactoring the transpose reference
* Fix comments in elementwise_example.cpp
- Refer to AMD terminology except when suggesting NVIDIA alternatives in parentheses
- ElementWiseTraits was renamed to ElementWiseShape
- Adopt suggestions made by Copilot when prompted to check for factual or typographical errors
* Simplify CMakeLists.txt and remove the unused variables this uncovers
* Rename a file and fix some copyright statements
* Changes made by script/clang-format-overwrite.sh
* Add basic unit test for ElementWiseKernel
* Remove left-over uninformative comment in apply unit test
* Changes made by clang-format-overwrite.sh
* fixup! Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view
* Clean up test_tuple_apply.cpp and test_elementwise_1d.cpp
* Use make_uniform_array_with_factory to define h_xs and d_xs_mems_owner as type std::array
* Use a DeviceMem constructor that calls get_element_space_size_in_bytes internally
* Move examples to folder 20_elementwise
* Reduced register pressure on the CK tile elementwise kernel + add 4d input example to be able benchmark against old CK
* Fix CLang formating
* Bump up the elementwise example folder number
* Elementwise: add padding + minor cleanup
* Add Vector Size inference + fix issue with wrong vectorization due to missing GuaranteedLastDimensionVectorStride setting in make_naive_tensor_view
* Add isSupportedArg to Elementwise kernel + addapt example and unit tests
* Fix clang-format on the unit test file
---------
Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
Co-authored-by: Sami Aario <samaario@amd.com>
Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
Co-authored-by: Aviral Goel <aviral.goel@amd.com>
[ROCm/composable_kernel commit: 606b0cc947]
This commit is contained in:
committed by
GitHub
parent
bdb86fee78
commit
094e5bad50
@@ -264,10 +264,14 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
|
||||
|
||||
#define TP_COM_() static_assert(I < size(), "wrong! out of range")
|
||||
// clang-format off
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() const { TP_COM_(); return impl::getv<I>(*this); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) const { TP_COM_(); return get<I>(); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() { TP_COM_(); return impl::getv<I>(*this); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) { TP_COM_(); return get<I>(); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() const & { TP_COM_(); return impl::getv<I>(*this); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) const & { TP_COM_(); return get<I>(); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() & { TP_COM_(); return impl::getv<I>(*this); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) & { TP_COM_(); return get<I>(); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() && { TP_COM_(); return impl::getv<I>(std::move(*this)); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) && { TP_COM_(); return std::move(*this).template get<I>(); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() const && { TP_COM_(); return impl::getv<I>(std::move(*this)); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) const &&{ TP_COM_(); return std::move(*this).template get<I>(); }
|
||||
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at() const { TP_COM_(); return impl::getv<I>(*this); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at(number<I>) const { TP_COM_(); return get<I>(); }
|
||||
@@ -470,6 +474,12 @@ transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, sequence<Is...>)
|
||||
return make_tuple(f(x.at(number<Is>{}), y.at(number<Is>{}), z.at(number<Is>{}))...);
|
||||
}
|
||||
|
||||
template <typename F, typename Tuple, index_t... Is>
|
||||
constexpr decltype(auto) apply_impl(F&& f, Tuple&& t, sequence<Is...>)
|
||||
{
|
||||
return std::forward<F>(f)(std::forward<Tuple>(t).get(number<Is>{})...);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename F, typename X>
|
||||
@@ -493,6 +503,13 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tuples(F f, const X& x, const Y& y,
|
||||
f, x, y, z, typename arithmetic_sequence_gen<0, X::size(), 1>::type{});
|
||||
}
|
||||
|
||||
template <typename F, typename Tuple>
|
||||
constexpr decltype(auto) apply(F&& f, Tuple&& t)
|
||||
{
|
||||
constexpr index_t N = std::decay_t<Tuple>::size();
|
||||
return detail::apply_impl(std::forward<F>(f), std::forward<Tuple>(t), make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename F, typename X, index_t... Is>
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
#include "ck_tile/host/reference/reference_rowwise_quantization2d.hpp"
|
||||
#include "ck_tile/host/reference/reference_softmax.hpp"
|
||||
#include "ck_tile/host/reference/reference_topk.hpp"
|
||||
#include "ck_tile/host/reference/reference_transpose.hpp"
|
||||
#include "ck_tile/host/rotating_buffers.hpp"
|
||||
#include "ck_tile/host/stream_config.hpp"
|
||||
#include "ck_tile/host/stream_utils.hpp"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
|
||||
33
include/ck_tile/host/reference/reference_transpose.hpp
Normal file
33
include/ck_tile/host/reference/reference_transpose.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ADataType, typename BDataType>
|
||||
void reference_transpose_elementwise(const HostTensor<ADataType>& a, HostTensor<BDataType>& b)
|
||||
{
|
||||
ck_tile::index_t M = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[0]);
|
||||
ck_tile::index_t N = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[1]);
|
||||
|
||||
// Ensure the b tensor is sized correctly for N x M
|
||||
if(static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[0]) != N ||
|
||||
static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[1]) != M)
|
||||
{
|
||||
throw std::runtime_error("Output tensor b has incorrect dimensions for transpose.");
|
||||
}
|
||||
|
||||
auto f = [&](auto i, auto j) {
|
||||
auto v_a = a(i, j);
|
||||
b(j, i) = ck_tile::type_convert<BDataType>(v_a);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, M, N)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -3,6 +3,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ops/elementwise/binary_elementwise_operation.hpp"
|
||||
#include "ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp"
|
||||
#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_default_policy.hpp"
|
||||
#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_problem.hpp"
|
||||
#include "ck_tile/ops/elementwise/pipeline/elementwise_shape.hpp"
|
||||
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
|
||||
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
namespace element_wise {
|
||||
|
||||
struct Add
|
||||
{
|
||||
template <typename Y, typename X0, typename X1>
|
||||
__host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<float>(float& y, const float& x0, const float& x1) const
|
||||
{
|
||||
y = x0 + x1;
|
||||
};
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<double>(double& y, const double& x0, const double& x1) const
|
||||
{
|
||||
y = x0 + x1;
|
||||
};
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<float>(float& y, const float& x0, const half_t& x1) const
|
||||
{
|
||||
y = x0 + type_convert<half_t>(x1);
|
||||
};
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<half_t>(half_t& y, const float& x0, const float& x1) const
|
||||
{
|
||||
y = type_convert<half_t>(x0 + x1);
|
||||
};
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<half_t>(half_t& y, const float& x0, const half_t& x1) const
|
||||
{
|
||||
y = type_convert<half_t>(x0) + x1;
|
||||
};
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
|
||||
{
|
||||
y = x0 + x1;
|
||||
};
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<float>(float& y, const float& x0, const bf16_t& x1) const
|
||||
{
|
||||
const float x1_tmp = type_convert<float>(x1);
|
||||
y = x0 + x1_tmp;
|
||||
}
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<bf16_t>(bf16_t& y, const bf16_t& x0, const bf16_t& x1) const
|
||||
{
|
||||
const float x1_tmp = type_convert<float>(x0);
|
||||
const float x2_tmp = type_convert<float>(x1);
|
||||
const float y_tmp = x1_tmp + x2_tmp;
|
||||
y = type_convert<bf16_t>(y_tmp);
|
||||
}
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<bf16_t>(bf16_t& y, const float& x0, const bf16_t& x1) const
|
||||
{
|
||||
const float x2_tmp = type_convert<float>(x1);
|
||||
const float y_tmp = x0 + x2_tmp;
|
||||
y = type_convert<bf16_t>(y_tmp);
|
||||
}
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<int8_t>(int8_t& y, const int8_t& x0, const int8_t& x1) const
|
||||
{
|
||||
y = x0 + x1;
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace element_wise
|
||||
} // namespace ck_tile
|
||||
123
include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
Normal file
123
include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
Normal file
@@ -0,0 +1,123 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/ops/common.hpp"
|
||||
#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_problem.hpp"
|
||||
#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_default_policy.hpp"
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename Problem_, typename Policy_>
|
||||
struct ElementWiseKernel
|
||||
{
|
||||
using Problem = ck_tile::remove_cvref_t<Problem_>;
|
||||
using Policy = ck_tile::remove_cvref_t<Policy_>;
|
||||
|
||||
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
|
||||
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using YDataType = ck_tile::remove_cvref_t<typename Problem::YDataType>;
|
||||
using ElementWiseOperation = ck_tile::remove_cvref_t<typename Problem::ElementWiseOperation>;
|
||||
|
||||
template <typename... XDataType, typename Dims>
|
||||
CK_TILE_DEVICE void operator()(Dims lens,
|
||||
Dims input_strides,
|
||||
Dims output_strides,
|
||||
const tuple<XDataType...>& input_tensors,
|
||||
YDataType* p_y) const
|
||||
{
|
||||
using S = typename Problem::BlockShape;
|
||||
|
||||
// Setup block-level coordinates and transforms
|
||||
const index_t iM = get_block_id() * S::kBlockM;
|
||||
const auto merge_transform = make_merge_transform(lens);
|
||||
|
||||
// Load all input tiles into registers.
|
||||
// The lambda structure here is intended to minimize the lifetime
|
||||
// of intermediate objects (views, windows) used for loading.
|
||||
const auto x_tiles = ck_tile::generate_tuple(
|
||||
[&](auto i) {
|
||||
const auto tensor_view = make_naive_tensor_view<address_space_enum::global>(
|
||||
input_tensors.get(i), lens, input_strides, number<S::kVectorM>{}, number<1>{});
|
||||
|
||||
const auto transformed_tensor = pad_tensor_view(
|
||||
transform_tensor_view(tensor_view,
|
||||
ck_tile::make_tuple(merge_transform),
|
||||
ck_tile::make_tuple(make_index_sequence<Dims::size()>{}),
|
||||
ck_tile::make_tuple(sequence<0>{})),
|
||||
ck_tile::make_tuple(number<S::kBlockM>{}),
|
||||
sequence<Problem::kPad>{});
|
||||
|
||||
const auto x_window =
|
||||
make_tile_window(transformed_tensor,
|
||||
ck_tile::make_tuple(number<S::kBlockM>{}),
|
||||
{iM},
|
||||
Policy::template MakeXBlockTileDistribution<Problem>());
|
||||
|
||||
return load_tile(x_window);
|
||||
},
|
||||
number<sizeof...(XDataType)>{});
|
||||
|
||||
// Setup output tile in registers.
|
||||
const auto& x_tile0 = x_tiles.get(number<0>{});
|
||||
auto y_tile = make_static_distributed_tensor<YDataType>(x_tile0.get_tile_distribution());
|
||||
|
||||
// Perform element-wise computation.
|
||||
const auto spans = x_tile0.get_distributed_spans();
|
||||
sweep_tile_span(spans[number<0>{}], [&](auto idx) {
|
||||
const auto tile_idx = make_tuple(idx);
|
||||
apply(
|
||||
[&](auto&&... tiles) {
|
||||
ElementWiseOperation{}(y_tile(tile_idx),
|
||||
type_convert<ComputeDataType>(tiles[tile_idx])...);
|
||||
},
|
||||
x_tiles);
|
||||
});
|
||||
|
||||
// Setup output window and store the result tile.
|
||||
const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
|
||||
p_y, lens, output_strides, number<S::kVectorM>{});
|
||||
|
||||
const auto transformed_y_m_n = pad_tensor_view(
|
||||
transform_tensor_view(y_m_n,
|
||||
ck_tile::make_tuple(merge_transform),
|
||||
ck_tile::make_tuple(make_index_sequence<Dims::size()>{}),
|
||||
ck_tile::make_tuple(sequence<0>{})),
|
||||
ck_tile::make_tuple(number<S::kBlockM>{}),
|
||||
sequence<Problem::kPad>{});
|
||||
|
||||
auto y_window = make_tile_window(transformed_y_m_n,
|
||||
make_tuple(number<S::kBlockM>{}),
|
||||
{iM},
|
||||
y_tile.get_tile_distribution());
|
||||
|
||||
store_tile(y_window, cast_tile<YDataType>(y_tile));
|
||||
}
|
||||
|
||||
template <typename... Ints>
|
||||
CK_TILE_HOST static bool IsSupportedArgument(const ck_tile::tuple<Ints...>& input_sizes)
|
||||
{
|
||||
int total_elements = 1;
|
||||
const auto kVectorM = Problem_::BlockShape::kVectorM;
|
||||
|
||||
apply([&](auto&&... args) { ((total_elements *= args), ...); }, input_sizes);
|
||||
|
||||
if((total_elements % kVectorM) != 0)
|
||||
{
|
||||
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
|
||||
{
|
||||
CK_TILE_ERROR("Conditions not met: total number of input elements (",
|
||||
total_elements,
|
||||
") should be multiple of the vectorization size (",
|
||||
kVectorM,
|
||||
")");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
struct ElementWiseDefaultPolicy
|
||||
{
|
||||
template <typename Problem>
|
||||
CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
|
||||
{
|
||||
using S = typename Problem::BlockShape;
|
||||
return make_static_tile_distribution(
|
||||
tile_distribution_encoding<sequence<>, // Replicate
|
||||
tuple<sequence<S::kRepeatM,
|
||||
S::kWarpPerBlockM,
|
||||
S::kThreadPerWarpM,
|
||||
S::kVectorM>>, // Hierarchical
|
||||
tuple<sequence<1>, sequence<1>>, // Parallel
|
||||
tuple<sequence<1>, sequence<2>>, // Parallel
|
||||
sequence<1, 1>, // Yield
|
||||
sequence<0, 3>>{} // Yield
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/utility/type_traits.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename XDataType_,
|
||||
typename ComputeDataType_,
|
||||
typename YDataType_,
|
||||
typename BlockShape_,
|
||||
typename ElementWiseOperation_,
|
||||
bool kPad_ = true>
|
||||
struct ElementWisePipelineProblem
|
||||
{
|
||||
using XDataType = remove_cvref_t<XDataType_>;
|
||||
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
|
||||
using YDataType = remove_cvref_t<YDataType_>;
|
||||
using BlockShape = remove_cvref_t<BlockShape_>;
|
||||
using ElementWiseOperation = remove_cvref_t<ElementWiseOperation_>;
|
||||
static constexpr bool kPad = kPad_;
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core/utility/type_traits.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename BlockWarps, typename BlockTile, typename WarpTile, typename ComputeDataType>
|
||||
struct ElementWiseShape
|
||||
{
|
||||
static constexpr index_t kBlockM = BlockTile::at(number<0>{});
|
||||
|
||||
static constexpr index_t kWarpM = WarpTile::at(number<0>{});
|
||||
|
||||
static constexpr index_t kVectorM = 16 / sizeof(ComputeDataType);
|
||||
|
||||
static constexpr index_t kWarpPerBlockM = BlockWarps::at(number<0>{});
|
||||
|
||||
static constexpr index_t kThreadPerWarpM = kWarpM / kVectorM;
|
||||
|
||||
static constexpr index_t kRepeatM = kBlockM / (kWarpPerBlockM * kWarpM);
|
||||
|
||||
static constexpr index_t kBlockSize =
|
||||
ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
|
||||
Reference in New Issue
Block a user