Add optimized copy to ck wrapper (#1126)

* Add optimized copy to ck wrapper * Example optimizations * Fixes * Move img2col test to client example * Refactor example * Fix docs * Fixes * Fix * Fixes * Fixes * Fixes * Fixes * Fixes --------- Co-authored-by: zjing14 <zhangjing14@gmail.com>
2026-04-20 06:49:15 +00:00 · 2024-01-19 11:29:00 +01:00
parent 38882d8ab5
commit 7e4eb4b800
17 changed files with 1109 additions and 865 deletions
--- a/include/ck/wrapper/utils/layout_utils.hpp
+++ b/include/ck/wrapper/utils/layout_utils.hpp
@@ -22,14 +22,19 @@ namespace wrapper {
 // Disable from doxygen docs generation
 /// @cond
 // forward declaration
-template <typename Shape, typename UnnestedDescriptorType>
+template <typename Shape, typename UnrolledDescriptorType>
 struct Layout;

 template <typename T>
 using is_tuple = decltype(std::declval<T&>().IsTuple());

 namespace {
-// Generate packed (column-major) strides if not passed
+/**
+ * \brief Generate packed (column-major) strides if not passed
+ *
+ * \param shape Tensor shape.
+ * \return Generated column-major strides.
+ */
 template <typename... Ts>
 __host__ __device__ constexpr static auto
 GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
@@ -50,9 +55,16 @@ GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
        Number<decltype(unrolled_shape)::Size()>{});
 }

+/**
+ * \brief Create naive tensor descriptor from nested shape.
+ *
+ * \param shape Tensor shape.
+ * \param strides Tensor strides.
+ * \return Unrolled descriptor
+ */
 template <typename LayoutShape, typename LayoutStrides>
-__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape,
-                                                         const LayoutStrides& strides)
+__host__ __device__ constexpr auto MakeUnrolledDescriptor(const LayoutShape& shape,
+                                                          const LayoutStrides& strides)
 {
    const auto unrolled_shape = UnrollNestedTuple(shape);
    if constexpr(is_same_v<LayoutStrides, Tuple<>>)
@@ -86,8 +98,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap
 template <typename Shape, typename Strides>
 __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides)
 {
-    using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{}));
-    return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, strides));
+    using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Strides{}));
+    return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, strides));
 }

 /**
@@ -100,15 +112,19 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides
 template <typename Shape>
 __host__ __device__ constexpr auto make_layout(const Shape& shape)
 {
-    using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{}));
-    return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, Tuple<>{}));
+    using UnrolledDescriptorType = decltype(MakeUnrolledDescriptor(Shape{}, Tuple<>{}));
+    return Layout<Shape, UnrolledDescriptorType>(shape, MakeUnrolledDescriptor(shape, Tuple<>{}));
 }

 // Layout helpers
 // get
-// Get dim (could be returned from get with empty Idxs)
+
 /**
 * \private
+ * \brief Get dim.
+ *
+ * \param dim Dimension.
+ * \return The same dimension.
 */
 template <typename T>
 __host__ __device__ T constexpr get(const T& dim)
@@ -178,7 +194,7 @@ __host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout)
        },
        Number<old_shape_dims>{});

-    const auto& flatten_desc = layout.GetUnnestedDescriptor();
+    const auto& flatten_desc = layout.GetUnrolledDescriptor();
    auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
    return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
 }
@@ -197,9 +213,12 @@ __host__ __device__ constexpr auto get(const T& elem)
 }

 // size
-// Get dim size (could be returned from get function)
 /**
 * \private
+ * \brief Get size.
+ *
+ * \param dim Size.
+ * \return The same size.
 */
 template <typename T>
 __host__ __device__ T constexpr size(const T& dim)
@@ -214,8 +233,8 @@ __host__ __device__ T constexpr size(const T& dim)
 * \param layout Layout to get Shape of.
 * \return Requsted length.
 */
-template <index_t idx, typename Shape, typename UnnestedDescriptorType>
-__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout)
+template <index_t idx, typename Shape, typename UnrolledDescriptorType>
+__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
 {
    return layout.template GetLength<idx>();
 }
@@ -240,8 +259,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
 * \param layout Layout to calculate shape size.
 * \return Requsted size.
 */
-template <typename Shape, typename UnnestedDescriptorType>
-__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout)
+template <typename Shape, typename UnrolledDescriptorType>
+__host__ __device__ constexpr auto size(const Layout<Shape, UnrolledDescriptorType>& layout)
 {
    return layout.GetLengths();
 }
@@ -280,9 +299,9 @@ __host__ __device__ constexpr auto size(const T& elem)
 * \param layout Layout to calculate rank.
 * \return Requsted rank.
 */
-template <typename Shape, typename UnnestedDescriptorType>
+template <typename Shape, typename UnrolledDescriptorType>
 __host__ __device__ constexpr auto
-rank([[maybe_unused]] const Layout<Shape, UnnestedDescriptorType>& layout)
+rank([[maybe_unused]] const Layout<Shape, UnrolledDescriptorType>& layout)
 {
    return Shape::Size();
 }
@@ -302,17 +321,25 @@ __host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& t

 /**
 * \private
+ * \brief Rank for scalar
+ *
+ * \param dim Dimension scalar.
+ * \return Always 1.
 */
 template <index_t IDim>
-__host__ __device__ constexpr index_t rank(const Number<IDim>&)
+__host__ __device__ constexpr index_t rank([[maybe_unused]] const Number<IDim>& dim)
 {
    return 1;
 }

 /**
 * \private
+ * \brief Rank for scalar
+ *
+ * \param dim Dimension scalar.
+ * \return Always 1.
 */
-__host__ __device__ constexpr index_t rank(const index_t&) { return 1; }
+__host__ __device__ constexpr index_t rank([[maybe_unused]] const index_t& dim) { return 1; }

 /**
 * \brief Hierarchical rank.
@@ -334,8 +361,8 @@ __host__ __device__ constexpr auto rank(const T& elem)
 * \param layout Layout to calculate depth.
 * \return Requsted depth.
 */
-template <typename Shape, typename UnnestedDescriptorType>
-__host__ __device__ constexpr auto depth(const Layout<Shape, UnnestedDescriptorType>& layout)
+template <typename Shape, typename UnrolledDescriptorType>
+__host__ __device__ constexpr auto depth(const Layout<Shape, UnrolledDescriptorType>& layout)
 {
    const auto& shape = layout.GetShape();
    return TupleDepth(shape);
@@ -355,17 +382,25 @@ __host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)

 /**
 * \private
+ * \brief Depth for scalar
+ *
+ * \param dim Scalar.
+ * \return Always 0.
 */
 template <index_t IDim>
-__host__ __device__ constexpr index_t depth(const Number<IDim>&)
+__host__ __device__ constexpr index_t depth([[maybe_unused]] const Number<IDim>& dim)
 {
    return 0;
 }

 /**
 * \private
+ * \brief Depth for scalar
+ *
+ * \param dim Scalar.
+ * \return Always 0.
 */
-__host__ __device__ constexpr index_t depth(const index_t&) { return 0; }
+__host__ __device__ constexpr index_t depth([[maybe_unused]] const index_t& dim) { return 0; }

 /**
 * \brief Hierarchical depth.
--- a/include/ck/wrapper/utils/tensor_partition.hpp
+++ b/include/ck/wrapper/utils/tensor_partition.hpp
@@ -6,12 +6,22 @@
 #include "tensor_utils.hpp"
 #include "layout_utils.hpp"

+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+
 namespace ck {
 namespace wrapper {

 namespace {
-// Calculate shape for partition based on number of threads per each dim and
-// previous shape
+
+/**
+ * \brief Calculate shape for partition based on number of threads per each dim and
+ * previous shape
+ *
+ * \param shape Base tensor shape.
+ * \param thread_lengths Tuple of thread lengths.
+ * \return Partition shape.
+ */
 template <typename... Ts, typename... Ls>
 __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape,
                                                                const Tuple<Ls...>& thread_lengths)
@@ -20,265 +30,165 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts..
    return generate_tuple(
        [&](auto i) {
            constexpr auto num_i = Number<i>{};
-            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
-            {
-                // if tuple then recurrence
-                return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i));
-            }
-            else
-            {
-                const auto slice_len = shape.At(num_i) / thread_lengths.At(num_i);
-                return slice_len;
-            }
-        },
-        Number<Tuple<Ts...>::Size()>{});
-}
-
-// Calculate shape for partition based on number of threads per each dim,
-// previous strides and steps
-template <typename... Ts, typename... Ls, typename... Steps, typename FlattenDescType>
-__host__ __device__ constexpr auto
-CalculateLocalPartitionDescriptor(const Tuple<Ts...>& shape,
-                                  const Tuple<Ls...>& thread_lengths,
-                                  const Tuple<Steps...>& steps,
-                                  const FlattenDescType& flatten_desc)
-{
-
-    static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
-    const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths);
-    const auto unrolled_shape          = UnrollNestedTuple(shape);
-    constexpr auto dims                = decltype(unrolled_thread_lengths)::Size();
-
-    using UnrolledStepsType = decltype(UnrollNestedTuple(steps));
-
-    using I1 = Number<1>;
-
-    const auto transforms = generate_tuple(
-        [&](auto i) {
-            constexpr auto num_i = Number<i>{};
-            if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
-            {
-                // By default raked partition
-                const auto partition_stride = unrolled_thread_lengths.At(num_i);
-                return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
-                                            make_tuple(partition_stride));
-            }
-            else if constexpr(!is_same_v<tuple_element_t<i.value, UnrolledStepsType>, index_t>)
-            {
-                // Compiletime partition
-                if constexpr(is_same_v<tuple_element_t<i.value, UnrolledStepsType>, I1>)
-                {
-                    // raked
-                    const auto partition_stride = unrolled_thread_lengths.At(num_i);
-                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
-                                                make_tuple(partition_stride));
-                }
-                else
-                {
-                    // packed
-                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
-                                                make_tuple(I1{}));
-                }
-            }
-            else
-            {
-                // Runtime partition
-                if(steps.At(num_i) == 1)
-                {
-                    // raked
-                    const auto partition_stride = unrolled_thread_lengths.At(num_i);
-                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
-                                                make_tuple(partition_stride));
-                }
-                else
-                {
-                    // packed
-                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
-                                                make_tuple(I1{}));
-                }
-            }
-        },
-        Number<dims>{});
-
-    const auto lower_dims =
-        generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
-    const auto upper_dims =
-        generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
-    return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
-}
-
-template <typename... Ls, typename... Steps>
-__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple<Ls...>& thread_lengths,
-                                                                const Tuple<Steps...>& steps,
-                                                                index_t& thread_id)
-{
-    return generate_tuple(
-        [&](auto i) {
-            constexpr auto num_i = Number<i>{};
-            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ls...>>>::value)
-            {
-                // if tuple then recurrence
-                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
-                {
-                    return CalculateLayoutOffsetIdxImpl(
-                        thread_lengths.At(num_i), Tuple<>{}, thread_id);
-                }
-                else
-                {
-                    return CalculateLayoutOffsetIdxImpl(
-                        thread_lengths.At(num_i), steps.At(num_i), thread_id);
-                }
-            }
-            else
-            {
-                // Update thread_id after each dim
-                const auto dim_thread_id = thread_id % thread_lengths.At(num_i);
-                thread_id /= thread_lengths.At(num_i);
-                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
-                {
-                    return dim_thread_id;
-                }
-                else
-                {
-                    // Apply step
-                    return steps.At(num_i) * dim_thread_id;
-                }
-            }
+            const auto slice_len = size<num_i>(shape) / thread_lengths.At(num_i);
+            return slice_len;
        },
        Number<Tuple<Ls...>::Size()>{});
 }

-// Convert integer thread_idx to tuple index with steps applied
-template <typename... Ls, typename... Steps>
-__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths,
-                                                            const Tuple<Steps...>& steps,
-                                                            const index_t thread_id)
+/**
+ * \brief Calculate total number of blocks.
+ *
+ * \param shape Base tensor shape.
+ * \param tile_shape Tile shape.
+ * \return Tuple with the number of blocks in each dimension.
+ */
+template <typename... Ts, typename... Ls>
+__host__ __device__ constexpr auto CalculateGridSize(const Tuple<Ts...>& shape,
+                                                     const Tuple<Ls...>& tile_shape)
 {
-    // Create tmp thread_id copy for CalculateLayoutOffsetIdxImpl updates
-    index_t thread_id_copy = thread_id;
-    return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy);
+    static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
+    return generate_tuple([&](auto i) { return size<i>(shape) / size<i>(tile_shape); },
+                          Number<Tuple<Ls...>::Size()>{});
 }

-// Apply steps to index represented as tuple
-template <typename... Steps, typename... Idxs>
-__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...>& steps,
-                                                            const Tuple<Idxs...>& block_idxs)
+/**
+ * \brief Calculate scaled offset for new partition/tile.
+ *
+ * \param thread_idxs Multi index of the thread (or block).
+ * \param partition_lengths_seq Sequence of partition shape.
+ * \param old_offset_idxs Multi index offset from base tensor to shift values.
+ * \return Offset multi index (thread_idxs * partition_lengths_seq + old_offset_idxs).
+ */
+template <typename ThreadIdxs, typename PartitionLengthsSeq, typename OldOffsetIdxs>
+__host__ __device__ constexpr auto
+CalculateOffsetMultiIdxs(const ThreadIdxs& thread_idxs,
+                         const PartitionLengthsSeq& partition_lengths_seq,
+                         const OldOffsetIdxs& old_offset_idxs)
 {
-    return generate_tuple(
-        [&](auto i) {
-            constexpr auto num_i = Number<i>{};
-            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Idxs...>>>::value)
-            {
-                // if tuple then recurrence
-                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
-                {
-                    return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i));
-                }
-                else
-                {
-                    return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i));
-                }
-            }
-            else
-            {
-                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
-                {
-                    return block_idxs.At(num_i);
-                }
-                else
-                {
-                    // apply step
-                    return steps.At(num_i) * block_idxs.At(num_i);
-                }
-            }
-        },
-        Number<Tuple<Idxs...>::Size()>{});
+    return thread_idxs * partition_lengths_seq + old_offset_idxs;
 }

-// User passes only shape per block to the make_local_tile function. This function calculates
-// block layout based on the shape.
-template <typename... Ts, typename... BlockDims>
-__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& shape,
-                                                         const Tuple<BlockDims...>& tile_shape)
-{
-    return generate_tuple(
-        [&](auto i) {
-            constexpr auto num_i = Number<i>{};
-            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
-            {
-                // if tuple then recurrence
-                return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i));
-            }
-            else
-            {
-                return shape.At(num_i) / tile_shape.At(num_i);
-            }
-        },
-        Number<Tuple<Ts...>::Size()>{});
-}
 } // namespace

 /**
- * \brief Create local partition for thread.
+ * \brief Create local partition for thread (currently only packed partitions
+ * are supported).
 *
 * \param tensor Tensor for partition.
- * \param thread_lengths Layout of threads.
+ * \param thread_lengths Layout of threads (must not be nested).
 * \param thread_id Thread index represented as integer.
- * \param steps Thread step (default=1, raked partition)
 * \return Partition tensor.
 */
-template <typename TensorType, typename ThreadLengthsTuple, typename StepsTuple = Tuple<>>
-__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor,
-                                                        const ThreadLengthsTuple& thread_lengths,
-                                                        const index_t thread_id,
-                                                        const StepsTuple steps = StepsTuple{})
+template <typename TensorType, typename ThreadLengthsTuple>
+__host__ __device__ constexpr auto
+make_local_partition(TensorType& tensor,
+                     [[maybe_unused]] const ThreadLengthsTuple& thread_lengths,
+                     const index_t thread_id)
 {
-    // Create shape, strides and layout for new partition tensor
-    const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths);
-    // Create new descriptor and layout
-    const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor();
-    auto partition_desc =
-        CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc);
-    const auto partition_layout = Layout<decltype(partition_shape), decltype(partition_desc)>(
-        partition_shape, partition_desc);
-    // Calculate offset for new partition tensor
-    const auto offset_idx       = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id);
-    const auto partition_offset = layout(tensor)(offset_idx);
-    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset,
-                                                             partition_layout);
+    static_assert(!IsNestedTuple(ThreadLengthsTuple{}));
+    // Calculate new partition shape
+    const auto& tensor_shape = shape(tensor);
+    constexpr auto partition_shape =
+        CalculateLocalPartitionShape(decltype(tensor_shape){}, ThreadLengthsTuple{});
+    // Create Thread Cluster Descriptor
+    constexpr auto partition_lengths_seq = generate_sequence_v2(
+        [&](auto I) { return size<I>(partition_shape); }, Number<ThreadLengthsTuple::Size()>{});
+    constexpr auto thread_lengths_seq =
+        generate_sequence_v2([&](auto I) { return size<I>(ThreadLengthsTuple{}); },
+                             Number<ThreadLengthsTuple::Size()>{});
+    constexpr auto thread_cluster_desc_ = make_cluster_descriptor(thread_lengths_seq);
+    // Calculate thread idxs and offsets
+    const auto thread_idxs = thread_cluster_desc_.CalculateBottomIndex(make_multi_index(thread_id));
+    const auto offset_multi_idxs =
+        CalculateOffsetMultiIdxs(thread_idxs, partition_lengths_seq, tensor.GetMultiIdxOffsets());
+    // Create new layout and tensor
+    auto& flatten_desc = layout(tensor).GetUnrolledDescriptor();
+    const auto partition_layout =
+        Layout<remove_reference_t<decltype(partition_shape)>, decltype(flatten_desc)>(
+            partition_shape, flatten_desc);
+    auto partition_tensor =
+        make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), partition_layout);
+    // Apply offsets
+    partition_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
+    return partition_tensor;
 }

 /**
- * \brief Create local tile for thread block.
+ * \brief Create local tile for thread block (currently only packed tiles
+ * are supported).
+ *
+ * \note For now, use a 2d tile_shape to get the best
+ * performance.
+ *
 *
 * \param tensor Tensor for partition.
 * \param tile_shape Shapes of requested tile.
- * \param block_idx Block index represented as tuple.
- * \param steps Block step (default=1, raked partition)
+ * \param block_id Block index represented as integer.
+ *
 * \return Tile tensor.
 */
-template <typename TensorType,
-          typename BlockShapeTuple,
-          typename BlockIdxTuple,
-          typename StepsTuple = Tuple<>>
-__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
-                                                   const BlockShapeTuple& tile_shape,
-                                                   const BlockIdxTuple& block_idx,
-                                                   const StepsTuple steps = StepsTuple{})
+template <typename TensorType, typename BlockShapeTuple>
+__host__ __device__ constexpr auto
+make_local_tile(const TensorType& tensor, const BlockShapeTuple& tile_shape, const index_t block_id)
 {
-    // Create block lengths, strides and layout for new tile tensor
-    const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape);
-    // Create new descriptor and layout
-    const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor();
-    auto tile_desc =
-        CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc);
-    const auto tile_layout = Layout<remove_reference_t<decltype(tile_shape)>, decltype(tile_desc)>(
-        tile_shape, tile_desc);
-    // Calculate offset for new partition tensor
-    const auto offset_idx  = CalculateLayoutOffsetIdx(steps, block_idx);
-    const auto tile_offset = layout(tensor)(offset_idx);
-    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + tile_offset,
-                                                             tile_layout);
+    static_assert(!IsNestedTuple(BlockShapeTuple{}));
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+
+    auto& aligned_desc = layout(tensor).GetMergedNestingDescriptor();
+
+    if constexpr(BlockShapeTuple::Size() == I2)
+    {
+        // Optimized version for 2d tile shape [MxK]
+        const auto block_2_tile_map =
+            BlockToCTileMap_M00_N0_M01Adapt<BlockShapeTuple{}.At(I0),
+                                            BlockShapeTuple{}.At(I1),
+                                            remove_cvref_t<decltype(aligned_desc)>>(aligned_desc);
+        const auto block_work_idx =
+            block_2_tile_map.CalculateBottomIndex(make_multi_index(block_id));
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * size<0>(tile_shape));
+        const index_t k_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * size<1>(tile_shape));
+        const auto offset_multi_idxs =
+            make_tuple(m_block_data_idx_on_grid, k_block_data_idx_on_grid);
+        // Create new layout and tensor
+        const auto tile_layout =
+            Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
+                                                                                     aligned_desc);
+        auto tile_tensor =
+            make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
+        // Apply offsets
+        tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
+        return tile_tensor;
+    }
+    else
+    {
+        // Calculate offsets
+        // Sequence with data to process per block
+        constexpr auto tile_shape_seq =
+            generate_sequence_v2([](auto I) { return size(BlockShapeTuple{}.At(I)); },
+                                 Number<BlockShapeTuple::Size()>{});
+        // Tuple with number of blocks
+        const auto block_lengths           = CalculateGridSize(shape(tensor), tile_shape);
+        constexpr auto block_cluster_desc_ = make_cluster_descriptor(block_lengths);
+        const auto block_idxs =
+            block_cluster_desc_.CalculateBottomIndex(make_multi_index(block_id));
+        const auto offset_multi_idxs =
+            CalculateOffsetMultiIdxs(block_idxs, tile_shape_seq, tensor.GetMultiIdxOffsets());
+        // Create new layout and tensor
+        const auto tile_layout =
+            Layout<remove_reference_t<decltype(tile_shape)>, decltype(aligned_desc)>(tile_shape,
+                                                                                     aligned_desc);
+        auto tile_tensor =
+            make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer(), tile_layout);
+        // Apply offsets
+        tile_tensor.SetMultiIdxOffset(to_multi_index(offset_multi_idxs));
+        return tile_tensor;
+    }
 }

 } // namespace wrapper
--- a/include/ck/wrapper/utils/tensor_utils.hpp
+++ b/include/ck/wrapper/utils/tensor_utils.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -10,6 +10,7 @@
 #include "ck/utility/tuple_helper.hpp"
 #include "ck/utility/dynamic_buffer.hpp"
 #include "ck/utility/amd_address_space.hpp"
+#include "ck/utility/multi_index.hpp"

 namespace ck {
 namespace wrapper {
@@ -27,16 +28,12 @@ using MemoryTypeEnum = AddressSpaceEnum;
 // Disable from doxygen docs generation
 /// @cond
 // forward declarations
-template <typename Shape, typename UnnestedDescriptorType>
+template <typename Shape, typename UnrolledDescriptorType>
 struct Layout;
 template <MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
-          typename UnnestedDescriptorType,
-          index_t NumVectors,     // params for Register memory
-          index_t ScalarPerVector // param for Register memory
-          >
-
+          typename UnrolledDescriptorType>
 struct Tensor;

 template <typename FromType, typename ToType>
@@ -45,13 +42,22 @@ struct Slice
    __host__ __device__ constexpr Slice() : from_(), to_() {}
    __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}

+    /**
+     * \brief Calculate slice range.
+     *
+     * \param dim Dimension size.
+     * \return Slice range.
+     */
    template <typename T>
    __host__ __device__ constexpr auto range(const T& dim) const
    {
        if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
                     is_same_v<T, index_t>)
        {
-            assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range");
+            if(!(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_)))
+            {
+                throw std::runtime_error("Invalid range");
+            }
            if(to_ < 0)
            {
                return dim - from_ + to_ + 1;
@@ -101,40 +107,27 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
 template <MemoryTypeEnum MemoryType,
          typename ElementType,
          typename Shape,
-          typename UnnestedDescriptorType>
+          typename UnrolledDescriptorType>
 constexpr auto make_tensor(ElementType* pointer,
-                           const Layout<Shape, UnnestedDescriptorType>& layout)
+                           const Layout<Shape, UnrolledDescriptorType>& layout)
 {
-    return Tensor<MemoryType,
-                  ElementType,
-                  Shape,
-                  UnnestedDescriptorType,
-                  0 /*NumVectors*/,
-                  0 /*ScalarPerVector*/>(pointer, layout);
+    return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(pointer, layout);
 }

 /**
 * \brief Make SGPR or VGPR tensor function.
 *
 * \tparam MemoryType Type of memory.
- * \tparam NumVectors Number of vectors.
- * \tparam ScalarPerVector Scalars per vector.
 * \tparam ElementType Memory data type.
 * \return Constructed tensor.
 */
 template <MemoryTypeEnum MemoryType,
-          index_t NumVectors,
-          index_t ScalarPerVector,
-          typename ElementType>
-constexpr auto make_register_tensor()
+          typename ElementType,
+          typename Shape,
+          typename UnrolledDescriptorType>
+constexpr auto make_register_tensor(const Layout<Shape, UnrolledDescriptorType>& layout)
 {
-    const auto layout = make_layout(make_tuple(Number<NumVectors>{}), make_tuple(Number<1>{}));
-    return Tensor<MemoryType,
-                  ElementType,
-                  Tuple<Number<NumVectors>>,
-                  std::remove_const_t<remove_reference_t<decltype(layout.GetUnnestedDescriptor())>>,
-                  NumVectors,
-                  ScalarPerVector>(layout);
+    return Tensor<MemoryType, ElementType, Shape, UnrolledDescriptorType>(layout);
 }

 /**
@@ -146,15 +139,9 @@ constexpr auto make_register_tensor()
 template <MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
-          typename UnnestedDescriptorType,
-          index_t NumVectors,
-          index_t ScalarPerVector>
-__host__ __device__ constexpr const auto& layout(const Tensor<BufferAddressSpace,
-                                                              ElementType,
-                                                              Shape,
-                                                              UnnestedDescriptorType,
-                                                              NumVectors,
-                                                              ScalarPerVector>& tensor)
+          typename UnrolledDescriptorType>
+__host__ __device__ constexpr const auto&
+layout(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
 {
    return tensor.GetLayout();
 }
@@ -170,15 +157,9 @@ template <index_t... Idxs,
          MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
-          typename UnnestedDescriptorType,
-          index_t NumVectors,
-          index_t ScalarPerVector>
-__host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace,
-                                                     ElementType,
-                                                     Shape,
-                                                     UnnestedDescriptorType,
-                                                     NumVectors,
-                                                     ScalarPerVector>& tensor)
+          typename UnrolledDescriptorType>
+__host__ __device__ constexpr auto
+size(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
 {
    return size<Idxs...>(tensor.GetLayout());
 }
@@ -194,15 +175,9 @@ template <index_t... Idxs,
          MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
-          typename UnnestedDescriptorType,
-          index_t NumVectors,
-          index_t ScalarPerVector>
-__host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace,
-                                                     ElementType,
-                                                     Shape,
-                                                     UnnestedDescriptorType,
-                                                     NumVectors,
-                                                     ScalarPerVector>& tensor)
+          typename UnrolledDescriptorType>
+__host__ __device__ constexpr auto
+rank(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
 {
    return rank<Idxs...>(tensor.GetLayout());
 }
@@ -218,15 +193,9 @@ template <index_t... Idxs,
          MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
-          typename UnnestedDescriptorType,
-          index_t NumVectors,
-          index_t ScalarPerVector>
-__host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
-                                                      ElementType,
-                                                      Shape,
-                                                      UnnestedDescriptorType,
-                                                      NumVectors,
-                                                      ScalarPerVector>& tensor)
+          typename UnrolledDescriptorType>
+__host__ __device__ constexpr auto
+depth(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
 {
    return depth<Idxs...>(tensor.GetLayout());
 }
@@ -240,15 +209,9 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
 template <MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
-          typename UnnestedDescriptorType,
-          index_t NumVectors,
-          index_t ScalarPerVector>
-__host__ __device__ constexpr const auto& shape(const Tensor<BufferAddressSpace,
-                                                             ElementType,
-                                                             Shape,
-                                                             UnnestedDescriptorType,
-                                                             NumVectors,
-                                                             ScalarPerVector>& tensor)
+          typename UnrolledDescriptorType>
+__host__ __device__ constexpr const auto&
+shape(const Tensor<BufferAddressSpace, ElementType, Shape, UnrolledDescriptorType>& tensor)
 {
    return shape(tensor.GetLayout());
 }