From f60cd9d7a6911f30b412a6405f0041221bc64ea9 Mon Sep 17 00:00:00 2001
From: Sam Wu <sam.wu2@amd.com>
Date: Tue, 5 Dec 2023 11:05:55 -0700
Subject: [PATCH 01/18] Standardize documentation for ReadtheDocs (#1057)

Relates to https://github.com/RadeonOpenCompute/rocm-docs-core/issues/330
---
 .github/dependabot.yml       |  6 ++++++
 .gitignore                   |  1 -
 .readthedocs.yaml            | 10 +++++-----
 docs/conf.py                 | 27 +++++++++++++++++++--------
 docs/doxygen/Doxyfile        |  2 +-
 docs/sphinx/_toc.yml.in      |  6 +++---
 docs/sphinx/requirements.in  |  2 +-
 docs/sphinx/requirements.txt |  6 ++----
 8 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 276690bd4f..0e0a252eb6 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -10,3 +10,9 @@ updates:
     open-pull-requests-limit: 10
     schedule:
       interval: "daily"
+    labels:
+      - "documentation"
+      - "dependencies"
+      - "ci:docs-only"
+    reviewers:
+      - "samjwu"
diff --git a/.gitignore b/.gitignore
index 7af066c82d..340f11cbd2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,5 +54,4 @@ _images/
 _static/
 _templates/
 _toc.yml
-docBin/
 _doxygen/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 5f50df2525..9e6678abe5 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -3,11 +3,6 @@
 
 version: 2
 
-build:
-   os: ubuntu-22.04
-   tools:
-      python: "3.8"
-
 sphinx:
    configuration: docs/conf.py
 
@@ -16,3 +11,8 @@ formats: [htmlzip, pdf, epub]
 python:
    install:
    - requirements: docs/sphinx/requirements.txt
+
+build:
+   os: ubuntu-22.04
+   tools:
+      python: "3.8"
diff --git a/docs/conf.py b/docs/conf.py
index 0de590da1a..e441ff1ced 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -4,23 +4,34 @@
 # list see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 
-import subprocess
+import re
 
 from rocm_docs import ROCmDocs
 
+html_theme_options = {"flavor": "list"}
 
-name = "Composable Kernel"
-get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
-version = subprocess.getoutput(get_version)
-if len(version) > 0:
-    name = f"{name} {version}"
+with open('../CMakeLists.txt', encoding='utf-8') as f:
+    match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read())
+    if not match:
+        raise ValueError("VERSION not found!")
+    version_number = match[1]
+left_nav_title = f"Composable Kernel {version_number} Documentation"
+
+# for PDF output on Read the Docs
+project = "Composable Kernel Documentation"
+author = "Advanced Micro Devices, Inc."
+copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
+version = version_number
+release = version_number
 
 external_toc_path = "./sphinx/_toc.yml"
 
-docs_core = ROCmDocs(f"{name} Documentation")
-docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
+docs_core = ROCmDocs(left_nav_title)
+docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
 docs_core.setup()
 
+external_projects_current_project = "composable_kernel"
+
 mathjax3_config = {
 'tex': {
     'macros': {
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 1084f94c81..2594422095 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -58,7 +58,7 @@ PROJECT_LOGO           =
 # entered, it will be relative to the location where doxygen was started. If
 # left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       = docBin
+OUTPUT_DIRECTORY       = .
 
 # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 83dd1e7b1a..c37ba29cec 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -5,6 +5,6 @@ defaults:
   maxdepth: 6
 root: index
 subtrees:
-  - caption: About
-    entries:
-      - file: license
+- caption: About
+  entries:
+    - file: license
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index c4ce8be79a..f5ee431e7d 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core>=0.20.0
+rocm-docs-core==0.29.0
 sphinxcontrib-bibtex==2.6.1
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 5852315958..0442ae9a2b 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -96,9 +96,7 @@ pygments==2.14.0
     #   pydata-sphinx-theme
     #   sphinx
 pyjwt[crypto]==2.6.0
-    # via
-    #   pygithub
-    #   pyjwt
+    # via pygithub
 pynacl==1.5.0
     # via pygithub
 pytz==2023.3.post1
@@ -113,7 +111,7 @@ requests==2.28.2
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.27.0
+rocm-docs-core==0.29.0
     # via -r requirements.in
 six==1.16.0
     # via

From 836b7e557d028cc2d7c6b341352253fd81003e54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Wed, 6 Dec 2023 11:58:59 +0100
Subject: [PATCH 02/18] Introduce wrapper library (#1071)

* Introduce wrapper library

* Update cmake files

* Revert "Update cmake files"

This reverts commit c27f88b56590c11a88e26d5d0df7aca51a08133d.

* Fix comments
---
 CHANGELOG.md                                  |   1 +
 .../25_tensor_transforms/CMakeLists.txt       |   4 +
 .../tensor_transform.cpp                      |   0
 .../tensor_transform_using_wrapper.cpp        |  31 +-
 docs/doxygen/Doxyfile                         |   4 +-
 docs/index.rst                                |   2 +
 docs/wrapper.rst                              |  54 ++
 example/64_tensor_transforms/CMakeLists.txt   |   2 -
 include/ck/utility/tuple_helper.hpp           |  12 +
 .../ck/wrapper/layout.hpp                     | 181 ++-----
 include/ck/wrapper/layout_utils.hpp           | 321 ++++++++++++
 test/CMakeLists.txt                           |   1 +
 test/wrapper/CMakeLists.txt                   |   2 +
 test/wrapper/test_layout.cpp                  | 481 ++++++++++++++++++
 14 files changed, 945 insertions(+), 151 deletions(-)
 create mode 100644 client_example/25_tensor_transforms/CMakeLists.txt
 rename {example/64_tensor_transforms => client_example/25_tensor_transforms}/tensor_transform.cpp (100%)
 rename {example/64_tensor_transforms => client_example/25_tensor_transforms}/tensor_transform_using_wrapper.cpp (74%)
 create mode 100644 docs/wrapper.rst
 delete mode 100644 example/64_tensor_transforms/CMakeLists.txt
 rename example/64_tensor_transforms/tensor_transform_wrapper.hpp => include/ck/wrapper/layout.hpp (68%)
 create mode 100644 include/ck/wrapper/layout_utils.hpp
 create mode 100644 test/wrapper/CMakeLists.txt
 create mode 100644 test/wrapper/test_layout.cpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e46a4ab4b..3da22fc790 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@ None
 - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
 - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
 - Support for Batched Gemm DL (#732)
+- Introduce wrapper sublibrary (limited functionality) (#1071)
 
 ### Changes
  - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
diff --git a/client_example/25_tensor_transforms/CMakeLists.txt b/client_example/25_tensor_transforms/CMakeLists.txt
new file mode 100644
index 0000000000..d1543fb0ef
--- /dev/null
+++ b/client_example/25_tensor_transforms/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(client_tensor_transform tensor_transform.cpp)
+target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations)
+add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
+target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
diff --git a/example/64_tensor_transforms/tensor_transform.cpp b/client_example/25_tensor_transforms/tensor_transform.cpp
similarity index 100%
rename from example/64_tensor_transforms/tensor_transform.cpp
rename to client_example/25_tensor_transforms/tensor_transform.cpp
diff --git a/example/64_tensor_transforms/tensor_transform_using_wrapper.cpp b/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp
similarity index 74%
rename from example/64_tensor_transforms/tensor_transform_using_wrapper.cpp
rename to client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp
index df2449e99d..de9fcde0b4 100644
--- a/example/64_tensor_transforms/tensor_transform_using_wrapper.cpp
+++ b/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp
@@ -9,7 +9,7 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/utility/sequence.hpp"
 
-#include "tensor_transform_wrapper.hpp"
+#include "ck/wrapper/layout.hpp"
 
 using DataType = int;
 
@@ -17,7 +17,7 @@ template <typename Layout>
 void Print1d(const Layout& layout)
 {
     std::cout << "Print1d" << std::endl;
-    for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size(layout); w++)
+    for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++)
     {
         std::cout << layout(ck::make_tuple(w)) << " ";
     }
@@ -28,9 +28,9 @@ template <typename Layout>
 void Print2d(const Layout& layout)
 {
     std::cout << "Print2d" << std::endl;
-    for(ck::index_t h = 0; h < ck::tensor_transform_wrapper::size<0>(layout); h++)
+    for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
     {
-        for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++)
+        for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
         {
             std::cout << layout(ck::make_tuple(h, w)) << " ";
         }
@@ -43,15 +43,11 @@ template <typename Layout>
 void Print3dCustom(const Layout& layout)
 {
     std::cout << "Print3dCustom" << std::endl;
-    for(ck::index_t d = 0;
-        d < ck::tensor_transform_wrapper::size<0>(ck::tensor_transform_wrapper::get<0>(layout));
-        d++)
+    for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
     {
-        for(ck::index_t h = 0;
-            h < ck::tensor_transform_wrapper::size<1>(ck::tensor_transform_wrapper::get<0>(layout));
-            h++)
+        for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
         {
-            for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++)
+            for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
             {
                 std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " ";
             }
@@ -68,7 +64,7 @@ int main()
     // Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor)
     // (dims:4,8 strides:1,4)
     const auto shape_4x8       = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
-    const auto layout_4x8_s1x4 = ck::tensor_transform_wrapper::make_layout(shape_4x8);
+    const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8);
     std::cout << "dims:4,8 strides:1,4" << std::endl;
     Print2d(layout_4x8_s1x4);
     using Cord1x1Type                = ck::Tuple<ck::Number<1>, ck::Number<1>>;
@@ -77,10 +73,9 @@ int main()
 
     // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor)
     // dims:4,(2,4) strides:2,(1,8)
-    const auto shape_4x2x4    = ck::make_tuple(4, ck::make_tuple(2, 4));
-    const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
-    const auto layout_4x2x4_s2x1x8 =
-        ck::tensor_transform_wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
+    const auto shape_4x2x4         = ck::make_tuple(4, ck::make_tuple(2, 4));
+    const auto strides_s2x1x8      = ck::make_tuple(2, ck::make_tuple(1, 8));
+    const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
 
     std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
     Print2d(layout_4x2x4_s2x1x8);
@@ -92,7 +87,7 @@ int main()
     const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}),
                                                  ck::make_tuple(ck::Number<2>{}, ck::Number<8>{}));
     static const auto layout_2x2x2x4_s1x4x2x8 =
-        ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
+        ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
 
     std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
     Print2d(layout_2x2x2x4_s1x4x2x8);
@@ -108,7 +103,7 @@ int main()
         ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}),
         ck::Number<8>{});
     static const auto layout_2x2x2x4_s1x4x2x8_nested =
-        ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
+        ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
 
     std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
     Print1d(layout_2x2x2x4_s1x4x2x8_nested);
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 2594422095..fac9e138e1 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -778,7 +778,9 @@ WARN_LOGFILE           =
 INPUT                  = ../../include/ck/tensor_operation/gpu/grid \
                          ../../include/ck/tensor_operation/gpu/block \
                          ../../include/ck/tensor_operation/gpu/thread \
-                         ../../library/include/ck/library/utility
+                         ../../library/include/ck/library/utility \
+                         ../../include/ck/wrapper
+
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/docs/index.rst b/docs/index.rst
index 51c0c862ae..8c4aaa2b3d 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -34,6 +34,7 @@ Current CK library are structured into 4 layers:
 * "Templated Tile Operators" layer
 * "Templated Kernel and Invoker" layer
 * "Instantiated Kernel and Invoker" layer
+* "Wrapper for tensor transform operations"
 * "Client API" layer
 
 .. image:: data/ck_layer.png
@@ -50,6 +51,7 @@ The following is a list of CK documents in the suggested reading order:
 
    tutorial_hello_world
    dockerhub
+   wrapper
    Supported_Primitives_Guide
    API_Reference_Guide
    Contributors_Guide
diff --git a/docs/wrapper.rst b/docs/wrapper.rst
new file mode 100644
index 0000000000..64fb6a4031
--- /dev/null
+++ b/docs/wrapper.rst
@@ -0,0 +1,54 @@
+===============
+Wrapper
+===============
+
+-------------------------------------
+Description
+-------------------------------------
+
+.. note::
+
+    The wrapper is under development and its functionality is limited.
+
+
+CK provides a lightweight wrapper for more complex operations implemented in 
+the library. It allows indexing of nested layouts using a simple interface 
+(avoiding complex descriptor transformations). 
+
+Example:
+
+.. code-block:: c
+
+    const auto shape_4x2x4         = ck::make_tuple(4, ck::make_tuple(2, 4));
+    const auto strides_s2x1x8      = ck::make_tuple(2, ck::make_tuple(1, 8));
+    const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
+
+    std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
+    for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
+    {
+        for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
+        {
+            std::cout << layout(ck::make_tuple(h, w)) << " ";
+        }
+        std::cout << std::endl;
+    }
+
+Output::
+
+    dims:4,(2,4) strides:2,(1,8)
+    0 1 8 9 16 17 24 25 
+    2 3 10 11 18 19 26 27 
+    4 5 12 13 20 21 28 29 
+    6 7 14 15 22 23 30 31 
+
+-------------------------------------
+Layout
+-------------------------------------
+
+.. doxygenstruct:: ck::wrapper::Layout
+
+-------------------------------------
+Layout helpers
+-------------------------------------
+
+.. doxygenfile:: layout_utils.hpp
diff --git a/example/64_tensor_transforms/CMakeLists.txt b/example/64_tensor_transforms/CMakeLists.txt
deleted file mode 100644
index 9d14a410e3..0000000000
--- a/example/64_tensor_transforms/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_example_executable(example_tensor_transform tensor_transform.cpp)
-add_example_executable(example_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index d7b492fe66..75f2693f20 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -166,4 +166,16 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
     return (is_detected<is_tuple, Ts>::value || ...);
 }
 
+template <index_t depth = 0, typename T>
+__host__ __device__ constexpr auto TupleDepth(const T&)
+{
+    return depth;
+}
+
+template <index_t depth = 0, typename... Ts>
+__host__ __device__ constexpr auto TupleDepth(const Tuple<Ts...>&)
+{
+    return math::max(TupleDepth<depth + 1>(Ts{})...);
+}
+
 } // namespace ck
diff --git a/example/64_tensor_transforms/tensor_transform_wrapper.hpp b/include/ck/wrapper/layout.hpp
similarity index 68%
rename from example/64_tensor_transforms/tensor_transform_wrapper.hpp
rename to include/ck/wrapper/layout.hpp
index 71cd6091f8..b337d88a1a 100644
--- a/example/64_tensor_transforms/tensor_transform_wrapper.hpp
+++ b/include/ck/wrapper/layout.hpp
@@ -3,27 +3,13 @@
 
 #pragma once
 
-#include "ck/ck.hpp"
-
-#include "ck/utility/number.hpp"
-#include "ck/utility/tuple.hpp"
-#include "ck/utility/tuple_helper.hpp"
-#include "ck/utility/sequence.hpp"
-#include "ck/utility/sequence_helper.hpp"
-#include "ck/utility/is_detected.hpp"
-
-#include "ck/tensor_description/tensor_descriptor.hpp"
-#include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/wrapper/layout_utils.hpp"
 
 namespace ck {
-namespace tensor_transform_wrapper {
+namespace wrapper {
 
 /**
- * \brief Layout wrapper
- *
- * \details
- * Layout wrapper that performs the tensor descriptor logic.
+ * \brief Layout wrapper that performs the tensor descriptor logic.
  *
  * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
  *         (dynamic layout). It is possible to pass nested shapes
@@ -32,21 +18,19 @@ namespace tensor_transform_wrapper {
  *         (dynamic layout). Stride tuple should be nested if shape tuple is
  *         nested.
  */
-template <typename Shape, typename Strides = Tuple<>>
+template <typename Shape, typename Strides>
 struct Layout
 {
     private:
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
 
-    template <typename T>
-    using is_tuple = decltype(std::declval<T&>().IsTuple());
-
     // Generate packed (column-major) strides if not passed
     template <typename... Ts>
     __host__ __device__ constexpr static auto
-    GenerateColumnMajorPackedStrides(const Tuple<Ts...>& tuple)
+    GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
     {
+        const auto unrolled_shape = UnrollNestedTuple(shape);
         return generate_tuple(
             [&](auto i) {
                 if constexpr(i.value == 0)
@@ -56,10 +40,10 @@ struct Layout
                 else
                 {
                     return TupleReduce<I0.value, i.value>([](auto x, auto y) { return x * y; },
-                                                          tuple);
+                                                          unrolled_shape);
                 }
             },
-            Number<Tuple<Ts...>::Size()>{});
+            Number<decltype(unrolled_shape)::Size()>{});
     }
 
     // Generate LowerDims in Compile-time for MergeTrasform using passed Type
@@ -112,8 +96,8 @@ struct Layout
     // Example shape:   (2, (2, 2)), 2, (2, 2)
     // Unrolled shape:  2,  (2, 2),  2, (2, 2)
     template <typename... ShapeDims, typename... IdxDims>
-    __host__ __device__ constexpr static auto UnrollShapeViaIdx(const Tuple<ShapeDims...>& shape,
-                                                                const Tuple<IdxDims...>& idx)
+    __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape,
+                                                              const Tuple<IdxDims...>& idx)
     {
         if constexpr(!IsNestedTuple(Tuple<IdxDims...>{}))
         {
@@ -125,7 +109,7 @@ struct Layout
             // Iterate over shape tuple elements:
             // 1. If corresponding idx element is tuple then return (will be unrolled)
             // 2. If no, pack in tuple. It will be restored during unroll.
-            auto unrolled_shape_via_idx = generate_tuple(
+            auto aligned_shape = generate_tuple(
                 [&](auto i) {
                     if constexpr(is_detected<is_tuple,
                                              tuple_element_t<i, Tuple<IdxDims...>>>::value)
@@ -140,8 +124,8 @@ struct Layout
                 Number<Tuple<IdxDims...>::Size()>{});
 
             // Unroll and process next step
-            return UnrollShapeViaIdx(UnrollNestedTuple<0, 1>(unrolled_shape_via_idx),
-                                     UnrollNestedTuple<0, 1>(idx));
+            return AlignShapeToIdx(UnrollNestedTuple<0, 1>(aligned_shape),
+                                   UnrollNestedTuple<0, 1>(idx));
         }
     }
 
@@ -150,27 +134,24 @@ struct Layout
                                                           DescriptorToMerge& desc)
     {
         // Reverse each element in tuple
-        using ReversedUnrolledShape = decltype(TupleReverse(UnrollNestedTuple(shape)));
-        const auto merge_elems      = ReversedUnrolledShape{};
-
+        const auto merge_elems = TupleReverse(UnrollNestedTuple(shape));
         // Generate reverted indexes (column major traverse)
-        using MergeElemsSequence =
-            typename arithmetic_sequence_gen<0, ReversedUnrolledShape::Size(), 1>::type;
-        const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
-        const auto upper_dims = make_tuple(Sequence<0>{});
+        using MergeElemsSequence = typename arithmetic_sequence_gen<0, merge_elems.Size(), 1>::type;
+        const auto lower_dims    = make_tuple(MergeElemsSequence::Reverse());
+        const auto upper_dims    = make_tuple(Sequence<0>{});
         // Merge to 1d
         return transform_tensor_descriptor(
             desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
     }
 
-    // Merge nested shape dims
+    // Merge nested shape dims. Merge nested shape dims when idx is also nested.
     // Input desc shape: 2,  2,  2, 2,  2,  2
     // Example idx:      1,      1, 1,      1
     // Example shape:    2, (2, 2), 2, (2, 2)
     // Merged shape:     2,      4, 2,      4
     template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge>
-    __host__ __device__ constexpr static auto
-    MakeMerges(const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc)
+    __host__ __device__ constexpr static auto CreateMergedDescriptor(
+        const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc)
     {
         const auto transforms = generate_tuple(
             [&](auto i) {
@@ -224,9 +205,9 @@ struct Layout
             static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(),
                           "Idx rank and Shape rank must be the same (except 1d).");
             // Unroll while IdxDims is nested
-            const auto unrolled_shape_via_idx = UnrollShapeViaIdx(shape, idx);
+            const auto aligned_shape = AlignShapeToIdx(shape, idx);
             // Transform correct form of shape
-            return MakeMerges(unrolled_shape_via_idx, UnrollNestedTuple(idx), descriptor_);
+            return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), descriptor_);
         }
     }
 
@@ -234,26 +215,21 @@ struct Layout
     __host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape,
                                                         const LayoutStrides& strides)
     {
-        const auto unrolled_shape = UnrollNestedTuple(shape);
-
-        if constexpr(ck::is_same_v<LayoutStrides, Tuple<>>)
-        {
-            // If shape is packed
-            const auto column_major_packed_strides =
-                GenerateColumnMajorPackedStrides(unrolled_shape);
-            return make_naive_tensor_descriptor(unrolled_shape, column_major_packed_strides);
-        }
-        else
-        {
-            const auto unrolled_strides = UnrollNestedTuple(strides);
-            static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
-                          "Size of strides and shape are not consistent.");
-            return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
-        }
+        const auto unrolled_shape   = UnrollNestedTuple(shape);
+        const auto unrolled_strides = UnrollNestedTuple(strides);
+        static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
+                      "Size of strides and shape are not consistent.");
+        return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
     }
 
     public:
-    using NaiveDescriptorType = remove_cvref_t<decltype(MakeNaiveDescriptor(Shape{}, Strides{}))>;
+    // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`.
+    using DeducedStrides =
+        std::conditional_t<is_same_v<Strides, Tuple<>>,
+                           remove_cvref_t<decltype(GenerateColumnMajorPackedStrides(Shape{}))>,
+                           Strides>;
+    using NaiveDescriptorType =
+        remove_cvref_t<decltype(MakeNaiveDescriptor(Shape{}, DeducedStrides{}))>;
 
     /**
      * \brief Layout constructor.
@@ -268,9 +244,9 @@ struct Layout
         // Construct if runtime mode
         if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
         {
-            // Keep only shape, strides are not need for transforms
             shape_      = shape;
-            descriptor_ = MakeNaiveDescriptor(shape, strides);
+            strides_    = strides;
+            descriptor_ = MakeNaiveDescriptor(shape_, strides_);
         }
     }
 
@@ -279,7 +255,8 @@ struct Layout
         if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
         {
             shape_      = shape;
-            descriptor_ = MakeNaiveDescriptor(shape, Strides{});
+            strides_    = GenerateColumnMajorPackedStrides(shape_);
+            descriptor_ = MakeNaiveDescriptor(shape_, strides_);
         }
     }
 
@@ -338,7 +315,7 @@ struct Layout
      *
      * \return Calculated size.
      */
-    __host__ __device__ constexpr index_t GetLength() const
+    __host__ __device__ constexpr index_t GetLengths() const
     {
         const auto unrolled_shape = UnrollNestedTuple(shape_);
         return TupleReduce<I0.value, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
@@ -346,80 +323,24 @@ struct Layout
     }
 
     /**
-     * \brief Dimension getter.
+     * \brief Shape getter.
      *
-     * \tparam IDim Dimension idx.
-     * \return Calculated size.
+     * \return Shape.
      */
-    template <index_t IDim>
-    __host__ __device__ constexpr auto Get() const
-    {
-        const auto elem = shape_.At(Number<IDim>{});
-        return elem;
-    }
+    __host__ __device__ constexpr Shape GetShape() const { return shape_; }
+
+    /**
+     * \brief Strides getter.
+     *
+     * \return Strides.
+     */
+    __host__ __device__ constexpr DeducedStrides GetStrides() const { return strides_; }
 
     private:
     NaiveDescriptorType descriptor_;
     Shape shape_;
+    DeducedStrides strides_;
 };
 
-// Layout helpers
-// Length getter (product if tuple)
-template <index_t idx, typename Shape, typename Strides>
-__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
-{
-    return layout.template GetLength<idx>();
-}
-
-// Get shape size (product of dims if tuple)
-template <typename... ShapeDims>
-__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
-{
-    using UnrolledShape = decltype(UnrollNestedTuple(shape));
-    return TupleReduce<0, UnrolledShape::Size()>([](auto x, auto y) { return x * y; },
-                                                 UnrolledShape{});
-}
-
-// Get dim size (could be returned from get function)
-template <typename T>
-__host__ __device__ T constexpr size(const T& dim)
-{
-    return dim;
-}
-
-// Get layout size (product of shapes)
-template <typename Shape, typename Strides>
-__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
-{
-    return layout.GetLength();
-}
-
-// Get shape element size
-template <index_t idx, typename... ShapeDims>
-__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
-{
-    return size(shape.At(Number<idx>{}));
-}
-
-// Dim getter (tuple if tuple)
-template <index_t idx, typename Shape, typename Strides>
-__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
-{
-    return layout.template Get<idx>();
-}
-
-template <typename Shape, typename Strides>
-__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
-                                                                 const Strides& strides)
-{
-    return Layout<Shape, Strides>(shape, strides);
-}
-
-template <typename Shape>
-__host__ __device__ constexpr Layout<Shape> make_layout(const Shape& shape)
-{
-    return Layout<Shape>(shape);
-}
-
-} // namespace tensor_transform_wrapper
+} // namespace wrapper
 } // namespace ck
diff --git a/include/ck/wrapper/layout_utils.hpp b/include/ck/wrapper/layout_utils.hpp
new file mode 100644
index 0000000000..fac8f33854
--- /dev/null
+++ b/include/ck/wrapper/layout_utils.hpp
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+
+#include "ck/utility/number.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/utility/tuple_helper.hpp"
+#include "ck/utility/sequence.hpp"
+#include "ck/utility/sequence_helper.hpp"
+#include "ck/utility/is_detected.hpp"
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+
+namespace ck {
+namespace wrapper {
+
+// Disable from doxygen docs generation
+/// @cond
+// forward declaration
+template <typename Shape, typename Strides = Tuple<>>
+struct Layout;
+
+template <typename T>
+using is_tuple = decltype(std::declval<T&>().IsTuple());
+/// @endcond
+
+// make_*
+/**
+ * \brief Make layout function.
+ *
+ * \tparam Shape Shape for layout.
+ * \tparam Strides Strides for layout.
+ * \return Constructed layout.
+ */
+template <typename Shape, typename Strides>
+__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
+                                                                 const Strides& strides)
+{
+    return Layout<Shape, Strides>(shape, strides);
+}
+
+/**
+ * \brief Make layout function with packed strides
+ *        (column-major).
+ *
+ * \tparam Shape Shape for layout.
+ * \return Constructed layout.
+ */
+template <typename Shape>
+__host__ __device__ constexpr Layout<Shape> make_layout(const Shape& shape)
+{
+    return Layout<Shape>(shape);
+}
+
+// Layout helpers
+// get
+/**
+ * \brief Get element from tuple (Shape/Strides/Idxs).
+ *
+ * \tparam idx Index to lookup.
+ * \param tuple Tuple to lookup.
+ * \return Requsted element.
+ */
+template <index_t idx, typename... Dims>
+__host__ __device__ constexpr auto get(const Tuple<Dims...>& tuple)
+{
+    return tuple.At(Number<idx>{});
+}
+
+/**
+ * \brief Get sub layout.
+ *
+ * \tparam idx Index to lookup.
+ * \param layout Layout to create sub layout.
+ * \return Requsted sub layout.
+ */
+template <index_t idx, typename Shape, typename Strides>
+__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
+{
+    const auto new_shape = get<idx>(layout.GetShape());
+    static_assert(is_detected<is_tuple, decltype(new_shape)>::value,
+                  "Shape of sub layout must be tuple");
+    if constexpr(is_same_v<Strides, Tuple<>>)
+    {
+        // If stride not passed, create without strides
+        return make_layout(new_shape);
+    }
+    else
+    {
+        const auto new_strides = get<idx>(layout.GetStrides());
+        static_assert(is_detected<is_tuple, decltype(new_strides)>::value,
+                      "Strides of sub layout must be tuple");
+        return make_layout(new_shape, new_strides);
+    }
+}
+
+/**
+ * \brief Hierarchical get.
+ *
+ * \tparam Idxs Indexes to lookup.
+ * \param elem Element to lookup.
+ * \return Requsted element.
+ */
+template <index_t Idx, index_t... Idxs, typename T>
+__host__ __device__ constexpr auto get(const T& elem)
+{
+    return get<Idxs...>(get<Idx>(elem));
+}
+
+// size
+/**
+ * \brief Length get (product if tuple).
+ *
+ * \tparam idx Index to lookup.
+ * \param layout Layout to get Shape.
+ * \return Requsted length.
+ */
+template <index_t idx, typename Shape, typename Strides>
+__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
+{
+    return layout.template GetLength<idx>();
+}
+
+/**
+ * \brief Shape size (product of dims).
+ *
+ * \param shape Shape to lookup.
+ * \return Requsted size.
+ */
+template <typename... ShapeDims>
+__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
+{
+    const auto unrolled_shape = UnrollNestedTuple(shape);
+    return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
+                                                 unrolled_shape);
+}
+
+// Get dim size (could be returned from get function)
+/**
+ * \private
+ */
+template <typename T>
+__host__ __device__ T constexpr size(const T& dim)
+{
+    return dim;
+}
+
+/**
+ * \brief Layout size (product of dims).
+ *
+ * \param layout Layout to calculate shape size.
+ * \return Requsted size.
+ */
+template <typename Shape, typename Strides>
+__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
+{
+    return layout.GetLengths();
+}
+
+/**
+ * \brief Length get from tuple (product if tuple).
+ *
+ * \tparam idx Index to lookup.
+ * \param tuple Tuple to lookup.
+ * \return Requsted length.
+ */
+template <index_t idx, typename... Ts>
+__host__ __device__ constexpr index_t size(const Tuple<Ts...>& tuple)
+{
+    return size(tuple.At(Number<idx>{}));
+}
+
+/**
+ * \brief Hierarchical size.
+ *
+ * \tparam Idxs Indexes to lookup.
+ * \param elem Element to lookup.
+ * \return Requsted element.
+ */
+template <index_t... Idxs, typename T>
+__host__ __device__ constexpr auto size(const T& elem)
+{
+    return size(get<Idxs...>(elem));
+}
+
+// rank
+/**
+ * \brief Get layout rank (num elements in shape).
+ *
+ * \param layout Layout to calculate rank.
+ * \return Requsted rank.
+ */
+template <typename Shape, typename Strides>
+__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout<Shape, Strides>& layout)
+{
+    return Shape::Size();
+}
+
+/**
+ * \brief Get tuple rank (num elements in tuple).
+ *        Return 1 if scalar passed.
+ *
+ * \param tuple Tuple to calculate rank.
+ * \return Requsted rank.
+ */
+template <typename... Dims>
+__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& tuple)
+{
+    return Tuple<Dims...>::Size();
+}
+
+/**
+ * \private
+ */
+template <index_t IDim>
+__host__ __device__ constexpr index_t rank(const Number<IDim>&)
+{
+    return 1;
+}
+
+/**
+ * \private
+ */
+__host__ __device__ constexpr index_t rank(const index_t&) { return 1; }
+
+/**
+ * \brief Hierarchical rank.
+ *
+ * \tparam Idxs Indexes to lookup.
+ * \param elem Element to lookup.
+ * \return Requsted rank.
+ */
+template <index_t... Idxs, typename T>
+__host__ __device__ constexpr auto rank(const T& elem)
+{
+    return rank(get<Idxs...>(elem));
+}
+
+// depth
+/**
+ * \brief Get depth of the layout shape (return 0 if scalar).
+ *
+ * \param layout Layout to calculate depth.
+ * \return Requsted depth.
+ */
+template <typename Shape, typename Strides>
+__host__ __device__ constexpr auto depth(const Layout<Shape, Strides>& layout)
+{
+    return TupleDepth(layout.GetShape());
+}
+
+/**
+ * \brief Get depth of the tuple. (return 0 if scalar)
+ *
+ * \param tuple Tuple to calculate depth.
+ * \return Requsted depth.
+ */
+template <typename... Dims>
+__host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)
+{
+    return TupleDepth(tuple);
+}
+
+/**
+ * \private
+ */
+template <index_t IDim>
+__host__ __device__ constexpr index_t depth(const Number<IDim>&)
+{
+    return 0;
+}
+
+/**
+ * \private
+ */
+__host__ __device__ constexpr index_t depth(const index_t&) { return 0; }
+
+/**
+ * \brief Hierarchical depth.
+ *
+ * \tparam Idxs Indexes to lookup.
+ * \param elem Element to lookup.
+ * \return Requsted depth.
+ */
+template <index_t... Idxs, typename T>
+__host__ __device__ constexpr auto depth(const T& elem)
+{
+    return depth(get<Idxs...>(elem));
+}
+
+/**
+ * \brief Get Layout strides.
+ *
+ * \param layout Layout to get strides.
+ * \return Requsted strides.
+ */
+template <typename Shape, typename Strides>
+__host__ __device__ constexpr auto stride(const Layout<Shape, Strides>& layout)
+{
+    return layout.GetStrides();
+}
+
+/**
+ * \brief Get Layout shape.
+ *
+ * \param layout Layout to get shape.
+ * \return Requsted shape.
+ */
+template <typename Shape, typename Strides>
+__host__ __device__ constexpr auto shape(const Layout<Shape, Strides>& layout)
+{
+    return layout.GetShape();
+}
+
+} // namespace wrapper
+} // namespace ck
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4aaa5fcfa5..b325a3a7f8 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -149,6 +149,7 @@ add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
 add_subdirectory(conv_tensor_rearrange)
 add_subdirectory(transpose)
+add_subdirectory(wrapper)
 if(GPU_TARGETS MATCHES "gfx11")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt
new file mode 100644
index 0000000000..e25ef176dd
--- /dev/null
+++ b/test/wrapper/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_gtest_executable(test_layout test_layout.cpp)
+target_link_libraries(test_layout PRIVATE utility)
diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp
new file mode 100644
index 0000000000..7d09696fbb
--- /dev/null
+++ b/test/wrapper/test_layout.cpp
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/wrapper/layout.hpp"
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+
+class TestWrapperLayout : public ::testing::Test
+{
+    protected:
+    static constexpr auto I0 = ck::Number<0>{};
+    static constexpr auto I1 = ck::Number<1>{};
+
+    template <typename Desc,
+              typename Desc1d,
+              typename LayoutRuntime,
+              typename LayoutCompiletime,
+              typename Idxs>
+    void Run(Desc& desc,
+             Desc1d& desc_1d,
+             LayoutRuntime& layout_runtime,
+             LayoutCompiletime& layout_compiletime,
+             const std::vector<Idxs>& idxs)
+    {
+        // 1d check
+        EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
+        // Check layout compiletime and runtime result consistency
+        EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
+
+        for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
+        {
+            const ck::index_t layout_runtime_offset_1d     = layout_runtime(ck::make_tuple(i));
+            const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
+            const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
+            EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
+            EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
+        }
+        // size(layout)-d check, don't check if access is hierarchical
+        if constexpr(!IsNestedTuple(Idxs{}))
+        {
+            ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
+                EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
+                EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
+                          ck::wrapper::size<d>(layout_compiletime));
+            });
+        }
+        for(const auto idx : idxs)
+        {
+            const ck::index_t layout_runtime_offset     = layout_runtime(idx);
+            const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
+            const ck::index_t desc_offset =
+                desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
+            EXPECT_EQ(layout_runtime_offset, desc_offset);
+            EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
+        }
+    }
+};
+
+TEST_F(TestWrapperLayout, 2d)
+{
+    // dims:(4, 3) strides:(1, 4)
+    constexpr ck::index_t d1 = 4;
+    constexpr ck::index_t d0 = 3;
+    constexpr ck::index_t s1 = 1;
+    constexpr ck::index_t s0 = 4;
+    const auto desc =
+        ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
+                                         ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
+    // Reverse due to column major
+    const auto desc_1d = transform_tensor_descriptor(
+        desc,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
+        ck::make_tuple(ck::Sequence<1, 0>{}),
+        ck::make_tuple(ck::Sequence<0>{}));
+    const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
+    const auto layout_compiletime =
+        ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
+    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
+
+    for(ck::index_t h = 0; h < d1; h++)
+    {
+        for(ck::index_t w = 0; w < d0; w++)
+        {
+            idxs.emplace_back(h, w);
+        }
+    }
+
+    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
+}
+
+TEST_F(TestWrapperLayout, 3d_nested)
+{
+    // dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
+    constexpr ck::index_t d3 = 2;
+    constexpr ck::index_t d2 = 3;
+    constexpr ck::index_t d1 = 4;
+    constexpr ck::index_t d0 = 3;
+    constexpr ck::index_t s3 = 2;
+    constexpr ck::index_t s2 = 4;
+    constexpr ck::index_t s1 = 12;
+    constexpr ck::index_t s0 = 48;
+    const auto desc          = ck::make_naive_tensor_descriptor(
+        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
+        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
+    // Reverse due to column major
+    const auto desc_1d = transform_tensor_descriptor(
+        desc,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
+        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
+        ck::make_tuple(ck::Sequence<0>{}));
+    const auto desc_3d = transform_tensor_descriptor(
+        desc,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
+                       ck::make_pass_through_transform(d1),
+                       ck::make_pass_through_transform(d2)),
+        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
+        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
+    const auto layout_runtime =
+        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
+                                 ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
+    const auto layout_compiletime = ck::wrapper::make_layout(
+        ck::make_tuple(
+            ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
+        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
+                       ck::Number<s1>{},
+                       ck::Number<s0>{}));
+    std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;
+
+    for(ck::index_t d = 0; d < d2 * d3; d++)
+    {
+        for(ck::index_t h = 0; h < d1; h++)
+        {
+            for(ck::index_t w = 0; w < d0; w++)
+            {
+                idxs_3d.emplace_back(d, h, w);
+            }
+        }
+    }
+    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
+
+    // Check also 4d iteration
+    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;
+
+    for(ck::index_t e = 0; e < d3; e++)
+    {
+        for(ck::index_t d = 0; d < d2; d++)
+        {
+            for(ck::index_t h = 0; h < d1; h++)
+            {
+                for(ck::index_t w = 0; w < d0; w++)
+                {
+                    idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
+                }
+            }
+        }
+    }
+    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
+}
+
+TEST_F(TestWrapperLayout, 2d_nested)
+{
+    // dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
+    constexpr ck::index_t d3 = 2;
+    constexpr ck::index_t d2 = 3;
+    constexpr ck::index_t d1 = 4;
+    constexpr ck::index_t d0 = 3;
+    constexpr ck::index_t s3 = 2;
+    constexpr ck::index_t s2 = 4;
+    constexpr ck::index_t s1 = 48;
+    constexpr ck::index_t s0 = 12;
+    const auto desc          = ck::make_naive_tensor_descriptor(
+        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
+        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
+    // Reverse due to column major
+    const auto desc_1d = transform_tensor_descriptor(
+        desc,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
+        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
+        ck::make_tuple(ck::Sequence<0>{}));
+    const auto desc_2d = transform_tensor_descriptor(
+        desc,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
+                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
+        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
+        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
+    const auto layout_runtime =
+        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
+                                 ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
+    const auto layout_compiletime = ck::wrapper::make_layout(
+        ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
+                       ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
+        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
+                       ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
+    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
+
+    for(ck::index_t h = 0; h < d2 * d3; h++)
+    {
+        for(ck::index_t w = 0; w < d0 * d1; w++)
+        {
+            idxs_2d.emplace_back(h, w);
+        }
+    }
+    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
+    // Check also 4d iteration
+    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
+        idxs_4d;
+
+    for(ck::index_t e = 0; e < d3; e++)
+    {
+        for(ck::index_t d = 0; d < d2; d++)
+        {
+            for(ck::index_t h = 0; h < d1; h++)
+            {
+                for(ck::index_t w = 0; w < d0; w++)
+                {
+                    idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w));
+                }
+            }
+        }
+    }
+    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
+}
+
+TEST_F(TestWrapperLayout, 3d_double_nested)
+{
+    // dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
+    constexpr ck::index_t d4 = 2;
+    constexpr ck::index_t d3 = 2;
+    constexpr ck::index_t d2 = 3;
+    constexpr ck::index_t d1 = 4;
+    constexpr ck::index_t d0 = 3;
+    constexpr ck::index_t s4 = 2;
+    constexpr ck::index_t s3 = 4;
+    constexpr ck::index_t s2 = 8;
+    constexpr ck::index_t s1 = 96;
+    constexpr ck::index_t s0 = 24;
+    const auto desc          = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
+                                                                      ck::Number<d3>{},
+                                                                      ck::Number<d2>{},
+                                                                      ck::Number<d1>{},
+                                                                      ck::Number<d0>{}),
+                                                       ck::make_tuple(ck::Number<s4>{},
+                                                                      ck::Number<s3>{},
+                                                                      ck::Number<s2>{},
+                                                                      ck::Number<s1>{},
+                                                                      ck::Number<s0>{}));
+    // Reverse due to column major
+    const auto desc_1d = transform_tensor_descriptor(
+        desc,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
+        ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
+        ck::make_tuple(ck::Sequence<0>{}));
+    const auto desc_3d = transform_tensor_descriptor(
+        desc,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
+                       ck::make_pass_through_transform(d2),
+                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
+        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
+        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
+    const auto desc_2d = transform_tensor_descriptor(
+        desc_3d,
+        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
+                       ck::make_pass_through_transform(d1 * d0)),
+        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
+        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
+    const auto layout_runtime = ck::wrapper::make_layout(
+        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
+        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, s3), s2), ck::make_tuple(s1, s0)));
+    const auto layout_compiletime = ck::wrapper::make_layout(
+        ck::make_tuple(
+            ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
+            ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
+        ck::make_tuple(
+            ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
+            ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
+    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
+
+    for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
+    {
+        for(ck::index_t w = 0; w < d0 * d1; w++)
+        {
+            idxs_2d.emplace_back(h, w);
+        }
+    }
+    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
+    // Check also 3d iteration
+    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;
+
+    for(ck::index_t d = 0; d < d3 * d4; d++)
+    {
+        for(ck::index_t h = 0; h < d2; h++)
+        {
+            for(ck::index_t w = 0; w < d1 * d0; w++)
+            {
+                idxs_3d.emplace_back(ck::make_tuple(d, h), w);
+            }
+        }
+    }
+    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
+    // Check also 5d iteration
+    std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
+                          ck::Tuple<ck::index_t, ck::index_t>>>
+        idxs_5d;
+
+    for(ck::index_t f = 0; f < d4; f++)
+    {
+        for(ck::index_t e = 0; e < d3; e++)
+        {
+            for(ck::index_t d = 0; d < d2; d++)
+            {
+                for(ck::index_t h = 0; h < d1; h++)
+                {
+                    for(ck::index_t w = 0; w < d0; w++)
+                    {
+                        idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
+                                             ck::make_tuple(h, w));
+                    }
+                }
+            }
+        }
+    }
+    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
+}
+
+TEST(TestLayoutHelpers, SizeAndGet)
+{
+    // dims:(((2, 2), 3), (4, 3))
+    constexpr ck::index_t d4  = 2;
+    constexpr ck::index_t d3  = 2;
+    constexpr ck::index_t d2  = 3;
+    constexpr ck::index_t d1  = 4;
+    constexpr ck::index_t d0  = 3;
+    const auto layout_runtime = ck::wrapper::make_layout(
+        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
+    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
+        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
+        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
+
+    // Size of layout
+    EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
+    EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);
+
+    // Size of dims
+    EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
+    EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
+    EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
+    EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);
+
+    // Access through new layout (using get with layout object)
+    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
+    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
+
+    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
+    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
+              d4);
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
+              d3);
+
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
+
+    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
+    EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
+    EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
+}
+
+TEST(TestLayoutHelpers, DepthAndRank)
+{
+    // dims:(((2, 2), 3), (4, 3))
+    constexpr ck::index_t d4  = 2;
+    constexpr ck::index_t d3  = 2;
+    constexpr ck::index_t d2  = 3;
+    constexpr ck::index_t d1  = 4;
+    constexpr ck::index_t d0  = 3;
+    const auto layout_runtime = ck::wrapper::make_layout(
+        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
+    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
+        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
+        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
+
+    EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
+    EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
+    EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
+    // Check for integer
+    EXPECT_EQ(ck::wrapper::depth(d0), 0);
+
+    EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
+    EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);
+    EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
+    // Check for integer
+    EXPECT_EQ(ck::wrapper::rank(d0), 1);
+}
+
+TEST(TestLayoutHelpers, ShapeAndStrides)
+{
+    // dims:(((2, 2), 3), (4, 3))
+    constexpr ck::index_t d4     = 2;
+    constexpr ck::index_t d3     = 2;
+    constexpr ck::index_t d2     = 3;
+    constexpr ck::index_t d1     = 4;
+    constexpr ck::index_t d0     = 3;
+    constexpr ck::index_t s4     = 2;
+    constexpr ck::index_t s3     = 4;
+    constexpr ck::index_t s2     = 8;
+    constexpr ck::index_t s1     = 96;
+    constexpr ck::index_t s0     = 24;
+    const auto shape_compiletime = ck::make_tuple(
+        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
+        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
+    const auto strides_compiletime = ck::make_tuple(
+        ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
+        ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
+    const auto shape_runtime =
+        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
+    const auto strides_runtime =
+        ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
+    const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);
+    const auto layout_compiletime =
+        ck::wrapper::make_layout(shape_compiletime, strides_compiletime);
+
+    constexpr bool check_compiletime_shape =
+        std::is_same_v<std::remove_const<decltype(shape_compiletime)>::type,
+                       decltype(shape(layout_compiletime))>;
+    constexpr bool check_compiletime_strides =
+        std::is_same_v<std::remove_const<decltype(strides_compiletime)>::type,
+                       decltype(stride(layout_compiletime))>;
+    constexpr bool check_runtime_shape =
+        std::is_same_v<std::remove_const<decltype(shape_runtime)>::type,
+                       decltype(shape(layout_runtime))>;
+    constexpr bool check_runtime_strides =
+        std::is_same_v<std::remove_const<decltype(strides_runtime)>::type,
+                       decltype(stride(layout_runtime))>;
+    EXPECT_TRUE(check_compiletime_shape);
+    EXPECT_TRUE(check_compiletime_strides);
+    EXPECT_TRUE(check_runtime_shape);
+    EXPECT_TRUE(check_runtime_strides);
+}
+
+TEST(TestLayoutHelpers, Hierarchical)
+{
+    // dims:(((2, 2), 3), (4, 3))
+    constexpr ck::index_t d4 = 2;
+    constexpr ck::index_t d3 = 2;
+    constexpr ck::index_t d2 = 3;
+    constexpr ck::index_t d1 = 4;
+    constexpr ck::index_t d0 = 3;
+    const auto runtime_shape =
+        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
+    const auto layout_runtime     = ck::wrapper::make_layout(runtime_shape);
+    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
+        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
+        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
+
+    EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
+    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
+    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);
+
+    EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
+    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
+    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);
+
+    EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);
+    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);
+    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);
+
+    EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
+}

From 6896c3b0ae3da9adfa3cd4979621cee642257fc3 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:48:10 -0800
Subject: [PATCH 03/18] Fix the CI builds using clang++ directly. (#1087)

* turn on -O3 compiler flag explicitly

* change cmake syntax for CI

* modify cmake line breaks in jenkinsfile
---
 Jenkinsfile | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 8e67f9cc39..d5fbff288f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -768,8 +768,15 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx908 || gfx90a") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
+                                         -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                                         -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " \
+                                         -DCMAKE_CXX_FLAGS=" -O3 " """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+                                           -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ 
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -784,8 +791,12 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx908 || gfx90a") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+                                           -DGPU_TARGETS="gfx908;gfx90a" \
+                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ 
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -800,8 +811,12 @@ pipeline {
                     }
                     agent{ label rocmnode("navi21") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """ 
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+                                           -DGPU_TARGETS="gfx1030" \
+                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -816,8 +831,12 @@ pipeline {
                     }
                     agent{ label rocmnode("navi32") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+                                           -DGPU_TARGETS="gfx1101" \
+                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')

From 957281ce45025f674c75ee3e318257d9df3a52d7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 7 Dec 2023 10:32:04 -0700
Subject: [PATCH 04/18] Bump rocm-docs-core from 0.29.0 to 0.30.1 in
 /docs/sphinx (#1090)

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.29.0 to 0.30.1.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.29.0...v0.30.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/sphinx/requirements.in  | 2 +-
 docs/sphinx/requirements.txt | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index f5ee431e7d..0a65ffc81a 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==0.29.0
+rocm-docs-core==0.30.1
 sphinxcontrib-bibtex==2.6.1
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 0442ae9a2b..01cb32e714 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -96,7 +96,9 @@ pygments==2.14.0
     #   pydata-sphinx-theme
     #   sphinx
 pyjwt[crypto]==2.6.0
-    # via pygithub
+    # via
+    #   pygithub
+    #   pyjwt
 pynacl==1.5.0
     # via pygithub
 pytz==2023.3.post1
@@ -111,7 +113,7 @@ requests==2.28.2
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.29.0
+rocm-docs-core==0.30.1
     # via -r requirements.in
 six==1.16.0
     # via

From 33600202c644f64d3596d6340466982895772822 Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Thu, 7 Dec 2023 13:39:40 -0600
Subject: [PATCH 05/18] remove imcomplete transpose profiler (#1088)

Co-authored-by: Jing Zhang <jizha@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
---
 profiler/src/profile_transpose.cpp | 85 ------------------------------
 1 file changed, 85 deletions(-)
 delete mode 100644 profiler/src/profile_transpose.cpp

diff --git a/profiler/src/profile_transpose.cpp b/profiler/src/profile_transpose.cpp
deleted file mode 100644
index c239a520d1..0000000000
--- a/profiler/src/profile_transpose.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "profiler/profile_transpose_impl.hpp"
-#include "profiler_operation_registry.hpp"
-
-enum struct MatrixLayout
-{
-    NCDHW, // 0
-    NCHWD, // 1
-};
-
-enum struct DataType
-{
-    F32_F32_F32_F32_F32, // 0
-    F16_F16_F16_F16_F16, // 1
-};
-
-#define OP_NAME "transpose"
-#define OP_DESC "Transpose"
-
-int profile_transpose(int argc, char* argv[])
-{
-    if(argc != 15)
-    {
-        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
-        printf("arg2: data type (0: fp32; 1: fp16)\n");
-        // printf("arg3: matrix layout (NCDHW -> NDCHW);\n");
-        printf("arg4: verification (0: no; 1: yes)\n");
-        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg6: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: time kernel (0=no, 1=yes)\n");
-        printf("arg8 to 13: N, C, D, H, W\n");
-        exit(1);
-    }
-
-    const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
-    // const auto layout          = static_cast<MatrixLayout>(std::stoi(argv[3]));
-    const bool do_verification   = std::stoi(argv[3]);
-    const int init_method        = std::stoi(argv[4]);
-    const bool do_log            = std::stoi(argv[5]);
-    const bool time_kernel       = std::stoi(argv[6]);
-    std::vector<index_t> lengths = std::stoi(argv[7]);
-
-    /**const int N = std::stoi(argv[7]);
-    const int C = std::stoi(argv[8]);
-    const int D = std::stoi(argv[9]);
-    const int H = std::stoi(argv[10]);
-    const int W = std::stoi(argv[11]);**/
-
-    using F32 = float;
-    using F16 = ck::half_t;
-
-    auto profile = [&](auto a_type, auto b_type) {
-        using ADataType = decltype(a_type);
-        using BDataType = decltype(b_type);
-
-        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
-            do_verification, init_method, do_log, time_kernel, lengths);
-
-        return pass ? 0 : 1;
-    };
-
-    if(data_type == GemmDataType::F32_F32_F32_F32_F32)
-    {
-        return profile(F32{}, F32{});
-    }
-    else if(data_type == GemmDataType::F16_F16_F16_F16_F16)
-    {
-        return profile(F16{}, F16{});
-    }
-    else
-    {
-        std::cout << "this data_type & layout is not implemented" << std::endl;
-
-        return 1;
-    }
-}
-
-REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_transpose);

From d939411dae1aa0e09fecb466cfdc1e3044085720 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 7 Dec 2023 15:59:34 -0800
Subject: [PATCH 06/18] Switch from ROCmSoftwarePlatform to ROCm org (#1091)

* switch from ROCmSoftwarePlatform to ROCm org

* replace ROCmSoftwarePlatform with ROCm in few more places
---
 CITATION.cff                            |  4 ++--
 Jenkinsfile                             | 10 +++++-----
 README.md                               |  2 +-
 dev-requirements.txt                    |  4 ++--
 include/ck/host_utility/device_prop.hpp |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CITATION.cff b/CITATION.cff
index d35fe9e587..3813d63812 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -59,9 +59,9 @@ authors:
     family-names: Zhou
   - given-names: Jianfeng
     family-names: Yan
-repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
+repository-code: 'https://github.com/ROCm/composable_kernel'
 abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel progarmming languages, like HIP C++.
 keywords:
   - 'CK, Composable Kernel, Tensor Coordinate Transformation'
 license: MIT
-license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
+license-url: https://github.com/ROCm/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
diff --git a/Jenkinsfile b/Jenkinsfile
index d5fbff288f..8f661e4780 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -302,7 +302,7 @@ def buildHipClangJob(Map conf=[:]){
         def retimage
         (retimage, image) = getDockerImage(conf)
 
-        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
             withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
                 timeout(time: 5, unit: 'HOURS')
                 {
@@ -355,7 +355,7 @@ def runCKProfiler(Map conf=[:]){
         def variant = env.STAGE_NAME
         def retimage
 
-        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
             try {
                 (retimage, image) = getDockerImage(conf)
                 withDockerContainer(image: image, args: dockerOpts) {
@@ -487,7 +487,7 @@ def Build_CK(Map conf=[:]){
         def retimage
         def navi_node = 0
 
-        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
             try {
                 (retimage, image) = getDockerImage(conf)
                 withDockerContainer(image: image, args: dockerOpts) {
@@ -553,7 +553,7 @@ def Build_CK(Map conf=[:]){
                         sh """#!/bin/bash
                             rm -rf "${params.hipTensor_branch}".zip
                             rm -rf hipTensor-"${params.hipTensor_branch}"
-                            wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
+                            wget https://github.com/ROCm/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
                             unzip -o "${params.hipTensor_branch}".zip
                         """
                         dir("hipTensor-${params.hipTensor_branch}"){
@@ -605,7 +605,7 @@ def process_results(Map conf=[:]){
     def variant = env.STAGE_NAME
     def retimage
 
-    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
         try {
             (retimage, image) = getDockerImage(conf)
         }
diff --git a/README.md b/README.md
index e5a20f143f..7679607e69 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
 3. Clone CK source code from the GitHub repository and start the build:
 
     ```bash
-    git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
+    git clone https://github.com/ROCm/composable_kernel.git && \
     cd composable_kernel && \
     mkdir build && \
     cd build
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 9e7b9f01e1..d5d91f8c27 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,3 +1,3 @@
-ROCmSoftwarePlatform/rocm-recipes
+ROCm/rocm-recipes
 RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build
-danmar/cppcheck@2.9
\ No newline at end of file
+danmar/cppcheck@2.9
diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
index be2c2395fc..e8dabc9973 100644
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -26,7 +26,7 @@ inline std::string get_device_name()
     }
     const std::string raw_name(props.gcnArchName);
 
-    // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
+    // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
     static std::map<std::string, std::string> device_name_map = {
         {"Ellesmere", "gfx803"},
         {"Baffin", "gfx803"},

From f83698489109205dfe1780ce63c032b2a27e7434 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Fri, 8 Dec 2023 11:07:42 +0100
Subject: [PATCH 07/18] Support broadcast for bias in grouped conv fwd (#1081)

* Support broadcast for bias in grouped conv fwd

* Fix comment

* Comment fixes

* Remove GK layout
---
 ...rouped_conv_fwd_scaleadd_scaleadd_relu.inc |  16 +-
 example/62_conv_fwd_activ/CMakeLists.txt      |   2 +
 ...aleadd_scaleadd_relu_bcasted_bias_fp16.cpp | 294 ++++++++++++++++++
 .../run_convnd_fwd_activ_example.inc          |   2 +-
 ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp |  32 +-
 ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp |   3 +-
 .../gpu/device/tensor_layout.hpp              |   6 -
 .../transform_conv_fwd_to_gemm.hpp            |  15 +-
 .../device_operation_instance_factory.hpp     |   6 +-
 ...olution_forward_scaleadd_scaleadd_relu.hpp |  12 +-
 ...elu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp |   8 +-
 ...relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp |   8 +-
 ...relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp |   8 +-
 ...elu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp |   8 +-
 .../conv2d_fwd/conv2d_quantization_common.hpp |   6 +-
 15 files changed, 371 insertions(+), 55 deletions(-)
 create mode 100644 example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp

diff --git a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
index c72c72971d..e8f5529520 100644
--- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
+++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
@@ -16,6 +16,7 @@
 using InLayout             = ck::tensor_layout::convolution::NDHWGC;
 using WeiLayout            = ck::tensor_layout::convolution::GKZYXC;
 using OutLayout            = ck::tensor_layout::convolution::NDHWGK;
+using BiasLayout           = ck::tensor_layout::convolution::G_K;
 using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
 using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
 
@@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
     std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
     std::array<ck::index_t, 6> out_strides{
         K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
+    // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
+    std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
+    std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
 
     std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
     std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
@@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
     SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
     SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
     SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
-    SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K);
+    SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
         NumDimSpatial,
         InLayout,
         WeiLayout,
-        ck::Tuple<OutLayout, OutLayout>,
+        ck::Tuple<OutLayout, BiasLayout>,
         OutLayout,
         InDataType,
         WeiDataType,
@@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
                                         in_strides,
                                         wei_lengths,
                                         wei_strides,
-                                        {out_lengths, out_lengths},
-                                        {out_strides, out_strides},
+                                        {out_lengths, bias_lengths},
+                                        {out_strides, bias_strides},
                                         out_lengths,
                                         out_strides,
                                         filter_strides,
@@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
                                         in_strides,
                                         wei_lengths,
                                         wei_strides,
-                                        {out_lengths, out_lengths},
-                                        {out_strides, out_strides},
+                                        {out_lengths, bias_lengths},
+                                        {out_strides, bias_strides},
                                         out_lengths,
                                         out_strides,
                                         filter_strides,
diff --git a/example/62_conv_fwd_activ/CMakeLists.txt b/example/62_conv_fwd_activ/CMakeLists.txt
index bb95602416..d1f26bbfe1 100644
--- a/example/62_conv_fwd_activ/CMakeLists.txt
+++ b/example/62_conv_fwd_activ/CMakeLists.txt
@@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
       # ScaleAdd ScaleAdd Relu
       add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
       add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
+      add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
+      add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
    set(target 1)
  endif()
 endforeach()
diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
new file mode 100644
index 0000000000..196636f8b5
--- /dev/null
+++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+constexpr ck::index_t NDimSpatial = 3;
+using InDataType                  = ck::half_t;
+using WeiDataType                 = ck::half_t;
+using AccDataType                 = float;
+using CShuffleDataType            = ck::half_t;
+using OutDataType                 = ck::half_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+
+using BiasLayout = ck::tensor_layout::convolution::G_K;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <typename OutElementOp>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<OutLayout, BiasLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<OutDataType, OutDataType>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
+
+using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
+
+namespace {
+// Use custom implementation to pass two more tensors for post op
+template <ck::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd(bool do_verification,
+                          int init_method,
+                          bool time_kernel,
+                          const ck::utils::conv::ConvParam& conv_param,
+                          const HostTensorDescriptor& in_g_n_c_wis_desc,
+                          const HostTensorDescriptor& wei_g_k_c_xs_desc,
+                          const HostTensorDescriptor& out_g_n_k_wos_desc,
+                          const InElementOp& in_element_op,
+                          const WeiElementOp& wei_element_op,
+                          const OutElementOp& out_element_op)
+{
+    constexpr ck::index_t NumDs = 2;
+    const ck::index_t G         = out_g_n_k_wos_desc.GetLengths()[0];
+    const ck::index_t K         = out_g_n_k_wos_desc.GetLengths()[2];
+
+    // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
+    std::array<ck::index_t, NDimSpatial + 3> bias_g_k_lengths;
+    std::array<ck::index_t, NDimSpatial + 3> bias_g_k_strides;
+    // Fill other lenghts than G,K with 1 and strides with 0
+    bias_g_k_lengths.fill(1);
+    bias_g_k_strides.fill(0);
+    bias_g_k_lengths[0]              = G;
+    bias_g_k_lengths[2]              = K;
+    bias_g_k_strides[0]              = K; // stride to G
+    bias_g_k_strides[2]              = 1; // stride to K
+    const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
+
+    //  y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
+    Tensor<InDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
+                                                        Tensor<OutDataType>(broadcasted_bias_desc)};
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;
+    std::cout << "z_tensor: " << d_tensors[0].mDesc << std::endl;
+    std::cout << "bias_tensor: " << d_tensors[1].mDesc << std::endl;
+
+    // Make sure that we allocated only G * K values for bias
+    assert(static_cast<ck::index_t>(d_tensors[1].mData.size()) == G * K);
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
+        d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
+        d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem z_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
+    DeviceMem bias_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    z_buf.ToDevice(d_tensors[0].mData.data());
+    bias_buf.ToDevice(d_tensors[1].mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    const std::array<const void*, NumDs> ds = {z_buf.GetDeviceBuffer(), bias_buf.GetDeviceBuffer()};
+
+    auto conv     = DeviceConvNDFwdInstance{};
+    auto invoker  = conv.MakeInvoker();
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      ds,
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
+                                          e_g_n_k_wos_lengths, bias_g_k_lengths},
+                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
+                                          e_g_n_k_wos_strides, bias_g_k_strides},
+                                      e_g_n_k_wos_lengths,
+                                      e_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("The device op with the specified compilation parameters does "
+                                 "not support this convolution problem.");
+    }
+
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = conv_param.GetFlops() + G * K +
+                       conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
+    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
+                            G * K * sizeof(OutDataType) + conv_param.GetOutputByte<OutDataType>();
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+    float gb_per_sec = num_btype / 1.E6 / avg_time;
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        auto ref_conv =
+            ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                         InDataType,
+                                                         WeiDataType,
+                                                         OutDataType,
+                                                         InElementOp,
+                                                         WeiElementOp,
+                                                         OutElementOp,
+                                                         0, /*Num A Elementwise Tensors*/
+                                                         0, /*Num B Elementwise Tensors*/
+                                                         NumDs>();
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(in,
+                                                  wei,
+                                                  out_host,
+                                                  conv_param.conv_filter_strides_,
+                                                  conv_param.conv_filter_dilations_,
+                                                  conv_param.input_left_pads_,
+                                                  conv_param.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op,
+                                                  {},
+                                                  {},
+                                                  d_tensors);
+
+        ref_invoker.Run(ref_argument);
+
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
+    }
+
+    return true;
+}
+
+} // namespace
+
+#include "run_convnd_fwd_activ_example.inc"
+
+int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
diff --git a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc
index 7c20c01066..aa547c870a 100644
--- a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc
+++ b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc
@@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
     // Following shapes are selected to avoid overflow. Expect inf in case of
     // size increase for some elementwise ops.
     ck::utils::conv::ConvParam conv_param{
-        3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+        3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
 
     if(argc == 1)
     {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index 26224b5dec..4afef85d8c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -357,15 +357,17 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         return out_gemmm_gemmn_desc;
     }
 
+    // Shape of Ds and E must be aligned. Strides can be different.
+    // Pass e_g_n_k_wos_lengths for logical broadcast.
     static auto MakeDsGridDescriptor_M_N(
-        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
         const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides)
     {
         return generate_tuple(
             [&](auto i) {
                 using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
 
-                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(ds_g_n_k_wos_lengths[i],
+                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(e_g_n_k_wos_lengths,
                                                                   ds_g_n_k_wos_strides[i]);
             },
             Number<NumDTensor>{});
@@ -569,7 +571,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
 
                 // D desc
                 ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N<DLayout>(
-                    ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]);
+                    e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]);
             });
             compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0];
 
@@ -916,8 +918,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                          is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
                          is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
                          is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
-                         is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> ||
-                         is_same_v<DLayout, ctc::G_K>)
+                         is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
             {
                 const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
 
@@ -925,6 +926,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 {
                     valid = false;
                 }
+
+                if constexpr(is_same_v<DLayout, ctc::G_K>)
+                {
+                    // G and K must be the same
+                    if(arg.ds_g_n_k_wos_lengths_[i][0] != arg.e_g_n_k_wos_lengths_[0] ||
+                       arg.ds_g_n_k_wos_lengths_[i][2] != arg.e_g_n_k_wos_lengths_[2])
+                    {
+                        valid = false;
+                    }
+                }
+                else
+                {
+                    // E and D must have the same shape
+                    for(index_t d = 0; d < NDimSpatial + 3; d++)
+                    {
+                        if(arg.ds_g_n_k_wos_lengths_[i][d] != arg.e_g_n_k_wos_lengths_[d])
+                        {
+                            valid = false;
+                        }
+                    }
+                }
             }
             else
             {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
index 80a5d0e97a..0050a5b281 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
@@ -631,8 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
                          is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
                          is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
                          is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
-                         is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> ||
-                         is_same_v<DLayout, ctc::G_K>)
+                         is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
             {
                 const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
 
diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
index b2d141fd61..ecc71ba2f2 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -308,12 +308,6 @@ struct GNDHWK : public BaseTensorLayout
     static constexpr const char* name = "GNDHWK";
 };
 
-// for output bias
-struct GK : public BaseTensorLayout
-{
-    static constexpr const char* name = "GK";
-};
-
 // output tensor
 // packed NWGK/NHWGK/NDHWGK
 struct NWGK : public BaseTensorLayout
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
index 6f546f1d6d..e2f75142d4 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
@@ -522,22 +522,21 @@ struct TransformConvFwdToGemm
 
     // for output bias
     template <typename CLayout,
-              typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::GK> ||
-                                          is_same_v<CLayout, tensor_layout::convolution::G_K>,
+              typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_K>,
                                       bool>::type = false>
-    static auto
-    MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
+    static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                                    const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
     {
-        const index_t N = c_g_n_k_wos_lengths[1];
-        const index_t K = c_g_n_k_wos_lengths[2];
+        const index_t N       = c_g_n_k_wos_lengths[1];
+        const index_t K       = c_g_n_k_wos_lengths[2];
+        const index_t KStride = c_g_n_k_wos_strides[2];
 
         const index_t NHoWo =
             N * ck::accumulate_n<index_t>(
                     c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
 
         const auto out_gemmm_gemmn_desc =
-            make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1));
+            make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride));
 
         return out_gemmm_gemmn_desc;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 89b8b9667f..dc47c7ec1a 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -86,9 +86,9 @@ using NHWGK  = ck::tensor_layout::convolution::NHWGK;
 using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
 
 //
-using GK          = ck::tensor_layout::convolution::G_K;
-using GK_Tuple    = ck::Tuple<GK>;
-using GK_GK_Tuple = ck::Tuple<GK, GK>;
+using G_K         = ck::tensor_layout::convolution::G_K;
+using GK_Tuple    = ck::Tuple<G_K>;
+using GK_GK_Tuple = ck::Tuple<G_K, G_K>;
 
 // pointwise functor
 using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
index dc9f44dc86..efb6266426 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
@@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 BF16,
                                                                 BF16,
@@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 F16,
                                                                 F16,
@@ -59,7 +59,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 F32,
                                                                 F32,
@@ -75,7 +75,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 int8_t,
                                                                 int8_t,
@@ -130,7 +130,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
         if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
-                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
+                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK> &&
+                     DLayouts::Size() == 2 && is_same_v<tuple_element_t<0, DLayouts>, NDHWGK> &&
+                     is_same_v<tuple_element_t<1, DLayouts>, G_K>)
         {
 #ifdef CK_ENABLE_FP32
             if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
index c6627a4825..7d2df94ad7 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 BF16,
                                                                 BF16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3,
                                                                           NDHWGC,
                                                                           GKZYXC,
-                                                                          ck::Tuple<NDHWGK, NDHWGK>,
+                                                                          ck::Tuple<NDHWGK, G_K>,
                                                                           NDHWGK,
                                                                           ConvFwdDefault>{});
     add_device_operation_instances(
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3,
                                                                           NDHWGC,
                                                                           GKZYXC,
-                                                                          ck::Tuple<NDHWGK, NDHWGK>,
+                                                                          ck::Tuple<NDHWGK, G_K>,
                                                                           NDHWGK,
                                                                           ConvFwd1x1P0>{});
     add_device_operation_instances(
@@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3,
                                                                           NDHWGC,
                                                                           GKZYXC,
-                                                                          ck::Tuple<NDHWGK, NDHWGK>,
+                                                                          ck::Tuple<NDHWGK, G_K>,
                                                                           NDHWGK,
                                                                           ConvFwd1x1S1P0>{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
index 627af24d7b..8a09d03967 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 F16,
                                                                 F16,
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3,
                                                                          NDHWGC,
                                                                          GKZYXC,
-                                                                         ck::Tuple<NDHWGK, NDHWGK>,
+                                                                         ck::Tuple<NDHWGK, G_K>,
                                                                          NDHWGK,
                                                                          ConvFwdDefault>{});
     add_device_operation_instances(
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3,
                                                                          NDHWGC,
                                                                          GKZYXC,
-                                                                         ck::Tuple<NDHWGK, NDHWGK>,
+                                                                         ck::Tuple<NDHWGK, G_K>,
                                                                          NDHWGK,
                                                                          ConvFwd1x1P0>{});
     add_device_operation_instances(
@@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3,
                                                                          NDHWGC,
                                                                          GKZYXC,
-                                                                         ck::Tuple<NDHWGK, NDHWGK>,
+                                                                         ck::Tuple<NDHWGK, G_K>,
                                                                          NDHWGK,
                                                                          ConvFwd1x1S1P0>{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
index 1fd567e360..6966959639 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 F32,
                                                                 F32,
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3,
                                                                          NDHWGC,
                                                                          GKZYXC,
-                                                                         ck::Tuple<NDHWGK, NDHWGK>,
+                                                                         ck::Tuple<NDHWGK, G_K>,
                                                                          NDHWGK,
                                                                          ConvFwdDefault>{});
     add_device_operation_instances(
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3,
                                                                          NDHWGC,
                                                                          GKZYXC,
-                                                                         ck::Tuple<NDHWGK, NDHWGK>,
+                                                                         ck::Tuple<NDHWGK, G_K>,
                                                                          NDHWGK,
                                                                          ConvFwd1x1P0>{});
     add_device_operation_instances(
@@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3,
                                                                          NDHWGC,
                                                                          GKZYXC,
-                                                                         ck::Tuple<NDHWGK, NDHWGK>,
+                                                                         ck::Tuple<NDHWGK, G_K>,
                                                                          NDHWGK,
                                                                          ConvFwd1x1S1P0>{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
index dae292891c..2606f69428 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
@@ -12,7 +12,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                 NDHWGC,
                                                                 GKZYXC,
-                                                                ck::Tuple<NDHWGK, NDHWGK>,
+                                                                ck::Tuple<NDHWGK, G_K>,
                                                                 NDHWGK,
                                                                 int8_t,
                                                                 int8_t,
@@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3,
                                                                           NDHWGC,
                                                                           GKZYXC,
-                                                                          ck::Tuple<NDHWGK, NDHWGK>,
+                                                                          ck::Tuple<NDHWGK, G_K>,
                                                                           NDHWGK,
                                                                           ConvFwdDefault>{});
     add_device_operation_instances(
@@ -35,7 +35,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3,
                                                                           NDHWGC,
                                                                           GKZYXC,
-                                                                          ck::Tuple<NDHWGK, NDHWGK>,
+                                                                          ck::Tuple<NDHWGK, G_K>,
                                                                           NDHWGK,
                                                                           ConvFwd1x1P0>{});
     add_device_operation_instances(
@@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
         device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3,
                                                                           NDHWGC,
                                                                           GKZYXC,
-                                                                          ck::Tuple<NDHWGK, NDHWGK>,
+                                                                          ck::Tuple<NDHWGK, G_K>,
                                                                           NDHWGK,
                                                                           ConvFwd1x1S1P0>{});
 }
diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
index 711314985a..d46fe090b8 100644
--- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp
@@ -22,13 +22,13 @@ using S = ck::Sequence<Is...>;
 using NHWGC       = ck::tensor_layout::convolution::NHWGC;
 using GKYXC       = ck::tensor_layout::convolution::GKYXC;
 using NHWGK       = ck::tensor_layout::convolution::NHWGK;
-using GK          = ck::tensor_layout::convolution::G_K;
+using G_K         = ck::tensor_layout::convolution::G_K;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Relu        = ck::tensor_operation::element_wise::Relu;
 using TanH        = ck::tensor_operation::element_wise::TanH;
 
-using GK_Tuple      = ck::Tuple<GK>;
-using GK_GK_Tuple   = ck::Tuple<GK, GK>;
+using GK_Tuple      = ck::Tuple<G_K>;
+using GK_GK_Tuple   = ck::Tuple<G_K, G_K>;
 using I32_Tuple     = ck::Tuple<int32_t>;
 using F32_Tuple     = ck::Tuple<float>;
 using I32_F32_Tuple = ck::Tuple<int32_t, float>;

From b4dcd5803f1dae92467d39c31f176131ce796735 Mon Sep 17 00:00:00 2001
From: Nicolas Macchioni <nmacchioni@fb.com>
Date: Fri, 8 Dec 2023 11:30:01 -0800
Subject: [PATCH 08/18] Add F8 dtype definition in f16_f8_f16 gemm instances
 (#1092)

---
 .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp   | 1 +
 .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp
index 3c9e03b674..38667ad42b 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp
@@ -16,6 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+using F8 = ck::f8_t;
 using F16 = ck::half_t;
 using F32 = float;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp
index aab0af990d..820404e064 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp
@@ -16,6 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+using F8 = ck::f8_t;
 using F16 = ck::half_t;
 using F32 = float;
 

From f199035b748331901b7e0d58cbcd88e108bdcadd Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Fri, 8 Dec 2023 14:32:37 -0800
Subject: [PATCH 09/18] fix clang format (#1095)

---
 .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp  | 2 +-
 .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp
index 38667ad42b..b3d1e925df 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp
@@ -16,7 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F8 = ck::f8_t;
+using F8  = ck::f8_t;
 using F16 = ck::half_t;
 using F32 = float;
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp
index 820404e064..9c80995949 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp
@@ -16,7 +16,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-using F8 = ck::f8_t;
+using F8  = ck::f8_t;
 using F16 = ck::half_t;
 using F32 = float;
 

From 89ee47460bedd3e028a6240c2c395023cb233f4c Mon Sep 17 00:00:00 2001
From: Bartlomiej Wroblewski <bwroblewski10@gmail.com>
Date: Mon, 11 Dec 2023 17:12:32 +0100
Subject: [PATCH 10/18] Fix IsSupported check in the contraction op (#1066)

Current implementation of IsSupported method in contraction ops does not cover a lot of possible cases in which ScalarPerVector cannot really be used to read A, B or D, or write E.

This PR extends both the regular and multiABD contraction ops with improved checks and also adds new instances with smaller values of ScalarPerVector to support instances that are not supported by other instances.
---
 ..._contraction_multiple_abd_xdl_cshuffle.hpp | 153 ++++++++--------
 ...ce_contraction_multiple_d_xdl_cshuffle.hpp | 163 ++++++++----------
 .../device/impl/device_contraction_utils.hpp  |  87 ++++++++++
 .../device_contraction_instance.hpp           |  24 ++-
 4 files changed, 261 insertions(+), 166 deletions(-)
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
index 29d7a2b949..0c8e11a17b 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
@@ -14,6 +14,7 @@
 #include "ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -500,22 +501,29 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
             // for sanity check of vector memory access
             for(index_t i = 0; i < NumATensor; ++i)
             {
-                a_mz_stride_[i] = a_ms_ks_strides[i][NumDimM - 1];
-                a_kz_stride_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1];
+                as_mz_consecutive_[i] = a_ms_ks_strides[i][NumDimM - 1] == 1;
+                as_kz_consecutive_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1] == 1;
+                as_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths[i], a_ms_ks_strides[i]);
             }
 
             for(index_t i = 0; i < NumBTensor; ++i)
             {
-                b_nz_stride_[i] = b_ns_ks_strides[i][NumDimN - 1];
-                b_kz_stride_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1];
+                bs_nz_consecutive_[i] = b_ns_ks_strides[i][NumDimN - 1] == 1;
+                bs_kz_consecutive_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1] == 1;
+                bs_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths[i], b_ns_ks_strides[i]);
             }
 
             for(index_t i = 0; i < NumDTensor; ++i)
             {
-                ds_nz_stride_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1];
+                ds_nz_consecutive_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
+                ds_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimM, NumDimN>(d_ms_ns_lengths[i], d_ms_ns_strides[i]);
             }
 
-            e_nz_stride_ = e_ms_ns_stride[NumDimM + NumDimN - 1];
+            e_nz_consecutive_  = e_ms_ns_stride[NumDimM + NumDimN - 1] == 1;
+            e_max_write_elems_ = CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_length, e_ms_ns_stride);
         }
 
         // pointers
@@ -545,16 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
         BElementwiseOperation b_element_op_;
         CDEElementwiseOperation cde_element_op_;
 
-        // Strides for the last M/N/K dimensions of A/B/Ds/E
-        //   for sanity check of vector load/store
-        std::array<index_t, NumATensor> a_mz_stride_;
-        std::array<index_t, NumATensor> a_kz_stride_;
+        // Describe whether the last part of a given dimension of A/B/D/E is consecutive
+        // in the memory or not.
+        std::array<bool, NumATensor> as_mz_consecutive_;
+        std::array<bool, NumATensor> as_kz_consecutive_;
+        std::array<bool, NumBTensor> bs_nz_consecutive_;
+        std::array<bool, NumBTensor> bs_kz_consecutive_;
+        std::array<bool, NumDTensor> ds_nz_consecutive_;
+        bool e_nz_consecutive_;
 
-        std::array<index_t, NumBTensor> b_nz_stride_;
-        std::array<index_t, NumBTensor> b_kz_stride_;
-
-        std::array<index_t, NumDTensor> ds_nz_stride_;
-        index_t e_nz_stride_;
+        std::array<index_t, NumATensor> as_max_read_elems_;
+        std::array<index_t, NumBTensor> bs_max_read_elems_;
+        std::array<index_t, NumDTensor> ds_max_read_elems_;
+        index_t e_max_write_elems_;
     };
 
     // Invoker
@@ -643,73 +654,65 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
 
         // check vector load/store
         {
-            bool all_valid = true;
-
+            bool valid_as_access = true;
             static_for<0, NumATensor, 1>{}([&](auto i) {
-                // vector memory access of A: could be on M or AK1 dimension
-                if constexpr(ABlockTransferSrcVectorDim == 1)
+                const bool valid_a_vector_size =
+                    arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0;
+                const bool valid_a_access_dim_m =
+                    ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i];
+                const bool valid_a_access_dim_k =
+                    ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i];
+                const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
+                if(!(valid_a_vector_size && valid_a_access_dim))
                 {
-                    if(!(arg.a_mz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) %
-                                                             ABlockTransferSrcScalarPerVector ==
-                                                         0))
-                    {
-                        all_valid = false;
-                    }
-                }
-                else
-                {
-                    if(!(arg.a_kz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) %
-                                                             ABlockTransferSrcScalarPerVector ==
-                                                         0))
-                    {
-                        all_valid = false;
-                    }
+                    valid_as_access = false;
                 }
             });
-
-            // vector memory access of B: could be on N or BK1 dimension
-            static_for<0, NumBTensor, 1>{}([&](auto i) {
-                if constexpr(BBlockTransferSrcVectorDim == 1)
-                {
-                    if(!(arg.b_nz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) %
-                                                             BBlockTransferSrcScalarPerVector ==
-                                                         0))
-                    {
-                        all_valid = false;
-                    }
-                }
-                else
-                {
-                    if(!(arg.b_kz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) %
-                                                             BBlockTransferSrcScalarPerVector ==
-                                                         0))
-                    {
-                        all_valid = false;
-                    }
-                }
-            });
-
-            // check vector load of Ds
-            static_for<0, NumDTensor, 1>{}([&](auto i) {
-                if(!(arg.ds_nz_stride_[i] == 1 &&
-                     arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
-                             CDEBlockTransferScalarPerVector_NPerBlock ==
-                         0))
-                {
-                    all_valid = false;
-                }
-            });
-
-            // vector memory access of E: always on NPerBlock dimension
-            if(!(arg.e_nz_stride_ == 1 &&
-                 arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
-                         CDEBlockTransferScalarPerVector_NPerBlock ==
-                     0))
+            if(!valid_as_access)
             {
-                all_valid = false;
+                return false;
             }
 
-            if(!all_valid)
+            bool valid_bs_access = true;
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                const bool valid_b_vector_size =
+                    arg.bs_max_read_elems_[i] % BBlockTransferSrcScalarPerVector == 0;
+                const bool valid_b_access_dim_n =
+                    BBlockTransferSrcVectorDim == 1 && arg.bs_nz_consecutive_[i];
+                const bool valid_b_access_dim_k =
+                    BBlockTransferSrcVectorDim == 2 && arg.bs_kz_consecutive_[i];
+                const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
+                if(!(valid_b_vector_size && valid_b_access_dim))
+                {
+                    valid_bs_access = false;
+                }
+            });
+            if(!valid_bs_access)
+            {
+                return false;
+            }
+
+            bool valid_ds_access = true;
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                const bool valid_d_vector_size =
+                    arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+                // Vector read of Ds is always on N dimension.
+                const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
+                if(!(valid_d_vector_size && valid_d_access_dim))
+                {
+                    valid_ds_access = false;
+                }
+            });
+            if(!valid_ds_access)
+            {
+                return false;
+            }
+
+            const bool valid_e_vector_size =
+                arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+            // Vector write of E is always on N dimension.
+            const bool valid_e_access_dim = arg.e_nz_consecutive_;
+            if(!(valid_e_vector_size && valid_e_access_dim))
             {
                 return false;
             }
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
index 71ff2ba17d..290abe221a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
@@ -13,6 +13,7 @@
 #include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -183,7 +184,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             return generate_tuple([&](auto i) { return vec[i]; }, num);
         };
 
-        const auto a_ms_ns_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{});
+        const auto a_ms_ks_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{});
         const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number<NumDimM + NumDimK>{});
 
         // dimension Ids for M0, M1, ...
@@ -194,14 +195,14 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimK, 1>::type{};
 
         // lengths for M0, M1, ...
-        const auto mLengths = get_container_subset(a_ms_ns_lengths, mDimIds);
+        const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds);
 
         // lengths for K0, K1, ...
-        const auto kLengths = get_container_subset(a_ms_ns_lengths, kDimIds);
+        const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds);
 
         // naive tensor A[M0, M1, M2, ..., K0, K1, K2...]
         const auto a_grid_desc_ms_ks =
-            make_naive_tensor_descriptor(a_ms_ns_lengths, a_ms_ks_strides);
+            make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides);
 
         // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...]
         const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor(
@@ -383,7 +384,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                  const void* p_b_grid,
                  std::array<const void*, NumDTensor> p_ds_grid,
                  void* p_e_grid,
-                 const std::vector<index_t>& a_ms_ns_lengths,
+                 const std::vector<index_t>& a_ms_ks_lengths,
                  const std::vector<index_t>& a_ms_ks_strides,
                  const std::vector<index_t>& b_ns_ks_lengths,
                  const std::vector<index_t>& b_ns_ks_strides,
@@ -398,7 +399,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
               p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
               p_ds_grid_{},
               p_e_grid_{static_cast<EDataType*>(p_e_grid)},
-              a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ns_lengths, a_ms_ks_strides)},
+              a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ks_lengths, a_ms_ks_strides)},
               b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)},
               ds_grid_desc_m_n_{},
               e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)},
@@ -411,13 +412,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
               block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op},
-              a_mz_stride_{},
-              a_kz_stride_{},
-              b_nz_stride_{},
-              b_kz_stride_{},
-              ds_nz_stride_{},
-              e_nz_stride_{}
+              cde_element_op_{cde_element_op}
         {
             // populate pointer, batch stride, desc for Ds
             static_for<0, NumDTensor, 1>{}([&](auto i) {
@@ -448,18 +443,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             }
 
             // for sanity check of vector memory access
-            a_mz_stride_ = a_ms_ks_strides[NumDimM - 1];
-            a_kz_stride_ = a_ms_ks_strides[NumDimM + NumDimK - 1];
+            a_mz_consecutive_ = a_ms_ks_strides[NumDimM - 1] == 1;
+            a_kz_consecutive_ = a_ms_ks_strides[NumDimM + NumDimK - 1] == 1;
+            a_max_read_elems_ =
+                CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths, a_ms_ks_strides);
 
-            b_nz_stride_ = b_ns_ks_strides[NumDimN - 1];
-            b_kz_stride_ = b_ns_ks_strides[NumDimN + NumDimK - 1];
+            b_nz_consecutive_ = b_ns_ks_strides[NumDimN - 1] == 1;
+            b_kz_consecutive_ = b_ns_ks_strides[NumDimN + NumDimK - 1] == 1;
+            b_max_read_elems_ =
+                CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths, b_ns_ks_strides);
 
             for(index_t i = 0; i < NumDTensor; ++i)
             {
-                ds_nz_stride_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1];
+                ds_nz_consecutive_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
+                ds_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimM, NumDimN>(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]);
             }
 
-            e_nz_stride_ = e_ms_ns_strides[NumDimM + NumDimN - 1];
+            e_nz_consecutive_ = e_ms_ns_strides[NumDimM + NumDimN - 1] == 1;
+            e_max_write_elems_ =
+                CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_lengths, e_ms_ns_strides);
         }
 
         void Print() const
@@ -499,15 +502,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         BElementwiseOperation b_element_op_;
         CDEElementwiseOperation cde_element_op_;
 
-        // Strides for the last M/N/K dimensions of A/B/Ds/E
-        //   for sanity check of vector load/store
-        index_t a_mz_stride_;
-        index_t a_kz_stride_;
-        index_t b_nz_stride_;
-        index_t b_kz_stride_;
-        std::array<index_t, NumDTensor> ds_nz_stride_;
-        index_t e_mz_stride_;
-        index_t e_nz_stride_;
+        // Describe whether the last part of a given dimension of A/B/D/E is consecutive
+        // in the memory or not.
+        bool a_mz_consecutive_;
+        bool a_kz_consecutive_;
+        bool b_nz_consecutive_;
+        bool b_kz_consecutive_;
+        std::array<bool, NumDTensor> ds_nz_consecutive_;
+        bool e_nz_consecutive_;
+
+        index_t a_max_read_elems_;
+        index_t b_max_read_elems_;
+        std::array<index_t, NumDTensor> ds_max_read_elems_;
+        index_t e_max_write_elems_;
     };
 
     // Invoker
@@ -616,65 +623,47 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                           (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2),
                       "wrong!");
 
-        // vector memory access of A: could be on M or AK1 dimension
-        if constexpr(ABlockTransferSrcVectorDim == 1)
-        {
-            if(!(arg.a_mz_stride_ == 1 &&
-                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
-        else
-        {
-            if(!(arg.a_kz_stride_ == 1 &&
-                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
-
-        // vector memory access of B: could be on N or BK1 dimension
-        if constexpr(BBlockTransferSrcVectorDim == 1)
-        {
-            if(!(arg.b_nz_stride_ == 1 &&
-                 arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
-        else
-        {
-            if(!(arg.b_kz_stride_ == 1 &&
-                 arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
-
-        // vector memory access of Ds: always on NPerBlock dimension
-        bool valid_d_access = true;
-
-        static_for<0, NumDTensor, 1>{}([&](auto i) {
-            if(!(arg.ds_nz_stride_[i] == 1 &&
-                 arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
-                         CDEBlockTransferScalarPerVector_NPerBlock ==
-                     0))
-            {
-                valid_d_access = false;
-            }
-        });
-
-        if(valid_d_access == false)
+        const bool valid_a_vector_size =
+            arg.a_max_read_elems_ % ABlockTransferSrcScalarPerVector == 0;
+        const bool valid_a_access_dim_m = ABlockTransferSrcVectorDim == 1 && arg.a_mz_consecutive_;
+        const bool valid_a_access_dim_k = ABlockTransferSrcVectorDim == 2 && arg.a_kz_consecutive_;
+        const bool valid_a_access_dim   = valid_a_access_dim_m || valid_a_access_dim_k;
+        if(!(valid_a_vector_size && valid_a_access_dim))
         {
             return false;
         }
 
-        // vector memory access of E: always on NPerBlock dimension
-        if(!(arg.e_nz_stride_ == 1 &&
-             arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
-                     CDEBlockTransferScalarPerVector_NPerBlock ==
-                 0))
+        const bool valid_b_vector_size =
+            arg.b_max_read_elems_ % BBlockTransferSrcScalarPerVector == 0;
+        const bool valid_b_access_dim_n = BBlockTransferSrcVectorDim == 1 && arg.b_nz_consecutive_;
+        const bool valid_b_access_dim_k = BBlockTransferSrcVectorDim == 2 && arg.b_kz_consecutive_;
+        const bool valid_b_access_dim   = valid_b_access_dim_n || valid_b_access_dim_k;
+        if(!(valid_b_vector_size && valid_b_access_dim))
+        {
+            return false;
+        }
+
+        bool valid_ds_access = true;
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            const bool valid_d_vector_size =
+                arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+            // Vector read of Ds is always on N dimension.
+            const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
+            if(!(valid_d_vector_size && valid_d_access_dim))
+            {
+                valid_ds_access = false;
+            }
+        });
+        if(!valid_ds_access)
+        {
+            return false;
+        }
+
+        const bool valid_e_vector_size =
+            arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+        // Vector write of E is always on N dimension.
+        const bool valid_e_access_dim = arg.e_nz_consecutive_;
+        if(!(valid_e_vector_size && valid_e_access_dim))
         {
             return false;
         }
@@ -692,7 +681,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                              const void* p_b,
                              std::array<const void*, NumDTensor> p_ds,
                              void* p_e,
-                             const std::vector<index_t>& a_ms_ns_lengths,
+                             const std::vector<index_t>& a_ms_ks_lengths,
                              const std::vector<index_t>& a_ms_ks_strides,
                              const std::vector<index_t>& b_ns_ks_lengths,
                              const std::vector<index_t>& b_ns_ks_strides,
@@ -708,7 +697,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                         p_b,
                         p_ds,
                         p_e,
-                        a_ms_ns_lengths,
+                        a_ms_ks_lengths,
                         a_ms_ks_strides,
                         b_ns_ks_lengths,
                         b_ns_ks_strides,
@@ -729,7 +718,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                         const void* p_b,
                         std::array<const void*, NumDTensor> p_ds,
                         void* p_e,
-                        const std::vector<index_t>& a_ms_ns_lengths,
+                        const std::vector<index_t>& a_ms_ks_lengths,
                         const std::vector<index_t>& a_ms_ks_strides,
                         const std::vector<index_t>& b_ns_ks_lengths,
                         const std::vector<index_t>& b_ns_ks_strides,
@@ -745,7 +734,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                                           p_b,
                                           p_ds,
                                           p_e,
-                                          a_ms_ns_lengths,
+                                          a_ms_ks_lengths,
                                           a_ms_ks_strides,
                                           b_ns_ks_lengths,
                                           b_ns_ks_strides,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp
new file mode 100644
index 0000000000..0e14b40942
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cassert>
+#include <sstream>
+#include <vector>
+
+#include "ck/ck.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/**
+ * Calculates the maximum number of subsequent elements of the fast changing dimension
+ * that are consecutive in memory.
+ *
+ * Example:
+ *   NumDimM = 2, NumDimK = 3
+ *   A shape =   [  2,   3,  4, 5, 6]
+ *   A strides = [360, 120, 30, 6, 1]
+ *                |  M   |  |   K  |
+ *   It follows from strides that K is FCD and all the subsequent elements of K are consecutive
+ *   in memory.
+ *   But if strides were [360, 120, 6, 24, 1], then only 6 subsequent elements of K would be
+ *   consecutive in memory.
+ *
+ * Assumes that the dimensions are split into two groups of `NumDim1` and `NumDim2` dimensions.
+ */
+template <index_t NumDim1, index_t NumDim2>
+auto CalculateMaxRead(const std::vector<index_t>& lengths, const std::vector<index_t>& strides)
+{
+    if(lengths.size() != NumDim1 + NumDim2)
+    {
+        std::ostringstream err;
+        err << "Incorrect number of lengths in " << __FILE__ << ":" << __LINE__
+            << ", in function: " << __func__;
+        throw std::runtime_error(err.str());
+    }
+    if(strides.size() != NumDim1 + NumDim2)
+    {
+        std::ostringstream err;
+        err << "Incorrect number of strides in " << __FILE__ << ":" << __LINE__
+            << ", in function: " << __func__;
+        throw std::runtime_error(err.str());
+    }
+
+    // Determine the beginning and end idx of the group representing the FCD.
+    index_t begin_idx, end_idx;
+    if(strides[NumDim1 - 1] == 1)
+    {
+        begin_idx = 0;
+        end_idx   = NumDim1 - 1;
+    }
+    else if(strides[NumDim1 + NumDim2 - 1] == 1)
+    {
+        begin_idx = NumDim1;
+        end_idx   = NumDim1 + NumDim2 - 1;
+    }
+    else
+    {
+        // The dimension consecutive in memory is not the last dimension of any group, so only
+        // one element can be read/written at once.
+        return 1;
+    }
+
+    index_t consecutive_stride = 1;
+    for(index_t dim_idx = end_idx; dim_idx >= begin_idx; --dim_idx)
+    {
+        if(strides[dim_idx] == consecutive_stride)
+        {
+            consecutive_stride *= lengths[dim_idx];
+        }
+        else
+        {
+            break;
+        }
+    }
+    const index_t max_subsequent_elems = consecutive_stride;
+    return max_subsequent_elems;
+}
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
index b43d34d69a..b67119ad19 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
@@ -61,7 +61,11 @@ using device_contraction_kk_instance = std::tuple<
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,               4, ComputeDataType>,
+        // Small scalar per vector
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               2, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>
     // clang-format on
     >;
 
@@ -96,7 +100,11 @@ using device_contraction_kn_instance = std::tuple<
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   1,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   1,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        // Small scalar per vector
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1,  8>,               2, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>
     // clang-format on
     >;
 
@@ -131,7 +139,11 @@ using device_contraction_mk_instance = std::tuple<
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   4,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   4,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        // Small scalar per vector
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              4,         1,           1,           1,              S<1, 16, 1,  8>,               2, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>
     // clang-format on
     >;
 
@@ -166,7 +178,11 @@ using device_contraction_mn_instance = std::tuple<
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   1,   1,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
         DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   1,   1,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              1,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              1,         0,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
-        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,              S<1, 16, 1, 16>,               4, ComputeDataType>,
+        // Small scalar per vector
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1, 16>,               1, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1, 16, 1,  8>,               2, ComputeDataType>,
+        DeviceContractionMultipleD_Xdl_CShuffle<       2,       2,       2,  ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              4,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              4,         1,           1,           1,              S<1,  8, 1,  8>,               1, ComputeDataType>
     // clang-format on
     >;
 

From c004e0d99048d76c40da81c4dd2a36921cee0293 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Mon, 11 Dec 2023 17:49:27 -0800
Subject: [PATCH 11/18] disabling some fp8 gemm instances to reduce build time
 (#1084)

* disabling some fp8 gemm instances to reduce build time

* disable fp8 gemm instances to reduce build time

* remove the unused variable

* build fp8 gemm default and padded instances separately

* fix include pathsc
---
 ...shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp} | 20 +++-----------
 .../tensor_operation_instance/gpu/gemm.hpp    |  9 +++++--
 .../gpu/gemm/CMakeLists.txt                   |  3 ++-
 ..._fp8_fp8_fp8_mk_kn_mn_default_instance.cpp | 26 +++++++++++++++++++
 ...e_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp | 26 +++++++++++++++++++
 5 files changed, 64 insertions(+), 20 deletions(-)
 rename library/{src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp => include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp} (96%)
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp

diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp
similarity index 96%
rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
rename to library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp
index 82eae9f0a2..005cec94ec 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp
@@ -25,10 +25,6 @@ using S = ck::Sequence<Is...>;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-
-static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
@@ -37,7 +33,7 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
         //#####################|        |        |        |   Type|   Type|   Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |       |       |       |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |       |       |       |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
-        // pipeline v1, 1 wave                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
+        // pipeline v1, 1 wave
         DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    64,  16,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    64,  16,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -75,7 +71,8 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
         DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16, LoopScheduler::Interwave,      PipelineVersion::v1>
 
 #endif
-#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
+#if 0 
+	//CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
         // pipeline v2, 1 wave
         ,
         DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    64,  16,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,              16, LoopScheduler::Default,        PipelineVersion::v2>,
@@ -98,17 +95,6 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
     // clang-format on
     >;
 
-void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<GemmDefault>{});
-
-    add_device_operation_instances(
-        instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<MNKPadding>{});
-}
-
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index bbc70f1a5b..626dd7f00a 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -345,7 +345,11 @@ void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
 
-void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
 
@@ -575,7 +579,8 @@ struct DeviceOperationInstanceFactory<
             if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
             {
-                add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(op_ptrs);
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                               is_same_v<CLayout, Row>)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index d0bcacbe3c..3532c3f4ba 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -101,7 +101,8 @@ list(APPEND GEMM_INSTANCES
     device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp)
 
 list(APPEND GEMM_INSTANCES
-    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp
+    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
     device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp
new file mode 100644
index 0000000000..baa76a74af
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp"
+
+#ifdef CK_ENABLE_FP8
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp
new file mode 100644
index 0000000000..f16809db28
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp"
+
+#ifdef CK_ENABLE_FP8
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<MNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif

From 6891e4d10965513657d531c3c8c2048aaba34b05 Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Wed, 13 Dec 2023 14:27:31 -0600
Subject: [PATCH 12/18] Fix the bugs (#1099)

---
 include/ck/utility/type_convert.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
index 70bc6f278c..11db866152 100644
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -182,7 +182,7 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     // convert to float and use native converion
-    return f8_convert_sr<f8_t>(type_convert<float>(x));
+    return f8_convert_sr<bf8_t>(type_convert<float>(x));
 #else
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
@@ -295,7 +295,7 @@ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, half_t>(half_t x)
 template <>
 inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
 {
-#if defined CK_USE_SR_F8_CONVERSION
+#if CK_USE_SR_F8_CONVERSION
     return f8_convert_sr<f8_t>(x);
 #else
     return f8_convert_rne<f8_t>(x);
@@ -352,10 +352,10 @@ inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
 template <>
 inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
 {
-#if defined CK_USE_SR_F8_CONVERSION
+#if CK_USE_SR_F8_CONVERSION
     return f8_convert_sr<f8_t>(x);
 #else
-    return f8_convert_nre<f8_t>(x);
+    return f8_convert_rne<f8_t>(x);
 #endif
 }
 
@@ -376,7 +376,7 @@ inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
 template <>
 inline __host__ __device__ bf8_t type_convert<bf8_t, float>(float x)
 {
-#if defined CK_USE_SR_F8_CONVERSION
+#if CK_USE_SR_F8_CONVERSION
     return f8_convert_sr<bf8_t>(x);
 #else
     return f8_convert_rne<bf8_t>(x);
@@ -403,7 +403,7 @@ inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
 template <>
 inline __host__ __device__ bf8_t type_convert<bf8_t, half_t>(half_t x)
 {
-#if defined CK_USE_SR_F8_CONVERSION
+#if CK_USE_SR_F8_CONVERSION
     return f8_convert_sr<bf8_t>(x);
 #else
     return f8_convert_rne<bf8_t>(x);

From 3a3b98ef79d967391840a202a8ddf7b3d05ba823 Mon Sep 17 00:00:00 2001
From: Jun Liu <Liu.Jun@amd.com>
Date: Wed, 13 Dec 2023 12:50:15 -0800
Subject: [PATCH 13/18] [Doc][Werror] Fix security alerts and sync with MIOpen
 (#1085)

* fix Werror unused-parameter

* sync doc requirements

* fix blank space format

* fix dependency issue
---
 docs/sphinx/requirements.txt                     | 16 ++++++++--------
 .../gpu/grid/gridwise_tensor_rearrange.hpp       |  2 ++
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 01cb32e714..75863c214e 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -16,7 +16,7 @@ beautifulsoup4==4.11.2
     # via pydata-sphinx-theme
 breathe==4.34.0
     # via rocm-docs-core
-certifi==2022.12.7
+certifi==2023.7.22
     # via requests
 cffi==1.15.1
     # via
@@ -26,7 +26,7 @@ charset-normalizer==3.1.0
     # via requests
 click==8.1.3
     # via sphinx-external-toc
-cryptography==40.0.2
+cryptography==41.0.6
     # via pyjwt
 deprecated==1.2.13
     # via pygithub
@@ -42,7 +42,7 @@ fastjsonschema==2.18.0
     # via rocm-docs-core
 gitdb==4.0.10
     # via gitpython
-gitpython==3.1.35
+gitpython==3.1.37
     # via rocm-docs-core
 idna==3.4
     # via requests
@@ -88,9 +88,9 @@ pydata-sphinx-theme==0.13.3
     # via
     #   rocm-docs-core
     #   sphinx-book-theme
-pygithub==1.58.2
+pygithub==1.58.1
     # via rocm-docs-core
-pygments==2.14.0
+pygments==2.15.0
     # via
     #   accessible-pygments
     #   pydata-sphinx-theme
@@ -109,7 +109,7 @@ pyyaml==6.0
     #   pybtex
     #   rocm-docs-core
     #   sphinx-external-toc
-requests==2.28.2
+requests==2.31.0
     # via
     #   pygithub
     #   sphinx
@@ -141,7 +141,7 @@ sphinx-book-theme==1.0.1
     # via rocm-docs-core
 sphinx-copybutton==0.5.1
     # via rocm-docs-core
-sphinx-design==0.3.0
+sphinx-design==0.4.1
     # via rocm-docs-core
 sphinx-external-toc==0.3.1
     # via rocm-docs-core
@@ -163,7 +163,7 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 typing-extensions==4.5.0
     # via pydata-sphinx-theme
-urllib3==1.26.15
+urllib3==1.26.18
     # via requests
 wrapt==1.15.0
     # via deprecated
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
index f77ffff350..9535ca69a9 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
@@ -50,7 +50,9 @@ __global__ void
     ignore = p_in_global;
     ignore = out_grid_desc;
     ignore = p_out_global;
+    ignore = batch_count;
     ignore = block_2_tile_map;
+    ignore = compute_ptr_offset_of_batch;
 #endif
 }
 

From 281f8369033366669fbabe05ed9622c1370c4a71 Mon Sep 17 00:00:00 2001
From: Lisa <lisajdelaney@gmail.com>
Date: Thu, 14 Dec 2023 15:21:18 -0700
Subject: [PATCH 14/18] fix typo (#1067)

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 7679607e69..4889914691 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,6 @@ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
 ```
 
 You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
-page.
 
 ```note
 If you use CK, cite us as follows:

From efaf31061a00a9c17a888ddbf2e273aafe977d5e Mon Sep 17 00:00:00 2001
From: trixirt <trix@redhat.com>
Date: Thu, 14 Dec 2023 17:26:41 -0800
Subject: [PATCH 15/18] cmake: Add CK_PARALLEL_LINK_JOBS and
 CK_PARALLEL_COMPILE_JOBS options (#1063)

Copied from the llvm-project LLVM_PARALLEL_*_JOBS

Concurrent linking can break the build as well as having too many
compile jobs for the avaiable memory.  These options allow the user
to fine tune the build to fit within their machines memory
constraints.

An example use on linux is
COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'`
if [ ${COMPILE_JOBS}x = x ]; then
  COMPILE_JOBS=1
fi
BUILD_MEM=4
MEM_KB=0
MEM_KB=`cat /proc/meminfo | grep MemTotal | awk '{ print $2 }'`
MEM_MB=`eval "expr ${MEM_KB} / 1024"`
MEM_GB=`eval "expr ${MEM_MB} / 1024"`
COMPILE_JOBS_MEM=`eval "expr 1 + ${MEM_GB} / ${BUILD_MEM}"`
if [ "$COMPILE_JOBS_MEM" -lt "$COMPILE_JOBS" ]; then
  COMPILE_JOBS=$COMPILE_JOBS_MEM
fi
LINK_MEM=32
LINK_JOBS=`eval "expr 1 + ${MEM_GB} / ${LINK_MEM}"`

cmake -G Ninja -DCK_PARALLEL_LINK_JOBS=$LINK_JOBS
               -DCK_PARALLEL_COMPILE_JOBS=$COMPILE_JOBS

Signed-off-by: Tom Rix <trix@redhat.com>
---
 CMakeLists.txt | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e780c15657..4e4b9d8d4b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -146,6 +146,33 @@ if(${hip_VERSION_FLAT} GREATER 500723302)
    add_compile_options(-fno-offload-uniform-block)
 endif()
 
+#
+# Seperate linking jobs from compiling
+# Too many concurrent linking jobs can break the build
+# Copied from LLVM
+set(CK_PARALLEL_LINK_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent link jobs (Ninja only).")
+if(CMAKE_GENERATOR MATCHES "Ninja")
+  if(CK_PARALLEL_LINK_JOBS)
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${CK_PARALLEL_LINK_JOBS})
+    set(CMAKE_JOB_POOL_LINK link_job_pool)
+  endif()
+elseif(CK_PARALLEL_LINK_JOBS)
+  message(WARNING "Job pooling is only available with Ninja generators.")
+endif()
+# Similar for compiling
+set(CK_PARALLEL_COMPILE_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent compile jobs (Ninja only).")
+if(CMAKE_GENERATOR MATCHES "Ninja")
+  if(CK_PARALLEL_COMPILE_JOBS)
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${CK_PARALLEL_COMPILE_JOBS})
+    set(CMAKE_JOB_POOL_COMPILE compile_job_pool)
+  endif()
+elseif(CK_PARALLEL_COMPILE_JOBS)
+  message(WARNING "Job pooling is only available with Ninja generators.")
+endif()
+
+
 option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
 option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
 

From 07092d68f0b13560caf3cbe762a9a799d13cdc0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Fri, 15 Dec 2023 12:45:08 +0100
Subject: [PATCH 16/18] Add tensor structure to wrapper (#1098)

* Add tensor structure to wrapper

* update changelog

* Fix names

* Comment fixes
---
 CHANGELOG.md                                  |   2 +-
 docs/wrapper.rst                              |  39 ++-
 include/ck/wrapper/layout.hpp                 | 178 +++++++---
 include/ck/wrapper/tensor.hpp                 | 314 ++++++++++++++++++
 .../ck/wrapper/{ => utils}/layout_utils.hpp   |  62 ++--
 include/ck/wrapper/utils/tensor_utils.hpp     | 290 ++++++++++++++++
 test/wrapper/CMakeLists.txt                   |   2 +
 test/wrapper/test_layout.cpp                  |  16 +-
 test/wrapper/test_tensor.cpp                  | 205 ++++++++++++
 9 files changed, 1020 insertions(+), 88 deletions(-)
 create mode 100644 include/ck/wrapper/tensor.hpp
 rename include/ck/wrapper/{ => utils}/layout_utils.hpp (86%)
 create mode 100644 include/ck/wrapper/utils/tensor_utils.hpp
 create mode 100644 test/wrapper/test_tensor.cpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3da22fc790..2891b8585b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,7 @@ None
 - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
 - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
 - Support for Batched Gemm DL (#732)
-- Introduce wrapper sublibrary (limited functionality) (#1071)
+- Introduce wrapper sublibrary (limited functionality). (#1071, #1098)
 
 ### Changes
  - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
diff --git a/docs/wrapper.rst b/docs/wrapper.rst
index 64fb6a4031..a2f60b97ae 100644
--- a/docs/wrapper.rst
+++ b/docs/wrapper.rst
@@ -13,7 +13,7 @@ Description
 
 CK provides a lightweight wrapper for more complex operations implemented in 
 the library. It allows indexing of nested layouts using a simple interface 
-(avoiding complex descriptor transformations). 
+(avoiding complex descriptor transformations) and memory access (using Tensor).
 
 Example:
 
@@ -22,24 +22,31 @@ Example:
     const auto shape_4x2x4         = ck::make_tuple(4, ck::make_tuple(2, 4));
     const auto strides_s2x1x8      = ck::make_tuple(2, ck::make_tuple(1, 8));
     const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
+    
+    std::array<ck::index_t, 32> data;
+    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
 
-    std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
-    for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
+    for(ck::index_t w = 0; w < size(tensor); w++) {
+        tensor(w) = w;
+    }
+
+    // slice() == slice(0, -1) (whole dimension)
+    auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice()));
+    std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl;
+    for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++)
     {
-        for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
+        for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++)
         {
-            std::cout << layout(ck::make_tuple(h, w)) << " ";
+            std::cout << tensor_slice(h, w) << " ";
         }
         std::cout << std::endl;
     }
 
 Output::
 
-    dims:4,(2,4) strides:2,(1,8)
-    0 1 8 9 16 17 24 25 
-    2 3 10 11 18 19 26 27 
-    4 5 12 13 20 21 28 29 
-    6 7 14 15 22 23 30 31 
+    dims:2,(2,4) strides:2,(1,8)
+    1 5 9 13 17 21 25 29 
+    2 6 10 14 18 22 26 30 
 
 -------------------------------------
 Layout
@@ -52,3 +59,15 @@ Layout helpers
 -------------------------------------
 
 .. doxygenfile:: layout_utils.hpp
+
+-------------------------------------
+Tensor
+-------------------------------------
+
+.. doxygenstruct:: ck::wrapper::Tensor
+
+-------------------------------------
+Tensor helpers
+-------------------------------------
+
+.. doxygenfile:: tensor_utils.hpp
diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp
index b337d88a1a..f20d985b49 100644
--- a/include/ck/wrapper/layout.hpp
+++ b/include/ck/wrapper/layout.hpp
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "ck/wrapper/layout_utils.hpp"
+#include "ck/wrapper/utils/layout_utils.hpp"
 
 namespace ck {
 namespace wrapper {
@@ -25,6 +25,26 @@ struct Layout
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
 
+    // Generate default idxs tuple (idx with all merged nested shapes)
+    template <typename... Ts>
+    __host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple<Ts...>&)
+    {
+        return generate_tuple(
+            [&](auto) {
+                if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
+                {
+                    // runtime layout
+                    return index_t(0);
+                }
+                else
+                {
+                    // compiletime layout
+                    return I0;
+                }
+            },
+            Number<Tuple<Ts...>::Size()>{});
+    }
+
     // Generate packed (column-major) strides if not passed
     template <typename... Ts>
     __host__ __device__ constexpr static auto
@@ -131,7 +151,7 @@ struct Layout
 
     template <typename... ShapeDims, typename DescriptorToMerge>
     __host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape,
-                                                          DescriptorToMerge& desc)
+                                                          const DescriptorToMerge& desc)
     {
         // Reverse each element in tuple
         const auto merge_elems = TupleReverse(UnrollNestedTuple(shape));
@@ -144,7 +164,7 @@ struct Layout
             desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
     }
 
-    // Merge nested shape dims. Merge nested shape dims when idx is also nested.
+    // Merge nested shape dims when corresponding index is also nested.
     // Input desc shape: 2,  2,  2, 2,  2,  2
     // Example idx:      1,      1, 1,      1
     // Example shape:    2, (2, 2), 2, (2, 2)
@@ -187,14 +207,38 @@ struct Layout
         return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims);
     }
 
+    template <typename LayoutShape, typename LayoutStrides>
+    __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape,
+                                                          const LayoutStrides& strides)
+    {
+        const auto unrolled_shape   = UnrollNestedTuple(shape);
+        const auto unrolled_strides = UnrollNestedTuple(strides);
+        static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
+                      "Size of strides and shape are not consistent.");
+        return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
+    }
+
+    // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`.
+    using DeducedStrides =
+        std::conditional_t<is_same_v<Strides, Tuple<>>,
+                           remove_cvref_t<decltype(GenerateColumnMajorPackedStrides(Shape{}))>,
+                           Strides>;
+    using FlattenDescriptorType =
+        remove_cvref_t<decltype(MakeFlattenDescriptor(Shape{}, DeducedStrides{}))>;
+    using Descriptor1dType =
+        remove_cvref_t<decltype(MakeMerge1d(Shape{}, FlattenDescriptorType{}))>;
+    using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
+
     template <typename... ShapeDims, typename... IdxDims>
-    __host__ __device__ constexpr auto TransformDesc(const Tuple<ShapeDims...>& shape,
-                                                     const Tuple<IdxDims...>& idx) const
+    __host__ __device__ constexpr static auto
+    TransformDesc(const Tuple<ShapeDims...>& shape,
+                  const Tuple<IdxDims...>& idx,
+                  const FlattenDescriptorType& naive_descriptor)
     {
         if constexpr(Tuple<IdxDims...>::Size() == I1)
         {
             // 1d idx path
-            return MakeMerge1d(shape, descriptor_);
+            return MakeMerge1d(shape, naive_descriptor);
         }
         else
         {
@@ -207,56 +251,53 @@ struct Layout
             // Unroll while IdxDims is nested
             const auto aligned_shape = AlignShapeToIdx(shape, idx);
             // Transform correct form of shape
-            return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), descriptor_);
+            return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor);
         }
     }
 
-    template <typename LayoutShape, typename LayoutStrides>
-    __host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape,
-                                                        const LayoutStrides& strides)
-    {
-        const auto unrolled_shape   = UnrollNestedTuple(shape);
-        const auto unrolled_strides = UnrollNestedTuple(strides);
-        static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
-                      "Size of strides and shape are not consistent.");
-        return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
-    }
+    using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc(
+        Shape{}, DefaultIdxsTupleType{}, FlattenDescriptorType{}))>;
 
     public:
-    // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`.
-    using DeducedStrides =
-        std::conditional_t<is_same_v<Strides, Tuple<>>,
-                           remove_cvref_t<decltype(GenerateColumnMajorPackedStrides(Shape{}))>,
-                           Strides>;
-    using NaiveDescriptorType =
-        remove_cvref_t<decltype(MakeNaiveDescriptor(Shape{}, DeducedStrides{}))>;
+    __host__ __device__ constexpr auto GetElementSpaceSize() const
+    {
+        return flatten_descriptor_.GetElementSpaceSize();
+    }
 
+    __host__ __device__ Layout() = delete;
     /**
      * \brief Layout constructor.
      *
      * \param shape Shape for layout.
      * \param strides Strides for layout (optional if tensor is packed).
-     * \return Layout object.
      */
-    __host__ __device__ Layout() = delete;
-    __host__ __device__ Layout(const Shape& shape, const Strides& strides) : descriptor_{}
+    __host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides)
+        : flatten_descriptor_{}, shape_(shape), strides_(strides)
     {
         // Construct if runtime mode
-        if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
+        if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
         {
-            shape_      = shape;
-            strides_    = strides;
-            descriptor_ = MakeNaiveDescriptor(shape_, strides_);
+            flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
+            descriptor_1d_      = MakeMerge1d(shape_, flatten_descriptor_);
+            merged_nests_descriptor_ =
+                TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
         }
     }
 
-    __host__ __device__ Layout(const Shape& shape) : descriptor_{}
+    /**
+     * \brief Layout constructor (with default packed column-major strides).
+     *
+     * \param shape Shape for layout.
+     */
+    __host__ __device__ constexpr Layout(const Shape& shape)
+        : flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_))
     {
-        if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
+        if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
         {
-            shape_      = shape;
-            strides_    = GenerateColumnMajorPackedStrides(shape_);
-            descriptor_ = MakeNaiveDescriptor(shape_, strides_);
+            flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
+            descriptor_1d_      = MakeMerge1d(shape_, flatten_descriptor_);
+            merged_nests_descriptor_ =
+                TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
         }
     }
 
@@ -269,7 +310,9 @@ struct Layout
     template <typename Idxs>
     __host__ __device__ constexpr index_t operator()() const
     {
-        using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}));
+        static_assert(FlattenDescriptorType::IsKnownAtCompileTime(),
+                      "Compiletime operator used on runtime layout.");
+        using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{}));
         using UnrolledIdx     = decltype(UnrollNestedTuple(Idxs{}));
         return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
     }
@@ -283,9 +326,22 @@ struct Layout
     template <typename... Ts>
     __host__ __device__ index_t operator()(const Tuple<Ts...>& Idx) const
     {
-        // Static to construct transformed_desc only once
-        static const auto transformed_desc = TransformDesc(shape_, Idx);
-        return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
+        if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == 1)
+        {
+            // if 1d access
+            return descriptor_1d_.CalculateOffset(Idx);
+        }
+        else if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == Shape::Size())
+        {
+            // if Shape::Size() access (merged nested shapes)
+            return merged_nests_descriptor_.CalculateOffset(UnrollNestedTuple(Idx));
+        }
+        else
+        {
+            // Custom index, need to transform descriptor
+            const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_);
+            return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
+        }
     }
 
     /**
@@ -327,19 +383,51 @@ struct Layout
      *
      * \return Shape.
      */
-    __host__ __device__ constexpr Shape GetShape() const { return shape_; }
+    __host__ __device__ constexpr const Shape& GetShape() const { return shape_; }
 
     /**
      * \brief Strides getter.
      *
      * \return Strides.
      */
-    __host__ __device__ constexpr DeducedStrides GetStrides() const { return strides_; }
+    __host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; }
+
+    /**
+     * \brief Get default lengths (tuple filled with Shape length elements).
+     *
+     * \return Default lengths.
+     */
+    __host__ __device__ constexpr auto GetDefaultLengthsTuple() const
+    {
+        return generate_tuple([&](auto i) { return GetLength<i>(); }, Number<Shape::Size()>{});
+    }
+
+    /**
+     * \brief Get default start idx (tuple filled with 0s of the same size as Shape).
+     *
+     * \return Default start idx.
+     */
+    __host__ __device__ constexpr auto GetDefaultStartIdxs() const
+    {
+        return GenerateDefaultIdxsTuple(shape_);
+    }
+
+    /**
+     * \brief Get default descriptor (with the same size as Shape)
+     *
+     * \return Default descriptor.
+     */
+    __host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor()
+    {
+        return merged_nests_descriptor_;
+    }
 
     private:
-    NaiveDescriptorType descriptor_;
-    Shape shape_;
-    DeducedStrides strides_;
+    FlattenDescriptorType flatten_descriptor_;
+    Descriptor1dType descriptor_1d_;
+    MergedNestsDescriptorType merged_nests_descriptor_;
+    const Shape shape_;
+    const DeducedStrides strides_;
 };
 
 } // namespace wrapper
diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp
new file mode 100644
index 0000000000..4ec6498fbc
--- /dev/null
+++ b/include/ck/wrapper/tensor.hpp
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "utils/tensor_utils.hpp"
+#include "utils/layout_utils.hpp"
+
+namespace ck {
+namespace wrapper {
+
+/**
+ * \brief Tensor wrapper that performs static and dynamic buffer logic.
+ *
+ * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
+ * \tparam ElementType Element data type.
+ * \tparam Shape Tensor shape (layout component).
+ * \tparam Strides Tensor strides (layout component).
+ * \tparam NumVectors Number of vectors (only for VGPR, SGPR).
+ * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
+ */
+template <MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,     // param for Register memory
+          index_t ScalarPerVector // param for Register memory
+          >
+struct Tensor
+{
+    private:
+    // Check if Tuple contains Slice object
+    template <typename T>
+    constexpr static bool IsSlicing(T&&)
+    {
+        return is_detected<is_slice, T>::value;
+    }
+    template <typename... Ts>
+    constexpr static bool IsSlicing(Tuple<Ts...>&&)
+    {
+        return (IsSlicing(Ts{}) || ...);
+    }
+
+    // Calculate first index of new tensor after slice
+    // It is needed to calculate offset for new tensor
+    template <typename... Ts>
+    constexpr auto GetStartIdxForSlicedTensor(const Tuple<Ts...>& idx) const
+    {
+        const auto start_idx_for_sliced_tensor = generate_tuple(
+            [&](auto i) {
+                constexpr auto num_i = Number<i>{};
+                if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
+                {
+                    // if tuple then recurrence
+                    return GetStartIdxForSlicedTensor(idx.At(num_i));
+                }
+                else if constexpr(is_detected<is_slice,
+                                              tuple_element_t<i.value, Tuple<Ts...>>>::value)
+                {
+                    // if slice, return the beginning of the interval
+                    return idx.At(num_i).from_;
+                }
+                else
+                {
+                    // if one dim selected
+                    return idx.At(num_i);
+                }
+            },
+            Number<Tuple<Ts...>::Size()>{});
+
+        return start_idx_for_sliced_tensor;
+    }
+
+    // Calculate new tensor shape after slice
+    template <typename... Ts, typename ShapeTmpType>
+    constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx,
+                                            const ShapeTmpType& shape) const
+    {
+        // Pack each value in tuple to remove empty tuples after generation
+        auto new_shape = generate_tuple(
+            [&](auto i) {
+                constexpr auto num_i = Number<i>{};
+                if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
+                {
+                    if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
+                    {
+                        // if tuple does not have any slice then we can remove dimension
+                        return Tuple<>{};
+                    }
+                    else
+                    {
+                        // if tuple then recurrence
+                        return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i)));
+                    }
+                }
+                else if constexpr(is_detected<is_slice,
+                                              tuple_element_t<i.value, Tuple<Ts...>>>::value)
+                {
+                    // calculate new dimension
+                    const auto& dim = size(shape.At(num_i));
+                    const auto val  = idx.At(num_i).range(dim);
+                    return make_tuple(val);
+                }
+                else
+                {
+                    // remove dimension for just value
+                    return Tuple<>{};
+                }
+            },
+            Number<Tuple<Ts...>::Size()>{});
+        // Remove empty tuples (deleted elements) and return
+        return UnrollNestedTuple<0, 1>(new_shape);
+    }
+
+    template <typename... Ts, typename StridesTmpType>
+    constexpr auto GetStridesFromSlicedTensor(const Tuple<Ts...>& idx,
+                                              const StridesTmpType& strides) const
+    {
+        // Pack each value in tuple to remove empty tuples after generation
+        auto new_strides = generate_tuple(
+            [&](auto i) {
+                constexpr auto num_i = Number<i>{};
+                if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
+                {
+                    if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
+                    {
+                        // if tuple does not have any slice then we can remove dimension
+                        return Tuple<>{};
+                    }
+                    else
+                    {
+                        // if tuple then recurrence
+                        return make_tuple(
+                            GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i)));
+                    }
+                }
+                else if constexpr(is_detected<is_slice,
+                                              tuple_element_t<i.value, Tuple<Ts...>>>::value)
+                {
+                    // Stride will be the same
+                    return make_tuple(strides.At(num_i));
+                }
+                else
+                {
+                    // remove dimension for just value
+                    return Tuple<>{};
+                }
+            },
+            Number<Tuple<Ts...>::Size()>{});
+        // Remove empty tuples (deleted elements) and return
+        return UnrollNestedTuple<0, 1>(new_strides);
+    }
+
+    public:
+    using ElementSpaceSize  = decltype(Layout<Shape, Strides>{
+        Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer
+    using TensorElementType = ElementType;           // DataType
+
+    static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
+    static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr ||
+                                              BufferAddressSpace == MemoryTypeEnum ::Vgpr);
+
+    __host__ __device__ Tensor() = delete;
+    __host__ __device__ Tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
+        : layout_(layout),
+          buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize()))
+    {
+    }
+
+    __host__ __device__ Tensor(const Layout<Shape, Strides>& layout) : layout_(layout)
+    {
+        static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
+    }
+
+    __host__ __device__ constexpr const Layout<Shape, Strides>& GetLayout() const
+    {
+        return layout_;
+    }
+
+    // Getter for new sliced tensor
+    template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
+    __host__ __device__ auto operator[](const Tuple<Ts...>& idx) const
+    {
+        static_assert(IsDynamicBuffer, "Register slice is not supported");
+        // Calculate offset based on first idx for new tensor
+        const index_t offset = layout_(GetStartIdxForSlicedTensor(idx));
+
+        auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape());
+        if constexpr(is_same_v<Strides, Tuple<>>)
+        {
+            auto new_layout = make_layout(new_shape);
+            return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
+        }
+        else
+        {
+            auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides());
+            auto new_layout  = make_layout(new_shape, new_strides);
+            return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
+        }
+    }
+
+    template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
+    __host__ __device__ auto operator()(const Tuple<Ts...>& idx) const
+    {
+        return this->operator[](idx);
+    }
+
+    template <typename... Idxs, enable_if_t<IsSlicing(Tuple<Idxs...>{}), bool> = false>
+    __host__ __device__ auto operator()(Idxs... idxs) const
+    {
+        return this->operator[](make_tuple(idxs...));
+    }
+
+    // Getter for the const value
+    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
+    __host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const
+    {
+        if constexpr(IsDynamicBuffer)
+        {
+            const index_t offset = layout_(idx);
+            return buffer_[offset];
+        }
+        else
+        {
+            if constexpr(is_same_v<Strides, Tuple<>>)
+            {
+                constexpr index_t offset =
+                    Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
+                return buffer_[Number<offset>{}];
+            }
+            else
+            {
+                constexpr index_t offset =
+                    Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
+                return buffer_[Number<offset>{}];
+            }
+        }
+    }
+
+    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
+    __host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const
+    {
+        return this->operator[](idx);
+    }
+
+    template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
+    __host__ __device__ const ElementType& operator()(Idxs... idxs) const
+    {
+        return this->operator[](make_tuple(idxs...));
+    }
+
+    // Getter for the value reference
+    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
+    __host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx)
+    {
+        if constexpr(IsDynamicBuffer)
+        {
+            const index_t offset = layout_(idx);
+            return buffer_(offset);
+        }
+        else
+        {
+            if constexpr(is_same_v<Strides, Tuple<>>)
+            {
+                constexpr index_t offset =
+                    Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
+                return buffer_(Number<offset>{});
+            }
+            else
+            {
+                constexpr index_t offset =
+                    Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
+                return buffer_(Number<offset>{});
+            }
+        }
+    }
+
+    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
+    __host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx)
+    {
+        return this->operator[](idx);
+    }
+
+    template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
+    __host__ __device__ ElementType& operator()(Idxs... idxs)
+    {
+        return this->operator[](make_tuple(idxs...));
+    }
+
+    __host__ __device__ constexpr auto GetDefaultDescriptor()
+    {
+        return layout_.GetDefaultDescriptor();
+    }
+
+    private:
+    using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
+                                            ElementType,
+                                            ElementSpaceSize,
+                                            true /*InvalidElementUseNumericalZeroValue*/>;
+    using StaticBufferType =
+        StaticBufferTupleOfVector<BufferAddressSpace,
+                                  ElementType,
+                                  NumVectors,
+                                  ScalarPerVector,
+                                  true /*InvalidElementUseNumericalZeroValue*/>;
+    // If register use static buffer, else use dynamic buffer
+    using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;
+
+    const Layout<Shape, Strides> layout_;
+    Buffer buffer_;
+};
+
+} // namespace wrapper
+} // namespace ck
diff --git a/include/ck/wrapper/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp
similarity index 86%
rename from include/ck/wrapper/layout_utils.hpp
rename to include/ck/wrapper/utils/layout_utils.hpp
index fac8f33854..5df9dd7dea 100644
--- a/include/ck/wrapper/layout_utils.hpp
+++ b/include/ck/wrapper/utils/layout_utils.hpp
@@ -22,7 +22,7 @@ namespace wrapper {
 // Disable from doxygen docs generation
 /// @cond
 // forward declaration
-template <typename Shape, typename Strides = Tuple<>>
+template <typename Shape, typename Strides>
 struct Layout;
 
 template <typename T>
@@ -52,13 +52,23 @@ __host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& sh
  * \return Constructed layout.
  */
 template <typename Shape>
-__host__ __device__ constexpr Layout<Shape> make_layout(const Shape& shape)
+__host__ __device__ constexpr Layout<Shape, Tuple<>> make_layout(const Shape& shape)
 {
-    return Layout<Shape>(shape);
+    return Layout<Shape, Tuple<>>(shape);
 }
 
 // Layout helpers
 // get
+// Get dim (could be returned from get with empty Idxs)
+/**
+ * \private
+ */
+template <typename T>
+__host__ __device__ T constexpr get(const T& dim)
+{
+    return dim;
+}
+
 /**
  * \brief Get element from tuple (Shape/Strides/Idxs).
  *
@@ -82,7 +92,8 @@ __host__ __device__ constexpr auto get(const Tuple<Dims...>& tuple)
 template <index_t idx, typename Shape, typename Strides>
 __host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
 {
-    const auto new_shape = get<idx>(layout.GetShape());
+    const auto& shape     = layout.GetShape();
+    const auto& new_shape = get<idx>(shape);
     static_assert(is_detected<is_tuple, decltype(new_shape)>::value,
                   "Shape of sub layout must be tuple");
     if constexpr(is_same_v<Strides, Tuple<>>)
@@ -92,7 +103,8 @@ __host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
     }
     else
     {
-        const auto new_strides = get<idx>(layout.GetStrides());
+        const auto& strides     = layout.GetStrides();
+        const auto& new_strides = get<idx>(strides);
         static_assert(is_detected<is_tuple, decltype(new_strides)>::value,
                       "Strides of sub layout must be tuple");
         return make_layout(new_shape, new_strides);
@@ -113,11 +125,21 @@ __host__ __device__ constexpr auto get(const T& elem)
 }
 
 // size
+// Get dim size (could be returned from get function)
+/**
+ * \private
+ */
+template <typename T>
+__host__ __device__ T constexpr size(const T& dim)
+{
+    return dim;
+}
+
 /**
  * \brief Length get (product if tuple).
  *
  * \tparam idx Index to lookup.
- * \param layout Layout to get Shape.
+ * \param layout Layout to get Shape of.
  * \return Requsted length.
  */
 template <index_t idx, typename Shape, typename Strides>
@@ -140,16 +162,6 @@ __host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
                                                  unrolled_shape);
 }
 
-// Get dim size (could be returned from get function)
-/**
- * \private
- */
-template <typename T>
-__host__ __device__ T constexpr size(const T& dim)
-{
-    return dim;
-}
-
 /**
  * \brief Layout size (product of dims).
  *
@@ -178,14 +190,15 @@ __host__ __device__ constexpr index_t size(const Tuple<Ts...>& tuple)
 /**
  * \brief Hierarchical size.
  *
- * \tparam Idxs Indexes to lookup.
+ * \tparam Idx First index to lookup (to avoid empty Idxs).
+ * \tparam Idxs Next indexes to lookup.
  * \param elem Element to lookup.
  * \return Requsted element.
  */
-template <index_t... Idxs, typename T>
+template <index_t Idx, index_t... Idxs, typename T>
 __host__ __device__ constexpr auto size(const T& elem)
 {
-    return size(get<Idxs...>(elem));
+    return size(get<Idx, Idxs...>(elem));
 }
 
 // rank
@@ -251,7 +264,8 @@ __host__ __device__ constexpr auto rank(const T& elem)
 template <typename Shape, typename Strides>
 __host__ __device__ constexpr auto depth(const Layout<Shape, Strides>& layout)
 {
-    return TupleDepth(layout.GetShape());
+    const auto& shape = layout.GetShape();
+    return TupleDepth(shape);
 }
 
 /**
@@ -296,11 +310,11 @@ __host__ __device__ constexpr auto depth(const T& elem)
 /**
  * \brief Get Layout strides.
  *
- * \param layout Layout to get strides.
+ * \param layout Layout to get strides from.
  * \return Requsted strides.
  */
 template <typename Shape, typename Strides>
-__host__ __device__ constexpr auto stride(const Layout<Shape, Strides>& layout)
+__host__ __device__ constexpr const auto& stride(const Layout<Shape, Strides>& layout)
 {
     return layout.GetStrides();
 }
@@ -308,11 +322,11 @@ __host__ __device__ constexpr auto stride(const Layout<Shape, Strides>& layout)
 /**
  * \brief Get Layout shape.
  *
- * \param layout Layout to get shape.
+ * \param layout Layout to get shape from.
  * \return Requsted shape.
  */
 template <typename Shape, typename Strides>
-__host__ __device__ constexpr auto shape(const Layout<Shape, Strides>& layout)
+__host__ __device__ constexpr const auto& shape(const Layout<Shape, Strides>& layout)
 {
     return layout.GetShape();
 }
diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp
new file mode 100644
index 0000000000..5f0dc3e500
--- /dev/null
+++ b/include/ck/wrapper/utils/tensor_utils.hpp
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+
+#include "ck/utility/number.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/utility/tuple_helper.hpp"
+#include "ck/utility/dynamic_buffer.hpp"
+#include "ck/utility/amd_address_space.hpp"
+
+namespace ck {
+namespace wrapper {
+
+/**
+ * \brief Memory type, allowed members:
+ * - Generic,
+ * - Global,
+ * - LDS,
+ * - SGPR,
+ * - VGPR,
+ */
+using MemoryTypeEnum = AddressSpaceEnum;
+
+// Disable from doxygen docs generation
+/// @cond
+// forward declarations
+template <typename Shape, typename Strides>
+struct Layout;
+template <MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,     // params for Register memory
+          index_t ScalarPerVector // param for Register memory
+          >
+
+struct Tensor;
+
+template <typename FromType, typename ToType>
+struct Slice
+{
+    __host__ __device__ constexpr Slice() : from_(), to_() {}
+    __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}
+
+    template <typename T>
+    __host__ __device__ constexpr auto range(const T& dim) const
+    {
+        if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
+                     is_same_v<T, index_t>)
+        {
+            assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range");
+            if(to_ < 0)
+            {
+                return dim - from_ + to_ + 1;
+            }
+            else
+            {
+                // workaround if one end of the interval is index_t and the second one is Number
+                return static_cast<index_t>(to_) - static_cast<index_t>(from_);
+            }
+        }
+        else
+        {
+            static_assert(dim >= to_ && from_ >= Number<0>{} && (to_ < 0 || to_ > from_),
+                          "Invalid range");
+            if constexpr(to_ < 0)
+            {
+                return dim - from_ + to_ + Number<1>{};
+            }
+            else
+            {
+                return to_ - from_;
+            }
+        }
+    }
+
+    __host__ __device__ static constexpr bool IsSlice() { return true; }
+
+    const FromType from_;
+    const ToType to_;
+};
+
+template <typename T>
+using is_slice = decltype(std::declval<T&>().IsSlice());
+
+template <typename T>
+using is_tuple = decltype(std::declval<T&>().IsTuple());
+/// @endcond
+
+/**
+ * \brief Make tensor function.
+ *
+ * \tparam MemoryType Type of memory.
+ * \param pointer Pointer to the memory.
+ * \param layout Tensor layout.
+ * \return Constructed tensor.
+ */
+template <MemoryTypeEnum MemoryType, typename ElementType, typename Shape, typename Strides>
+constexpr auto make_tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
+{
+    return Tensor<MemoryType, ElementType, Shape, Strides, 0 /*NumVectors*/, 0 /*ScalarPerVector*/>(
+        pointer, layout);
+}
+
+/**
+ * \brief Make SGPR or VGPR tensor function.
+ *
+ * \tparam MemoryType Type of memory.
+ * \tparam NumVectors Number of vectors.
+ * \tparam ScalarPerVector Scalars per vector.
+ * \tparam ElementType Memory data type.
+ * \param layout Tensor layout.
+ * \return Constructed tensor.
+ */
+template <MemoryTypeEnum MemoryType,
+          index_t NumVectors,
+          index_t ScalarPerVector,
+          typename ElementType,
+          typename Shape,
+          typename Strides>
+constexpr auto make_register_tensor(const Layout<Shape, Strides>& layout)
+{
+    static_assert(!IsNestedTuple(Shape{}), "Register tensor with nested layout is not supported");
+    return Tensor<MemoryType, ElementType, Shape, Strides, NumVectors, ScalarPerVector>(layout);
+}
+
+/**
+ * \brief Get Tensor Layout.
+ *
+ * \param tensor Tensor to get layout of.
+ * \return Requsted layout.
+ */
+template <MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,
+          index_t ScalarPerVector>
+__host__ __device__ constexpr const auto&
+layout(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
+           tensor)
+{
+    return tensor.GetLayout();
+}
+
+/**
+ * \brief Product of tensor shape dims.
+ *
+ * \tparam Idxs Indexes to access specific shape dim (optional).
+ * \param tensor Tensor to get Shape of.
+ * \return Requsted size.
+ */
+template <index_t... Idxs,
+          MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,
+          index_t ScalarPerVector>
+__host__ __device__ constexpr index_t
+size(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
+         tensor)
+{
+    return size<Idxs...>(tensor.GetLayout());
+}
+
+/**
+ * \brief Rank of Shape tuple.
+ *
+ * \tparam Idxs Indexes to access specific shape dim (optional).
+ * \param tensor Tensor to get rank of.
+ * \return Requsted rank.
+ */
+template <index_t... Idxs,
+          MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,
+          index_t ScalarPerVector>
+__host__ __device__ constexpr index_t
+rank(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
+         tensor)
+{
+    return rank<Idxs...>(tensor.GetLayout());
+}
+
+/**
+ * \brief Depth of Shape tuple.
+ *
+ * \tparam Idxs Indexes to access specific shape dim (optional).
+ * \param tensor Tensor to get depth of.
+ * \return Requsted depth.
+ */
+template <index_t... Idxs,
+          MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,
+          index_t ScalarPerVector>
+__host__ __device__ constexpr index_t
+depth(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
+          tensor)
+{
+    return depth<Idxs...>(tensor.GetLayout());
+}
+
+/**
+ * \brief Get Tensor strides.
+ *
+ * \param tensor Tensor to get strides from.
+ * \return Requsted strides.
+ */
+template <MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,
+          index_t ScalarPerVector>
+__host__ __device__ constexpr const auto&
+stride(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
+           tensor)
+{
+    return stride(tensor.GetLayout());
+}
+
+/**
+ * \brief Get Tensor shape.
+ *
+ * \param tensor Tensor to get shape from.
+ * \return Requsted shape.
+ */
+template <MemoryTypeEnum BufferAddressSpace,
+          typename ElementType,
+          typename Shape,
+          typename Strides,
+          index_t NumVectors,
+          index_t ScalarPerVector>
+__host__ __device__ constexpr const auto&
+shape(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
+          tensor)
+{
+    return shape(tensor.GetLayout());
+}
+
+/**
+ * \brief Get dim slice.
+ *
+ * \param from Beginning of the interval.
+ * \param to End of the interval. (could be also negative to index from the end)
+ * \return Requested slice. Could be used to create sliced tensor from other tensor.
+ */
+template <typename FromType, typename ToType>
+constexpr auto slice(const FromType from, const ToType to)
+{
+    return Slice<FromType, ToType>(from, to);
+}
+
+/**
+ * \brief Get dim slice. (Assumed that from is equal to 1)
+ *
+ * \param to End of the interval. (could be also negative to index from the end)
+ * \return Requested slice. Could be used to create sliced tensor from other tensor.
+ */
+template <typename ToType>
+constexpr auto slice(const ToType to)
+{
+    if constexpr(is_same_v<ToType, index_t>)
+    {
+        return Slice<index_t, ToType>(0, to);
+    }
+    else
+    {
+        return Slice<Number<0>, ToType>(Number<0>{}, to);
+    }
+}
+
+/**
+ * \brief Get whole dim slice (from = 0, to = -1).
+ *
+ * \return Requested slice. Could be used to create sliced tensor from other tensor.
+ */
+constexpr auto slice() { return Slice<Number<0>, Number<-1>>(Number<0>{}, Number<-1>{}); }
+
+} // namespace wrapper
+} // namespace ck
diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt
index e25ef176dd..6b25c08a8a 100644
--- a/test/wrapper/CMakeLists.txt
+++ b/test/wrapper/CMakeLists.txt
@@ -1,2 +1,4 @@
 add_gtest_executable(test_layout test_layout.cpp)
 target_link_libraries(test_layout PRIVATE utility)
+add_gtest_executable(test_tensor test_tensor.cpp)
+target_link_libraries(test_tensor PRIVATE utility)
diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp
index 7d09696fbb..14a8b96462 100644
--- a/test/wrapper/test_layout.cpp
+++ b/test/wrapper/test_layout.cpp
@@ -433,17 +433,17 @@ TEST(TestLayoutHelpers, ShapeAndStrides)
         ck::wrapper::make_layout(shape_compiletime, strides_compiletime);
 
     constexpr bool check_compiletime_shape =
-        std::is_same_v<std::remove_const<decltype(shape_compiletime)>::type,
-                       decltype(shape(layout_compiletime))>;
+        std::is_same_v<decltype(shape_compiletime),
+                       std::remove_reference_t<decltype(shape(layout_compiletime))>>;
     constexpr bool check_compiletime_strides =
-        std::is_same_v<std::remove_const<decltype(strides_compiletime)>::type,
-                       decltype(stride(layout_compiletime))>;
+        std::is_same_v<decltype(strides_compiletime),
+                       std::remove_reference_t<decltype(stride(layout_compiletime))>>;
     constexpr bool check_runtime_shape =
-        std::is_same_v<std::remove_const<decltype(shape_runtime)>::type,
-                       decltype(shape(layout_runtime))>;
+        std::is_same_v<decltype(shape_runtime),
+                       std::remove_reference_t<decltype(shape(layout_runtime))>>;
     constexpr bool check_runtime_strides =
-        std::is_same_v<std::remove_const<decltype(strides_runtime)>::type,
-                       decltype(stride(layout_runtime))>;
+        std::is_same_v<decltype(strides_runtime),
+                       std::remove_reference_t<decltype(stride(layout_runtime))>>;
     EXPECT_TRUE(check_compiletime_shape);
     EXPECT_TRUE(check_compiletime_strides);
     EXPECT_TRUE(check_runtime_shape);
diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp
new file mode 100644
index 0000000000..92f8e2e1bd
--- /dev/null
+++ b/test/wrapper/test_tensor.cpp
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/library/utility/device_memory.hpp"
+
+#include "ck/host_utility/kernel_launch.hpp"
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/wrapper/layout.hpp"
+#include "ck/wrapper/tensor.hpp"
+
+// Compare data in tensor with offset from layout.
+// Data and offset should match if physical memory has been initialized with
+// sequentially increasing values from 0.
+template <typename TensorType>
+__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
+{
+    const auto& layout = ck::wrapper::layout(tensor);
+    for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
+    {
+        for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
+        {
+            for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
+            {
+                const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
+                if(tensor(idx) != layout(idx))
+                {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+template <typename TensorType>
+__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
+{
+    const auto& layout = ck::wrapper::layout(tensor);
+    for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
+    {
+        if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+template <ck::index_t nelems, typename TensorType>
+__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
+{
+    const auto& layout = ck::wrapper::layout(tensor);
+    bool success       = true;
+    ck::static_for<0, nelems, 1>{}([&](auto w) {
+        if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
+        {
+            success = false;
+        }
+    });
+    return success;
+}
+
+template <typename TensorType>
+__host__ __device__ void InitTensor(TensorType& tensor)
+{
+    for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
+    {
+        tensor(i) = i;
+    }
+}
+
+template <ck::index_t nelems, typename TensorType>
+__host__ __device__ void StaticInitTensor(TensorType& tensor)
+{
+
+    ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
+}
+
+// Tests
+TEST(TestTensor, ReadWriteHostMemory)
+{
+    constexpr ck::index_t nelems = 8;
+
+    std::array<ck::index_t, nelems> data;
+    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
+    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
+    InitTensor(tensor);
+
+    EXPECT_TRUE(TestTensorCheck1d(tensor));
+    EXPECT_TRUE(TestTensorCheck3d(tensor));
+}
+
+__global__ void TestTensorReadWriteDevice(void* data, void* success)
+{
+    constexpr ck::index_t nelems            = 8;
+    constexpr ck::index_t scalar_per_vector = 1;
+    __shared__ ck::index_t p_shared[nelems];
+
+    ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
+    bool* casted_success_ptr     = static_cast<bool*>(success);
+
+    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
+    constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}));
+
+    auto tensor_global =
+        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
+    auto tensor_lds  = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
+    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
+                                                         nelems,
+                                                         scalar_per_vector,
+                                                         ck::index_t>(register_layout);
+    auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
+                                                         nelems,
+                                                         scalar_per_vector,
+                                                         ck::index_t>(register_layout);
+
+    InitTensor(tensor_global);
+    InitTensor(tensor_lds);
+    StaticInitTensor<nelems>(tensor_vgpr);
+    StaticInitTensor<nelems>(tensor_sgpr);
+
+    *casted_success_ptr &= TestTensorCheck1d(tensor_global);
+    *casted_success_ptr &= TestTensorCheck3d(tensor_global);
+
+    *casted_success_ptr &= TestTensorCheck1d(tensor_lds);
+    *casted_success_ptr &= TestTensorCheck3d(tensor_lds);
+
+    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
+
+    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);
+}
+
+TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
+{
+    constexpr ck::index_t nelems = 8;
+    std::array<ck::index_t, nelems> host_data;
+
+    DeviceMem data_buf(nelems * sizeof(ck::index_t));
+    data_buf.ToDevice(&host_data[0]);
+    DeviceMem success_buf(sizeof(bool));
+
+    launch_and_time_kernel(StreamConfig{},
+                           TestTensorReadWriteDevice,
+                           dim3(1),
+                           dim3(1),
+                           nelems * sizeof(ck::index_t),
+                           data_buf.GetDeviceBuffer(),
+                           success_buf.GetDeviceBuffer());
+
+    bool success;
+    success_buf.FromDevice(&success);
+    EXPECT_TRUE(success);
+}
+
+TEST(TestTensor, Slicing)
+{
+    constexpr ck::index_t nelems = 8;
+
+    std::array<ck::index_t, nelems> data;
+    const auto shape   = ck::make_tuple(ck::make_tuple(2, 2), 2);
+    const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4);
+    const auto layout  = ck::wrapper::make_layout(shape, strides);
+    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
+    InitTensor(tensor);
+
+    auto tensor2x2x2 =
+        tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
+    EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
+    EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
+    EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
+    EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));
+
+    auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
+    EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
+    EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
+    EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
+    EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
+
+    auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
+    EXPECT_EQ(rank(tensor1x1), 2);
+    EXPECT_EQ(depth(tensor1x1), 2);
+    EXPECT_EQ(size(tensor1x1), 1);
+    EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1))));
+
+    auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
+    EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
+    EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
+    EXPECT_EQ(ck::wrapper::size(tensor2), 2);
+    EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0))));
+
+    // negative indexing
+    auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
+    EXPECT_EQ(rank(tensor1x2), 2);
+    EXPECT_EQ(depth(tensor1x2), 2);
+    EXPECT_EQ(size(tensor1x2), 2);
+    EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
+}

From 3246d1f693035929562240d8c73611345692bbbc Mon Sep 17 00:00:00 2001
From: abhimeda <138710508+abhimeda@users.noreply.github.com>
Date: Fri, 15 Dec 2023 12:41:35 -0500
Subject: [PATCH 17/18] Adding Issue Template (#1094)

* Add files via upload

* fixed extra space typo

* add mi300 GPU architectures and rocm versions 5.6.1 and 6.0.0

---------

Co-authored-by: illsilin <Illia.Silin@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
---
 .github/ISSUE_TEMPLATE/config.yml       |   1 +
 .github/ISSUE_TEMPLATE/issue_report.yml | 221 ++++++++++++++++++++++++
 2 files changed, 222 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml
 create mode 100644 .github/ISSUE_TEMPLATE/issue_report.yml

diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000..0086358db1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: true
diff --git a/.github/ISSUE_TEMPLATE/issue_report.yml b/.github/ISSUE_TEMPLATE/issue_report.yml
new file mode 100644
index 0000000000..ef6e6faa1b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/issue_report.yml
@@ -0,0 +1,221 @@
+name: Issue Report
+description: File a report for ROCm related issues on Linux and Windows. For issues pertaining to documentation or non-bug related, please open a blank issue located below.
+title: "[Issue]: "
+
+body:
+- type: markdown
+  attributes:
+    value: |
+      Thank you for taking the time to fill out this report!
+
+      You can acquire your OS, CPU, GPU (for filling out this report) with the following commands:
+
+      Linux:
+        echo "OS:" && cat /etc/os-release | grep -E "^(NAME=|VERSION=)";
+        echo "CPU: " && cat /proc/cpuinfo | grep "model name" | sort --unique;
+        echo "GPU:" && /opt/rocm/bin/rocminfo | grep -E "^\s*(Name|Marketing Name)";
+        
+      Windows:
+         (Get-WmiObject Win32_OperatingSystem).Version
+         (Get-WmiObject win32_Processor).Name
+         (Get-WmiObject win32_VideoController).Name
+- type: textarea
+  attributes:
+    label: Problem Description
+    description: Describe the issue you encountered.
+  validations:
+    required: true
+- type: input
+  attributes:
+    label: Operating System
+    description: What is the name and version number of the OS?
+    placeholder: "e.g. Ubuntu 22.04.3 LTS (Jammy Jellyfish)"
+  validations:
+    required: true
+- type: input
+  attributes:
+    label: CPU
+    description: What CPU did you encounter the issue on?
+    placeholder: "e.g. AMD Ryzen 9 5900HX with Radeon Graphics"
+  validations:
+    required: true
+- type: dropdown
+  attributes:
+    label: GPU
+    description: What GPU(s) did you encounter the issue on (you can select multiple GPUs from the list)
+    multiple: true
+    options:
+        - AMD Instinct MI300X
+        - AMD Instinct MI300A
+        - AMD Instinct MI300
+        - AMD Instinct MI250X
+        - AMD Instinct MI250
+        - AMD Instinct MI210
+        - AMD Instinct MI100
+        - AMD Instinct MI50
+        - AMD Instinct MI25
+        - AMD Radeon Pro V620
+        - AMD Radeon Pro VII
+        - AMD Radeon RX 7900 XTX
+        - AMD Radeon VII
+        - AMD Radeon Pro W7900
+        - AMD Radeon Pro W7800
+        - AMD Radeon Pro W6800
+        - AMD Radeon Pro W6600
+        - AMD Radeon Pro W5500
+        - AMD Radeon RX 7900 XT
+        - AMD Radeon RX 7600
+        - AMD Radeon RX 6950 XT
+        - AMD Radeon RX 6900 XT
+        - AMD Radeon RX 6800 XT
+        - AMD Radeon RX 6800
+        - AMD Radeon RX 6750
+        - AMD Radeon RX 6700 XT
+        - AMD Radeon RX 6700
+        - AMD Radeon RX 6650 XT
+        - AMD Radeon RX 6600 XT
+        - AMD Radeon RX 6600
+        - Other
+  validations:
+    required: true
+- type: input
+  attributes:
+    label: Other
+    description: If you selected Other, please specify
+- type: dropdown
+  attributes:
+    label: ROCm Version
+    description: What version(s) of ROCm did you encounter the issue on?
+    multiple: true
+    options:
+        - ROCm 6.0.0
+        - ROCm 5.7.1
+        - ROCm 5.7.0
+        - ROCm 5.6.1
+        - ROCm 5.6.0
+        - ROCm 5.5.1
+        - ROCm 5.5.0
+  validations:
+    required: true
+- type: dropdown
+  attributes:
+    label: ROCm Component
+    description: (Optional) If this issue relates to a specific ROCm component, it can be mentioned here.
+    multiple: true
+    options:
+        - Other
+        - AMD Common Language Runtime
+        - AMD MIGraphX
+        - AMD System Management Interface
+        - amdgpu KCL/autoconf
+        - amdgpu Kernel-mode GPU Driver
+        - amdgpu-install
+        - AOMP
+        - AOMP Extras
+        - AqlProfile
+        - build-infra
+        - chelsio
+        - clang-ocl
+        - Composable Kernel
+        - dkms
+        - docker / ROCm-docker
+        - flang
+        - gpuburn
+        - half
+        - HIP
+        - HIP Examples
+        - hipBLAS
+        - hipBLASLt
+        - HIPCC
+        - hipCUB
+        - hip-examples-private
+        - hipFFT
+        - hipfort
+        - HIPIFY
+        - hipRAND
+        - hipSOLVER
+        - hipSPARSE
+        - hipSPARSELt
+        - hipTensor
+        - hip-tests
+        - HSA Runtime
+        - infrastructure
+        - jenkins-utils
+        - libdrm
+        - Linux BPI packaging framework
+        - llvm-project
+        - Mesa
+        - meta
+        - MIOpen
+        - MIVisionX
+        - ml-framework-ci
+        - MLSEQA_TestRepo
+        - OpenCL API C++ Bindings
+        - OpenCL API Headers
+        - OpenCL Conformance Test Suite
+        - OpenCL ICD Loader
+        - perftest-p2p
+        - prototype
+        - RCCL
+        - rccl-rdma-sharp-plugins
+        - rocALUTION
+        - rocBLAS
+        - ROCdbgapi
+        - ROCdebug-agent
+        - rocFFT
+        - ROCgdb
+        - ROCK
+        - ROCm Documentation/Website
+        - ROCm Data Center Tool
+        - ROCm Examples
+        - ROCm for Windows
+        - ROCm Performance Primitives
+        - ROCm System Management Interface Library
+        - ROCm Thrust
+        - ROCm Validation Suite
+        - rocm_bandwidth_test
+        - rocm-cmake
+        - rocm-core
+        - rocm-docs-core
+        - rocminfo
+        - rocMLIR
+        - rocmtools
+        - rocPRIM
+        - rocprofiler
+        - rocRAND
+        - ROCR-Runtime
+        - rocSOLVER
+        - rocSPARSE
+        - roctracer
+        - ROCT-Thunk-Interface
+        - rocWMMA
+        - Tensile
+        - umr
+        - ibv_rc_pingpong-amd
+        - mellanox
+        - mpitest
+        - Pytorch
+        - Tensorflow
+        - APEX
+        - torchvision
+        - Magma
+- type: textarea
+  attributes:
+    label: Steps to Reproduce
+    description: (Optional) Detailed steps to reproduce the issue.
+  validations:
+    required: false
+    
+- type: textarea
+  attributes:
+    label: (Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
+    description: The output of rocminfo --support could help to better address the problem.
+  validations:
+    required: false
+
+- type: textarea
+  attributes:
+    label: Additional Information
+    description: (Optional) Any additional information that is relevant, e.g. relevant environment variables, dockerfiles, log files, dmesg output (on Linux), etc.
+  validations:
+    required: false

From dcedf3632f0e066c1712add65cb440622416363e Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Sat, 16 Dec 2023 09:17:40 -0800
Subject: [PATCH 18/18] Upgrade the default compiler to ROCm6.0 release.
 (#1103)

* upgrade to rocm6.0 compiler

* move rocm6.0 from private to public repo

* switch to testing hipTensor mainline in CI
---
 Dockerfile  |  6 +++---
 Jenkinsfile | 14 +++++++-------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 87b4eb8e2b..b9339ec5d4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:20.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=5.7
+ARG ROCMVERSION=6.0
 ARG compiler_version=""
 ARG compiler_commit=""
 
@@ -16,8 +16,8 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
-RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb  --no-check-certificate
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_5.7.50700-1_all.deb
+RUN wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb  --no-check-certificate
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb
 
 RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
     sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
diff --git a/Jenkinsfile b/Jenkinsfile
index 8f661e4780..2bb48b85ce 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -33,7 +33,7 @@ def runShell(String command){
 
 def getDockerImageName(){
     def img
-    if (params.ROCMVERSION != "6.0"){
+    if (params.ROCMVERSION != "6.1"){
        if (params.COMPILER_VERSION == "") {
            img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
        }
@@ -655,8 +655,8 @@ def process_results(Map conf=[:]){
 }
 
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
-                                              0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.0;COMPILER_VERSION=
+                                              0 21 * * * % ROCMVERSION=6.0;COMPILER_VERSION=;COMPILER_COMMIT=
                                               0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false
                                               0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""
 
@@ -675,8 +675,8 @@ pipeline {
             description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
         string(
             name: 'ROCMVERSION', 
-            defaultValue: '5.7', 
-            description: 'Specify which ROCM version to use: 5.7 (default).')
+            defaultValue: '6.0', 
+            description: 'Specify which ROCM version to use: 6.0 (default).')
         string(
             name: 'COMPILER_VERSION', 
             defaultValue: '', 
@@ -703,8 +703,8 @@ pipeline {
             description: "Use the CK build to verify hipTensor build and tests (default: ON)")
         string(
             name: 'hipTensor_branch',
-            defaultValue: 'develop',
-            description: 'Specify which branch of hipTensor to use (default: develop)')
+            defaultValue: 'mainline',
+            description: 'Specify which branch of hipTensor to use (default: mainline)')
         booleanParam(
             name: "USE_SCCACHE",
             defaultValue: true,