From 112d521b0939083573f931cba31089c3f31d6ac2 Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Sun, 3 Mar 2024 23:48:31 +0000
Subject: [PATCH] fix xx

---
 example/ck_tile/01_fmha/fmha_fwd.cpp          |    4 +-
 example/ck_tile/01_fmha/fmha_fwd.hpp          |   15 +-
 example/ck_tile/01_fmha/generate.py           |    2 +-
 include/ck_tile/core.hpp                      |    5 +-
 .../ck_tile/core/arch/amd_address_space.hpp   |   20 -
 .../core/arch/amd_buffer_addressing.hpp       | 1034 ++++++++---------
 include/ck_tile/core/arch/arch.hpp            |   61 +
 include/ck_tile/core/arch/utility.hpp         |   27 +
 include/ck_tile/core/config.hpp               |   12 +
 include/ck_tile/core/container/array.hpp      |   34 +-
 .../core/container/container_helper.hpp       |   17 +
 include/ck_tile/core/container/sequence.hpp   |    5 +-
 include/ck_tile/core/container/tuple.hpp      |  135 ++-
 include/ck_tile/core/numeric/bfloat16.hpp     |   41 +-
 include/ck_tile/core/numeric/float8.hpp       |   95 +-
 include/ck_tile/core/numeric/half.hpp         |    5 +-
 include/ck_tile/core/numeric/math.hpp         |    7 +-
 include/ck_tile/core/numeric/type_convert.hpp |   49 +-
 include/ck_tile/core/numeric/vector_type.hpp  |  327 ++----
 include/ck_tile/core/tensor/buffer_view.hpp   |  316 ++---
 include/ck_tile/core/tensor/load_tile.hpp     |    9 +-
 .../ck_tile/core/tensor/null_tile_window.hpp  |    1 +
 include/ck_tile/core/tensor/shuffle_tile.hpp  |   10 +-
 include/ck_tile/core/tensor/slice_tile.hpp    |    2 +-
 .../core/tensor/static_distributed_tensor.hpp |    1 +
 include/ck_tile/core/tensor/store_tile.hpp    |    4 +-
 .../ck_tile/core/tensor/tensor_adaptor.hpp    |   16 +-
 .../ck_tile/core/tensor/tensor_descriptor.hpp |    8 +-
 include/ck_tile/core/tensor/tensor_view.hpp   |   84 +-
 .../ck_tile/core/tensor/tile_distribution.hpp |    6 +-
 .../ck_tile/core/tensor/tile_elementwise.hpp  |   35 +-
 include/ck_tile/core/tensor/tile_window.hpp   |   64 +-
 include/ck_tile/core/utility/functional.hpp   |   14 +
 include/ck_tile/core/utility/to_sequence.hpp  |    9 +-
 .../core/utility/transpose_vectors.hpp        |  123 ++
 include/ck_tile/core/utility/type_convert.hpp |   57 -
 include/ck_tile/core/utility/type_traits.hpp  |   14 +
 include/ck_tile/host/check_err.hpp            |    6 +-
 include/ck_tile/host/kernel_launch.hpp        |    4 +-
 include/ck_tile/ops/common/README.md          |    4 +
 include/ck_tile/ops/epilogue.hpp              |    1 +
 include/ck_tile/ops/fmha.hpp                  |    1 +
 .../ops/fmha/kernel/fmha_fwd_kernel.hpp       |    9 +-
 .../pipeline/block_fmha_pipeline_problem.hpp  |   12 +-
 .../pipeline/block_fmha_pipeline_qr_ks_vs.hpp |   13 +-
 .../block_fmha_pipeline_qr_ks_vs_async.hpp    |   15 +-
 .../block_fmha_pipeline_qr_ks_vs_fp8.hpp      |   13 +-
 .../pipeline/block_fmha_pipeline_qs_ks_vs.hpp |   11 +-
 ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp |   66 +-
 include/ck_tile/ops/gemm.hpp                  |    1 +
 .../block/block_gemm_areg_bgmem_creg_v1.hpp   |   13 +-
 .../block/block_gemm_areg_bsmem_creg_v1.hpp   |   22 +-
 ...gemm_areg_bsmem_creg_v1_default_policy.hpp |   12 +-
 .../block/block_gemm_areg_bsmem_creg_v2.hpp   |   15 +-
 .../block/block_gemm_asmem_bsmem_creg_v1.hpp  |    6 +-
 ...emm_asmem_bsmem_creg_v1_default_policy.hpp |   12 +-
 ...lock_gemm_pipeline_agmem_bgmem_creg_v1.hpp |    4 +-
 ...lock_gemm_pipeline_agmem_bgmem_creg_v2.hpp |    4 +-
 include/ck_tile/ops/gemm/warp/warp_gemm.hpp   |    2 +
 .../gemm/warp/warp_gemm_attribute_mfma.hpp    |   69 +-
 .../warp/warp_gemm_attribute_mfma_impl.hpp    |  192 ++-
 .../ops/gemm/warp/warp_gemm_dispatcher.hpp    |   17 +-
 .../ck_tile/ops/gemm/warp/warp_gemm_impl.hpp  |   12 +-
 include/ck_tile/ops/reduce.hpp                |    1 +
 .../ck_tile/ops/reduce/block/block_reduce.hpp |    2 +-
 include/ck_tile/remod.py                      |   11 +
 66 files changed, 1720 insertions(+), 1498 deletions(-)
 delete mode 100644 include/ck_tile/core/arch/amd_address_space.hpp
 create mode 100644 include/ck_tile/core/arch/arch.hpp
 create mode 100644 include/ck_tile/core/arch/utility.hpp
 create mode 100644 include/ck_tile/core/utility/transpose_vectors.hpp
 delete mode 100644 include/ck_tile/core/utility/type_convert.hpp
 create mode 100644 include/ck_tile/ops/common/README.md

diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp
index 1b2183960c..9e11f4d19e 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -72,7 +72,7 @@ auto get_elimit(int /*init_method*/)
 }
 
 template <>
-auto get_elimit<ck_tile::bhalf_t>(int init_method)
+auto get_elimit<ck_tile::bf16_t>(int init_method)
 {
     if(init_method == 0)
     {
@@ -510,7 +510,7 @@ int main(int argc, char* argv[])
     }
     else if(data_type == "bf16")
     {
-        return run<ck_tile::bhalf_t>(arg_parser) ? 0 : -2;
+        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
     }
     else if(data_type == "fp8")
     {
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index eb11efb2e2..325ff6b78a 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -7,7 +7,6 @@
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/fmha.hpp"
 #include "ck_tile/ops/epilogue.hpp"
-#include "ck_tile/ops/common.hpp"
 #include "mask.hpp"
 
 template <typename DataType>
@@ -29,18 +28,18 @@ struct FmhaFwdTypeConfig<ck_tile::half_t>
 };
 
 template <>
-struct FmhaFwdTypeConfig<ck_tile::bhalf_t>
+struct FmhaFwdTypeConfig<ck_tile::bf16_t>
 {
-    using QDataType           = ck_tile::bhalf_t;
-    using KDataType           = ck_tile::bhalf_t;
-    using VDataType           = ck_tile::bhalf_t;
-    using BiasDataType        = ck_tile::bhalf_t;
+    using QDataType           = ck_tile::bf16_t;
+    using KDataType           = ck_tile::bf16_t;
+    using VDataType           = ck_tile::bf16_t;
+    using BiasDataType        = ck_tile::bf16_t;
     using LSEDataType         = float;       // data type for lse(logsumexp L_j = max_j + log(l_j))
     using SaccDataType        = float;       // data type for first gemm accumulation
     using SMPLComputeDataType = float;       // data type for reduction, softmax
-    using PDataType           = ck_tile::bhalf_t; // data type for A matrix of second gemm
+    using PDataType           = ck_tile::bf16_t; // data type for A matrix of second gemm
     using OaccDataType        = float;       // data type for second gemm accumulation
-    using ODataType           = ck_tile::bhalf_t;
+    using ODataType           = ck_tile::bf16_t;
 };
 
 template <>
diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py
index b3be008f09..66feae6a5d 100644
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -11,7 +11,7 @@ import copy
 
 DTYPE_MAP = {
     "fp16": "ck_tile::half_t",
-    "bf16": "ck_tile::bhalf_t",
+    "bf16": "ck_tile::bf16_t",
     "fp8" : "ck_tile::fp8_t"
 }
 
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 0123163a65..f53a6b0fd6 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -6,8 +6,9 @@
 #include "ck_tile/core/algorithm/cluster_descriptor.hpp"
 #include "ck_tile/core/algorithm/coordinate_transform.hpp"
 #include "ck_tile/core/algorithm/space_filling_curve.hpp"
-#include "ck_tile/core/arch/amd_address_space.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/arch/utility.hpp"
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/container/array.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
@@ -51,6 +52,6 @@
 #include "ck_tile/core/utility/magic_div.hpp"
 #include "ck_tile/core/utility/random.hpp"
 #include "ck_tile/core/utility/to_sequence.hpp"
-#include "ck_tile/core/utility/type_convert.hpp"
+#include "ck_tile/core/utility/transpose_vectors.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
diff --git a/include/ck_tile/core/arch/amd_address_space.hpp b/include/ck_tile/core/arch/amd_address_space.hpp
deleted file mode 100644
index 19a9ded568..0000000000
--- a/include/ck_tile/core/arch/amd_address_space.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-// Address Space for AMDGCN
-// https://llvm.org/docs/AMDGPUUsage.html#address-space
-
-namespace ck_tile {
-
-enum struct address_space_enum
-{
-    generic,
-    global,
-    lds,
-    sgpr,
-    vgpr,
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index cfba73f74d..9a7c95f4c2 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -5,53 +5,27 @@
 
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/numeric/vector_type.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
-#include "ck_tile/core/numeric/vector_type.hpp"
+#include "ck_tile/core/utility/bit_cast.hpp"
+#include "ck_tile/core/utility/functional.hpp"
 
 namespace ck_tile {
 
-template <typename T>
-union buffer_resource
+// 128 bit SGPRs to supply buffer resource in buffer instructions
+// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
+struct __attribute__((packed)) buffer_resource
 {
-    CK_TILE_DEVICE constexpr buffer_resource() : content{} {}
-
-    // 128 bit SGPRs to supply buffer resource in buffer instructions
-    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
-    int32x4_t content;
-    statically_indexed_array<T*, 2> address;
-    statically_indexed_array<int32_t, 4> range;
-    statically_indexed_array<int32_t, 4> config;
+    const void* ptr;
+    uint32_t range;
+    uint32_t config;
 };
 
-template <typename T>
-CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
+CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff)
 {
-    buffer_resource<T> wave_buffer_resource;
-
-    // wavewise base address (64 bit)
-    wave_buffer_resource.address(number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
-    // wavewise range (32 bit)
-    wave_buffer_resource.range(number<2>{}) = element_space_size * sizeof(T);
-    // wavewise setting (32 bit)
-    wave_buffer_resource.config(number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
-
-    return wave_buffer_resource.content;
-}
-
-template <typename T>
-CK_TILE_DEVICE int32x4_t make_wave_buffer_resource_with_default_range(T* p_wave)
-{
-    buffer_resource<T> wave_buffer_resource;
-
-    // wavewise base address (64 bit)
-    wave_buffer_resource.address(number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
-    // wavewise range (32 bit)
-    wave_buffer_resource.range(number<2>{}) = 0xffffffff; // max possible range
-    // wavewise setting (32 bit)
-    wave_buffer_resource.config(number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
-
-    return wave_buffer_resource.content;
+    buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD};
+    return __builtin_bit_cast(int32x4_t, res);
 }
 
 // TODO: glc/slc/...
@@ -73,9 +47,9 @@ struct buffer_load<16>
                                    index_t /*flag*/ = 0)
     {
         static_assert(sizeof(T) == 16);
-        using mubuf_t = float __attribute__((ext_vector_type(4)));
+        using mbuf_t = fp32x4_t;
         asm volatile("buffer_load_dwordx4 %0, %1, %2, %3 offen offset:%4"
-                     : "+v"(reinterpret_cast<mubuf_t&>(value))
+                     : "+v"(reinterpret_cast<mbuf_t&>(value))
                      : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
                      : "memory");
     }
@@ -93,9 +67,9 @@ struct buffer_load<8>
                                    index_t /*flag*/ = 0)
     {
         static_assert(sizeof(T) == 8);
-        using mubuf_t = float __attribute__((ext_vector_type(2)));
+        using mbuf_t = fp32x2_t;
         asm volatile("buffer_load_dwordx2 %0, %1, %2, %3 offen offset:%4"
-                     : "+v"(reinterpret_cast<mubuf_t&>(value))
+                     : "+v"(reinterpret_cast<mbuf_t&>(value))
                      : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
                      : "memory");
     }
@@ -113,9 +87,9 @@ struct buffer_load<4>
                                    index_t /*flag*/ = 0)
     {
         static_assert(sizeof(T) == 4);
-        using mubuf_t = float;
+        using mbuf_t = float;
         asm volatile("buffer_load_dword %0, %1, %2, %3 offen offset:%4"
-                     : "+v"(reinterpret_cast<mubuf_t&>(value))
+                     : "+v"(reinterpret_cast<mbuf_t&>(value))
                      : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
                      : "memory");
     }
@@ -133,9 +107,9 @@ struct buffer_load<2>
                                    index_t /*flag*/ = 0)
     {
         static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
-        using mubuf_t = float;
+        using mbuf_t = float;
         asm volatile("buffer_load_ushort %0, %1, %2, %3 offen offset:%4"
-                     : "+v"(reinterpret_cast<mubuf_t&>(value))
+                     : "+v"(reinterpret_cast<mbuf_t&>(value))
                      : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
                      : "memory");
     }
@@ -153,9 +127,9 @@ struct buffer_load<1>
                                    index_t /*flag*/ = 0)
     {
         static_assert(sizeof(T) == 4);
-        using mubuf_t = float;
+        using mbuf_t = float;
         asm volatile("buffer_load_ubyte %0, %1, %2, %3 offen offset:%4"
-                     : "+v"(reinterpret_cast<mubuf_t&>(value))
+                     : "+v"(reinterpret_cast<mbuf_t&>(value))
                      : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
                      : "memory");
     }
@@ -177,13 +151,13 @@ struct buffer_load_if<16>
     {
         static_assert(sizeof(T) == 16);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mubuf_t   = float __attribute__((ext_vector_type(4)));
-        static_assert(sizeof(mubuf_t) == sizeof(T));
+        using mbuf_t    = fp32x4_t;
+        static_assert(sizeof(mbuf_t) == sizeof(T));
         asm volatile(
             "v_cmpx_le_u32 exec, 1, %5\n"
             "buffer_load_dwordx4 %0, %1, %2, %3 offen offset:%4\n"
             "s_mov_b64 exec %6"
-            : "+v"(reinterpret_cast<mubuf_t&>(value))
+            : "+v"(reinterpret_cast<mbuf_t&>(value))
             : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
             : "memory");
     }
@@ -202,12 +176,12 @@ struct buffer_load_if<8>
     {
         static_assert(sizeof(T) == 8);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mubuf_t   = float __attribute__((ext_vector_type(2)));
+        using mbuf_t    = fp32x2_t;
         asm volatile(
             "v_cmpx_le_u32 exec, 1, %5\n"
             "buffer_load_dwordx2 %0, %1, %2, %3 offen offset:%4\n"
             "s_mov_b64 exec %6"
-            : "+v"(reinterpret_cast<mubuf_t&>(value))
+            : "+v"(reinterpret_cast<mbuf_t&>(value))
             : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
             : "memory");
     }
@@ -226,12 +200,12 @@ struct buffer_load_if<4>
     {
         static_assert(sizeof(T) == 4);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mubuf_t   = float;
+        using mbuf_t    = float;
         asm volatile(
             "v_cmpx_le_u32 exec, 1, %5\n"
             "buffer_load_dword %0, %1, %2, %3 offen offset:%4\n"
             "s_mov_b64 exec %6"
-            : "+v"(reinterpret_cast<mubuf_t&>(value))
+            : "+v"(reinterpret_cast<mbuf_t&>(value))
             : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
             : "memory");
     }
@@ -250,12 +224,12 @@ struct buffer_load_if<2>
     {
         static_assert(sizeof(T) == 4);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mubuf_t   = float;
+        using mbuf_t    = float;
         asm volatile(
             "v_cmpx_le_u32 exec, 1, %5\n"
             "buffer_load_ushort %0, %1, %2, %3 offen offset:%4\n"
             "s_mov_b64 exec %6"
-            : "+v"(reinterpret_cast<mubuf_t&>(value))
+            : "+v"(reinterpret_cast<mbuf_t&>(value))
             : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
             : "memory");
     }
@@ -274,12 +248,12 @@ struct buffer_load_if<1>
     {
         static_assert(sizeof(T) == 4);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mubuf_t   = float;
+        using mbuf_t    = float;
         asm volatile(
             "v_cmpx_le_u32 exec, 1, %5\n"
             "buffer_load_ubyte %0, %1, %2, %3 offen offset:%4\n"
             "s_mov_b64 exec %6"
-            : "+v"(reinterpret_cast<mubuf_t&>(value))
+            : "+v"(reinterpret_cast<mbuf_t&>(value))
             : "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset), "v"(flag), "s"(saved_exec)
             : "memory");
     }
@@ -300,10 +274,12 @@ struct buffer_store<16>
                                    index_t /*flag*/ = 1)
     {
         static_assert(sizeof(T) == 16);
-        asm volatile("buffer_store_dwordx4 %0, %1, %2, %3 offen offset:%4"
-                     :
-                     : "v"(value), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
-                     : "memory");
+        using mbuf_t = fp32x4_t;
+        asm volatile(
+            "buffer_store_dwordx4 %0, %1, %2, %3 offen offset:%4"
+            :
+            : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
+            : "memory");
     }
 };
 
@@ -319,10 +295,12 @@ struct buffer_store<8>
                                    index_t /*flag*/ = 1)
     {
         static_assert(sizeof(T) == 8);
-        asm volatile("buffer_store_dwordx2 %0, %1, %2, %3 offen offset:%4"
-                     :
-                     : "v"(value), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
-                     : "memory");
+        using mbuf_t = fp32x2_t;
+        asm volatile(
+            "buffer_store_dwordx2 %0, %1, %2, %3 offen offset:%4"
+            :
+            : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
+            : "memory");
     }
 };
 
@@ -338,10 +316,12 @@ struct buffer_store<4>
                                    index_t /*flag*/ = 1)
     {
         static_assert(sizeof(T) == 4);
-        asm volatile("buffer_store_dword %0, %1, %2, %3 offen offset:%4"
-                     :
-                     : "v"(value), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
-                     : "memory");
+        using mbuf_t = float;
+        asm volatile(
+            "buffer_store_dword %0, %1, %2, %3 offen offset:%4"
+            :
+            : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
+            : "memory");
     }
 };
 
@@ -356,11 +336,13 @@ struct buffer_store<2>
                                    index_t i_offset /*max 0xFFF*/,
                                    index_t /*flag*/ = 1)
     {
-        static_assert(sizeof(T) == 2);
-        asm volatile("buffer_store_short %0, %1, %2, %3 offen offset:%4"
-                     :
-                     : "v"(value), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
-                     : "memory");
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = float;
+        asm volatile(
+            "buffer_store_short %0, %1, %2, %3 offen offset:%4"
+            :
+            : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
+            : "memory");
     }
 };
 
@@ -375,11 +357,13 @@ struct buffer_store<1>
                                    index_t i_offset /*max 0xFFF*/,
                                    index_t /*flag*/ = 1)
     {
-        static_assert(sizeof(T) == 1);
-        asm volatile("buffer_store_byte %0, %1, %2, %3 offen offset:%4"
-                     :
-                     : "v"(value), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
-                     : "memory");
+        static_assert(sizeof(T) == 4);
+        using mbuf_t = float;
+        asm volatile(
+            "buffer_store_byte %0, %1, %2, %3 offen offset:%4"
+            :
+            : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "s"(s_offset), "n"(i_offset)
+            : "memory");
     }
 };
 
@@ -399,11 +383,12 @@ struct buffer_store_if<16>
     {
         static_assert(sizeof(T) == 16);
         auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = fp32x4_t;
         asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
                      "buffer_store_dwordx4 %0, %1, %2, %3 offen offset:%4\n"
                      "s_mov_b64 exec %6"
                      :
-                     : "v"(value),
+                     : "v"(bit_cast<mbuf_t>(value)),
                        "v"(v_offset),
                        "s"(res),
                        "s"(s_offset),
@@ -427,11 +412,12 @@ struct buffer_store_if<8>
     {
         static_assert(sizeof(T) == 8);
         auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = fp32x2_t;
         asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
                      "buffer_store_dwordx2 %0, %1, %2, %3 offen offset:%4\n"
                      "s_mov_b64 exec %6"
                      :
-                     : "v"(value),
+                     : "v"(bit_cast<mbuf_t>(value)),
                        "v"(v_offset),
                        "s"(res),
                        "s"(s_offset),
@@ -455,11 +441,12 @@ struct buffer_store_if<4>
     {
         static_assert(sizeof(T) == 4);
         auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = float;
         asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
                      "buffer_store_dword %0, %1, %2, %3 offen offset:%4\n"
                      "s_mov_b64 exec %6"
                      :
-                     : "v"(value),
+                     : "v"(bit_cast<mbuf_t>(value)),
                        "v"(v_offset),
                        "s"(res),
                        "s"(s_offset),
@@ -481,13 +468,14 @@ struct buffer_store_if<2>
                                    index_t i_offset /*max 0xFFF*/,
                                    index_t flag = 1)
     {
-        static_assert(sizeof(T) == 2);
+        static_assert(sizeof(T) == 4);
         auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = float;
         asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
                      "buffer_store_short %0, %1, %2, %3 offen offset:%4\n"
                      "s_mov_b64 exec %6"
                      :
-                     : "v"(value),
+                     : "v"(bit_cast<mbuf_t>(value)),
                        "v"(v_offset),
                        "s"(res),
                        "s"(s_offset),
@@ -509,13 +497,14 @@ struct buffer_store_if<1>
                                    index_t i_offset /*max 0xFFF*/,
                                    index_t flag = 1)
     {
-        static_assert(sizeof(T) == 1);
+        static_assert(sizeof(T) == 4);
         auto save_exec = __builtin_amdgcn_read_exec();
+        using mbuf_t   = float;
         asm volatile("v_cmpx_le_u32 exec, 1, %5\n"
                      "buffer_store_byte %0, %1, %2, %3 offen offset:%4\n"
                      "s_mov_b64 exec %6"
                      :
-                     : "v"(value),
+                     : "v"(bit_cast<mbuf_t>(value)),
                        "v"(v_offset),
                        "s"(res),
                        "s"(s_offset),
@@ -540,44 +529,44 @@ namespace impl{
 // TODO: may have scratch (because this is memory?)
 //       need to reduce extra move inside compiler
 template<index_t N>
-CK_TILE_DEVICE void insert_dummy_dep_per_dword(static_buffer_c<float, N>& b)
+CK_TILE_DEVICE void insert_dummy_dep_per_dword(array<float, N>& b)
 {
     for (auto i = 0; i < b.size(); i++) asm volatile(" " : : "v"(b.get(i)) : "memory");
 }
 
 template<>
-CK_TILE_DEVICE void insert_dummy_dep_per_dword<2>(static_buffer_c<float, 2>& b)
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<2>(array<float, 2>& b)
 {
     asm volatile(" " : : "v"(b.get(0)), "v"(b.get(1)) : "memory");
 }
 
 template<>
-CK_TILE_DEVICE void insert_dummy_dep_per_dword<3>(static_buffer_c<float, 3>& b)
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<3>(array<float, 3>& b)
 {
     asm volatile(" " : : "v"(b.get(0)), "v"(b.get(1)), "v"(b.get(2)) : "memory");
 }
 
 template<>
-CK_TILE_DEVICE void insert_dummy_dep_per_dword<4>(static_buffer_c<float, 4>& b)
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<4>(array<float, 4>& b)
 {
     asm volatile(" " : : "v"(b.get(0)), "v"(b.get(1)), "v"(b.get(2)), "v"(b.get(3)) : "memory");
 }
 
 template<>
-CK_TILE_DEVICE void insert_dummy_dep_per_dword<8>(static_buffer_c<float, 8>& b)
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<8>(array<float, 8>& b)
 {
     asm volatile(" " : : "v"(b.get(0)), "v"(b.get(1)), "v"(b.get(2)), "v"(b.get(3)), "v"(b.get(4)), "v"(b.get(5)), "v"(b.get(6)), "v"(b.get(7)) : "memory");
 }
 
 template<>
-CK_TILE_DEVICE void insert_dummy_dep_per_dword<16>(static_buffer_c<float, 16>& b)
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<16>(array<float, 16>& b)
 {
     asm volatile(" " : : "v"(b.get(0)), "v"(b.get(1)), "v"(b.get(2)), "v"(b.get(3)), "v"(b.get(4)), "v"(b.get(5)), "v"(b.get(6)), "v"(b.get(7)),
                        "v"(b.get(8)), "v"(b.get(9)), "v"(b.get(10)), "v"(b.get(11)), "v"(b.get(12)), "v"(b.get(13)), "v"(b.get(14)), "v"(b.get(15)) : "memory");
 }
 
 template<>
-CK_TILE_DEVICE void insert_dummy_dep_per_dword<32>(static_buffer_c<float, 32>& b)
+CK_TILE_DEVICE void insert_dummy_dep_per_dword<32>(array<float, 32>& b)
 {
     asm volatile(" " : : "v"(b.get(0)), "v"(b.get(1)), "v"(b.get(2)), "v"(b.get(3)), "v"(b.get(4)), "v"(b.get(5)), "v"(b.get(6)), "v"(b.get(7)),
                        "v"(b.get(8)), "v"(b.get(9)), "v"(b.get(10)), "v"(b.get(11)), "v"(b.get(12)), "v"(b.get(13)), "v"(b.get(14)), "v"(b.get(15)),
@@ -591,7 +580,7 @@ template<typename T>
 CK_TILE_DEVICE void insert_dummy_dep(T & buffer)
 {
     // TODO: indeed we expect T to be multiple of dword. subdword is always buggy
-    using da_type = static_buffer_c<float, (sizeof(T) + 3) / 4>;
+    using da_type = array<float, (sizeof(T) + 3) / 4>;
     auto & dummy = reinterpret_cast<da_type&>(buffer);
     insert_dummy_dep_per_dword(dummy);
 }
@@ -636,19 +625,19 @@ llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
 
 // buffer load i16
-CK_TILE_DEVICE bhalf_t
+CK_TILE_DEVICE int16_t
 llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16");
 
-CK_TILE_DEVICE bhalf2_t
+CK_TILE_DEVICE int16x2_t
 llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16");
 
-CK_TILE_DEVICE bhalf4_t
+CK_TILE_DEVICE int16x4_t
 llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
@@ -674,19 +663,19 @@ llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
 
 // buffer load fp16
-CK_TILE_DEVICE half_t
+CK_TILE_DEVICE fp16_t
 llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
 
-CK_TILE_DEVICE half2_t
+CK_TILE_DEVICE fp16x2_t
 llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
 
-CK_TILE_DEVICE half4_t
+CK_TILE_DEVICE fp16x4_t
 llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
                                    index_t voffset,
                                    index_t soffset,
@@ -699,13 +688,13 @@ llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
 
-CK_TILE_DEVICE float2_t
+CK_TILE_DEVICE fp32x2_t
 llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
 
-CK_TILE_DEVICE float4_t
+CK_TILE_DEVICE fp32x4_t
 llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
                                    index_t voffset,
                                    index_t soffset,
@@ -735,21 +724,21 @@ llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
 
 // buffer store i16
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata,
+llvm_amdgcn_raw_buffer_store_i16(bf16_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
 
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata,
+llvm_amdgcn_raw_buffer_store_i16x2(bf16x2_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
 
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata,
+llvm_amdgcn_raw_buffer_store_i16x4(bf16x4_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
@@ -779,21 +768,21 @@ llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
 
 // buffer store fp16
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_fp16(half_t vdata,
+llvm_amdgcn_raw_buffer_store_fp16(fp16_t vdata,
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
 
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata,
+llvm_amdgcn_raw_buffer_store_fp16x2(fp16x2_t vdata,
                                     int32x4_t rsrc,
                                     index_t voffset,
                                     index_t soffset,
                                     index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
 
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
+llvm_amdgcn_raw_buffer_store_fp16x4(fp16x4_t vdata,
                                     int32x4_t rsrc,
                                     index_t voffset,
                                     index_t soffset,
@@ -808,22 +797,22 @@ llvm_amdgcn_raw_buffer_store_fp32(float vdata,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
 
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata,
+llvm_amdgcn_raw_buffer_store_fp32x2(fp32x2_t vdata,
                                     int32x4_t rsrc,
                                     index_t voffset,
                                     index_t soffset,
                                     index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
 
 CK_TILE_DEVICE void
-llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
+llvm_amdgcn_raw_buffer_store_fp32x4(fp32x4_t vdata,
                                     int32x4_t rsrc,
                                     index_t voffset,
                                     index_t soffset,
                                     index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
 
 // buffer atomic-add fp16
-CK_TILE_DEVICE half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
-    half2_t vdata,
+CK_TILE_DEVICE fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
+    fp16x2_t vdata,
     int32x4_t rsrc,
     index_t voffset,
     index_t soffset,
@@ -886,20 +875,21 @@ enum struct amd_buffer_coherence_enum
 
 template <index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
-CK_TILE_DEVICE typename vector_type<int8_t, N>::type
-amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
-                         index_t src_thread_addr_offset,
-                         index_t src_wave_addr_offset)
+CK_TILE_DEVICE array<int8_t, N> amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
+                                                                index_t src_thread_addr_offset,
+                                                                index_t src_wave_addr_offset)
 {
     static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                   "wrong! not implemented");
 
+    using rtn_type = array<int8_t, N>;
+
     if constexpr(N == 1)
     {
-        return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
-                                              src_thread_addr_offset,
-                                              src_wave_addr_offset,
-                                              static_cast<index_t>(coherence));
+        return bit_cast<rtn_type>(llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
+                                                                 src_thread_addr_offset,
+                                                                 src_wave_addr_offset,
+                                                                 static_cast<index_t>(coherence)));
     }
     else if constexpr(N == 2)
     {
@@ -909,7 +899,7 @@ amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
                                                       src_wave_addr_offset,
                                                       static_cast<index_t>(coherence));
 
-        return bit_cast<int8x2_t>(tmp);
+        return bit_cast<rtn_type>(tmp);
     }
     else if constexpr(N == 4)
     {
@@ -918,7 +908,7 @@ amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
                                                       src_wave_addr_offset,
                                                       static_cast<index_t>(coherence));
 
-        return bit_cast<int8x4_t>(tmp);
+        return bit_cast<rtn_type>(tmp);
     }
     else if constexpr(N == 8)
     {
@@ -927,7 +917,7 @@ amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
 
-        return bit_cast<int8x8_t>(tmp);
+        return bit_cast<rtn_type>(tmp);
     }
     else if constexpr(N == 16)
     {
@@ -935,7 +925,7 @@ amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
-        return bit_cast<int8x16_t>(tmp);
+        return bit_cast<rtn_type>(tmp);
     }
     else if constexpr(N == 32)
     {
@@ -948,12 +938,12 @@ amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
                                               src_thread_addr_offset,
                                               src_wave_addr_offset + 4 * sizeof(int32_t),
                                               static_cast<index_t>(coherence));
-        vector_type<int32_t, 8> tmp;
+        array<int32_t, 8> tmp;
 
-        tmp.AsType<int32x4_t>()(number<0>{}) = tmp0;
-        tmp.AsType<int32x4_t>()(number<1>{}) = tmp1;
+        tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
+        tmp.template get_as<int32x4_t>()(number<1>{}) = tmp1;
 
-        return bit_cast<int8x32_t>(tmp);
+        return bit_cast<rtn_type>(tmp);
     }
     else if constexpr(N == 64)
     {
@@ -977,14 +967,14 @@ amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
                                               src_wave_addr_offset + 12 * sizeof(int32_t),
                                               static_cast<index_t>(coherence));
 
-        vector_type<int32_t, 16> tmp;
+        array<int32_t, 16> tmp;
 
-        tmp.AsType<int32x4_t>()(number<0>{}) = tmp0;
-        tmp.AsType<int32x4_t>()(number<1>{}) = tmp1;
-        tmp.AsType<int32x4_t>()(number<2>{}) = tmp2;
-        tmp.AsType<int32x4_t>()(number<3>{}) = tmp3;
+        tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
+        tmp.template get_as<int32x4_t>()(number<1>{}) = tmp1;
+        tmp.template get_as<int32x4_t>()(number<2>{}) = tmp2;
+        tmp.template get_as<int32x4_t>()(number<3>{}) = tmp3;
 
-        return bit_cast<int8x64_t>(tmp);
+        return bit_cast<rtn_type>(tmp);
     }
 }
 
@@ -995,150 +985,161 @@ amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
 template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
-CK_TILE_DEVICE typename vector_type<T, N>::type
-amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
-                     index_t src_thread_addr_offset,
-                     index_t src_wave_addr_offset)
+CK_TILE_DEVICE array<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
+                                                index_t src_thread_addr_offset,
+                                                index_t src_wave_addr_offset)
 {
     static_assert(
-        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
-            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+        (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, int32_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
         "wrong! not implemented");
 
-    if constexpr(is_same<T, float>::value) // fp32
+    using rtn_type = array<T, N>;
+
+    if constexpr(std::is_same<T, float>::value) // fp32
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource,
-                                                    src_thread_addr_offset,
-                                                    src_wave_addr_offset,
-                                                    static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
-                                                      src_thread_addr_offset,
-                                                      src_wave_addr_offset,
-                                                      static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
-                                                      src_thread_addr_offset,
-                                                      src_wave_addr_offset,
-                                                      static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 8)
         {
-            vector_type<float, 8> tmp;
+            array<float, 8> tmp;
 
-            tmp.AsType<float4_t>()(number<0>{}) =
+            tmp.template get_as<fp32x4_t>()(number<0>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset,
                                                    static_cast<index_t>(coherence));
 
-            tmp.AsType<float4_t>()(number<1>{}) =
+            tmp.template get_as<fp32x4_t>()(number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 4 * sizeof(float),
                                                    static_cast<index_t>(coherence));
 
-            return tmp.AsType<float8_t>()(number<0>{});
+            return tmp;
         }
         else if constexpr(N == 16)
         {
-            vector_type<float, 16> tmp;
+            array<float, 16> tmp;
 
-            tmp.AsType<float4_t>()(number<0>{}) =
+            tmp.template get_as<fp32x4_t>()(number<0>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset,
                                                    static_cast<index_t>(coherence));
 
-            tmp.AsType<float4_t>()(number<1>{}) =
+            tmp.template get_as<fp32x4_t>()(number<1>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 4 * sizeof(float),
                                                    static_cast<index_t>(coherence));
 
-            tmp.AsType<float4_t>()(number<2>{}) =
+            tmp.template get_as<fp32x4_t>()(number<2>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 8 * sizeof(float),
                                                    static_cast<index_t>(coherence));
 
-            tmp.AsType<float4_t>()(number<3>{}) =
+            tmp.template get_as<fp32x4_t>()(number<3>{}) =
                 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                    src_thread_addr_offset,
                                                    src_wave_addr_offset + 12 * sizeof(float),
                                                    static_cast<index_t>(coherence));
 
-            return tmp.AsType<float16_t>()(number<0>{});
+            return tmp;
         }
     }
-    else if constexpr(is_same<T, half_t>::value) // fp16
+    else if constexpr(std::is_same<T, fp16_t>::value) // fp16
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource,
-                                                    src_thread_addr_offset,
-                                                    src_wave_addr_offset,
-                                                    static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource,
+                                                 src_thread_addr_offset,
+                                                 src_wave_addr_offset,
+                                                 static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource,
-                                                      src_thread_addr_offset,
-                                                      src_wave_addr_offset,
-                                                      static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
-                                                      src_thread_addr_offset,
-                                                      src_wave_addr_offset,
-                                                      static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
+                                                   src_thread_addr_offset,
+                                                   src_wave_addr_offset,
+                                                   static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 8)
         {
             // use fp32 load to mimic fp16 load
-            float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
+            fp32x4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
                                                               src_thread_addr_offset,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
 
-            return bit_cast<half8_t>(tmp);
+            return bit_cast<rtn_type>(tmp);
         }
     }
-    else if constexpr(is_same<T, bhalf_t>::value) // bf16
+    else if constexpr(std::is_same<T, bf16_t>::value) // bf16
     {
         if constexpr(N == 1)
         {
-            return llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
-                                                   src_thread_addr_offset,
-                                                   src_wave_addr_offset,
-                                                   static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
+                                                src_thread_addr_offset,
+                                                src_wave_addr_offset,
+                                                static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 2)
         {
-            return llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource,
-                                                     src_thread_addr_offset,
-                                                     src_wave_addr_offset,
-                                                     static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 4)
         {
-            return llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource,
-                                                     src_thread_addr_offset,
-                                                     src_wave_addr_offset,
-                                                     static_cast<index_t>(coherence));
+            return bit_cast<rtn_type>(
+                llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource,
+                                                  src_thread_addr_offset,
+                                                  src_wave_addr_offset,
+                                                  static_cast<index_t>(coherence)));
         }
         else if constexpr(N == 8)
         {
@@ -1147,17 +1148,15 @@ amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
 
-            return bit_cast<bhalf8_t>(tmp);
+            return bit_cast<rtn_type>(tmp);
         }
     }
     else // other datatype
     {
-        using r_t = typename vector_type<T, N>::type;
-
-        auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
+        auto raw_data = amd_buffer_load_impl_with_bytes<sizeof(T) * N, coherence>(
             src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
 
-        return bit_cast<r_t>(raw_data);
+        return bit_cast<rtn_type>(raw_data);
     }
 }
 
@@ -1165,7 +1164,7 @@ template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
           bool oob_conditional_check          = true>
-CK_TILE_DEVICE void amd_buffer_load_raw_impl(typename vector_type<T, N>::type& dst,
+CK_TILE_DEVICE void amd_buffer_load_raw_impl(array<T, N>& dst,
                                              int32x4_t src_wave_buffer_resource,
                                              index_t src_thread_addr_offset,
                                              index_t src_wave_addr_offset,
@@ -1175,7 +1174,7 @@ CK_TILE_DEVICE void amd_buffer_load_raw_impl(typename vector_type<T, N>::type& d
     static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
                   "wrong! not supported by buffer_load instruction");
 
-    using type = typename vector_type<T, N>::type;
+    using type = array<T, N>;
     if constexpr(oob_conditional_check)
     {
         buffer_load_if<sizeof(type)>{}(
@@ -1208,18 +1207,17 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
 
 template <index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
-CK_TILE_DEVICE void
-amd_buffer_store_impl_raw(const typename vector_type<int8_t, N>::type src_thread_data,
-                          int32x4_t dst_wave_buffer_resource,
-                          index_t dst_thread_addr_offset,
-                          index_t dst_wave_addr_offset)
+CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const array<int8_t, N> src_thread_data,
+                                                     int32x4_t dst_wave_buffer_resource,
+                                                     index_t dst_thread_addr_offset,
+                                                     index_t dst_wave_addr_offset)
 {
     static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                   "wrong! not implemented");
 
     if constexpr(N == 1)
     {
-        llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
+        llvm_amdgcn_raw_buffer_store_i8(bit_cast<int8_t>(src_thread_data),
                                         dst_wave_buffer_resource,
                                         dst_thread_addr_offset,
                                         dst_wave_addr_offset,
@@ -1260,74 +1258,77 @@ amd_buffer_store_impl_raw(const typename vector_type<int8_t, N>::type src_thread
     }
     else if constexpr(N == 32)
     {
-        vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<0>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset,
+            static_cast<index_t>(coherence));
 
-        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[number<0>{}],
-                                           dst_wave_buffer_resource,
-                                           dst_thread_addr_offset,
-                                           dst_wave_addr_offset,
-                                           static_cast<index_t>(coherence));
-
-        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[number<1>{}],
-                                           dst_wave_buffer_resource,
-                                           dst_thread_addr_offset,
-                                           dst_wave_addr_offset + sizeof(int32_t) * 4,
-                                           static_cast<index_t>(coherence));
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<1>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 4,
+            static_cast<index_t>(coherence));
     }
     else if constexpr(N == 64)
     {
-        vector_type<int32_t, 16> tmp{bit_cast<int32x16_t>(src_thread_data)};
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<0>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset,
+            static_cast<index_t>(coherence));
 
-        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[number<0>{}],
-                                           dst_wave_buffer_resource,
-                                           dst_thread_addr_offset,
-                                           dst_wave_addr_offset,
-                                           static_cast<index_t>(coherence));
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<1>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 4,
+            static_cast<index_t>(coherence));
 
-        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[number<1>{}],
-                                           dst_wave_buffer_resource,
-                                           dst_thread_addr_offset,
-                                           dst_wave_addr_offset + sizeof(int32_t) * 4,
-                                           static_cast<index_t>(coherence));
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<2>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 8,
+            static_cast<index_t>(coherence));
 
-        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[number<2>{}],
-                                           dst_wave_buffer_resource,
-                                           dst_thread_addr_offset,
-                                           dst_wave_addr_offset + sizeof(int32_t) * 8,
-                                           static_cast<index_t>(coherence));
-
-        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[number<3>{}],
-                                           dst_wave_buffer_resource,
-                                           dst_thread_addr_offset,
-                                           dst_wave_addr_offset + sizeof(int32_t) * 12,
-                                           static_cast<index_t>(coherence));
+        llvm_amdgcn_raw_buffer_store_i32x4(
+            src_thread_data.template get_as<int32x4_t>()[number<3>{}],
+            dst_wave_buffer_resource,
+            dst_thread_addr_offset,
+            dst_wave_addr_offset + sizeof(int32_t) * 12,
+            static_cast<index_t>(coherence));
     }
 }
 
 template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
-CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
+CK_TILE_DEVICE void amd_buffer_store_impl(const array<T, N> src_thread_data,
                                           int32x4_t dst_wave_buffer_resource,
                                           index_t dst_thread_addr_offset,
                                           index_t dst_wave_addr_offset)
 {
     static_assert(
-        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
-            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+        (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
+            (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, int32_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
         "wrong! not implemented");
 
-    if constexpr(is_same<T, float>::value) // fp32
+    if constexpr(std::is_same<T, float>::value) // fp32
     {
         if constexpr(N == 1)
         {
-            llvm_amdgcn_raw_buffer_store_fp32(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_fp32(bit_cast<float>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
@@ -1335,7 +1336,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 2)
         {
-            llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_fp32x2(bit_cast<fp32x2_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
@@ -1343,7 +1344,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 4)
         {
-            llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
@@ -1351,24 +1352,25 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 8)
         {
-            vector_type<float, 8> tmp{src_thread_data};
-            llvm_amdgcn_raw_buffer_store_fp32x4(tmp.AsType<float4_t>()[number<0>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                static_cast<index_t>(coherence));
-            llvm_amdgcn_raw_buffer_store_fp32x4(tmp.AsType<float4_t>()[number<1>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset + 4 * sizeof(float),
-                                                static_cast<index_t>(coherence));
+            llvm_amdgcn_raw_buffer_store_fp32x4(
+                src_thread_data.template get_as<fp32x4_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                static_cast<index_t>(coherence));
+            llvm_amdgcn_raw_buffer_store_fp32x4(
+                src_thread_data.template get_as<fp32x4_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 4 * sizeof(float),
+                static_cast<index_t>(coherence));
         }
     }
-    else if constexpr(is_same<T, half_t>::value) // fp16
+    else if constexpr(std::is_same<T, fp16_t>::value) // fp16
     {
         if constexpr(N == 1)
         {
-            llvm_amdgcn_raw_buffer_store_fp16(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_fp16(bit_cast<fp16_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
@@ -1376,7 +1378,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 2)
         {
-            llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_fp16x2(bit_cast<fp16x2_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
@@ -1384,7 +1386,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 4)
         {
-            llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_fp16x4(bit_cast<fp16x4_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
@@ -1393,21 +1395,21 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         else if constexpr(N == 8)
         {
 #if 0
-            vector_type<half_t, 8> tmp{src_thread_data};
+            array<fp16_t, 8> tmp{src_thread_data};
 
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[number<0>{}],
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}],
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
                                                 static_cast<index_t>(coherence));
 
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[number<1>{}],
+            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}],
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
-                                                dst_wave_addr_offset + 4 * sizeof(half_t),
+                                                dst_wave_addr_offset + 4 * sizeof(fp16_t),
                                                 static_cast<index_t>(coherence));
 #else
-            llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<float4_t>(src_thread_data),
+            llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
@@ -1415,11 +1417,11 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
 #endif
         }
     }
-    else if constexpr(is_same<T, bhalf_t>::value) // bf16
+    else if constexpr(std::is_same<T, bf16_t>::value) // bf16
     {
         if constexpr(N == 1)
         {
-            llvm_amdgcn_raw_buffer_store_i16(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_i16(bit_cast<bf16_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
@@ -1427,7 +1429,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 2)
         {
-            llvm_amdgcn_raw_buffer_store_i16x2(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_i16x2(bit_cast<bf16x2_t>(src_thread_data),
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
@@ -1435,7 +1437,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 4)
         {
-            llvm_amdgcn_raw_buffer_store_i16x4(src_thread_data,
+            llvm_amdgcn_raw_buffer_store_i16x4(bit_cast<bf16x4_t>(src_thread_data),
                                                dst_wave_buffer_resource,
                                                dst_thread_addr_offset,
                                                dst_wave_addr_offset,
@@ -1443,29 +1445,29 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const typename vector_type<T, N>::type
         }
         else if constexpr(N == 8)
         {
-            vector_type<bhalf_t, 8> tmp{src_thread_data};
+            llvm_amdgcn_raw_buffer_store_i16x4(
+                src_thread_data.template get_as<bf16x4_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                static_cast<index_t>(coherence));
 
-            llvm_amdgcn_raw_buffer_store_i16x4(tmp.AsType<bhalf4_t>()[number<0>{}],
-                                               dst_wave_buffer_resource,
-                                               dst_thread_addr_offset,
-                                               dst_wave_addr_offset,
-                                               static_cast<index_t>(coherence));
-
-            llvm_amdgcn_raw_buffer_store_i16x4(tmp.AsType<bhalf4_t>()[number<1>{}],
-                                               dst_wave_buffer_resource,
-                                               dst_thread_addr_offset,
-                                               dst_wave_addr_offset + 4 * sizeof(bhalf_t),
-                                               static_cast<index_t>(coherence));
+            llvm_amdgcn_raw_buffer_store_i16x4(
+                src_thread_data.template get_as<bf16x4_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 4 * sizeof(bf16_t),
+                static_cast<index_t>(coherence));
         }
     }
     else
     {
-        using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;
+        using r_t = array<int8_t, sizeof(T) * N>;
 
-        amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
-                                                            dst_wave_buffer_resource,
-                                                            dst_thread_addr_offset,
-                                                            dst_wave_addr_offset);
+        amd_buffer_store_impl_with_bytes<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
+                                                                   dst_wave_buffer_resource,
+                                                                   dst_thread_addr_offset,
+                                                                   dst_wave_addr_offset);
     }
 }
 
@@ -1473,18 +1475,17 @@ template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
           bool oob_conditional_check          = true>
-CK_TILE_DEVICE void
-amd_buffer_store_raw_impl(const typename vector_type<T, N>::type dst_thread_data,
-                          int32x4_t dst_wave_buffer_resource,
-                          index_t dst_thread_addr_offset,
-                          index_t dst_wave_addr_offset,
-                          index_t is_valid_element = 1)
+CK_TILE_DEVICE void amd_buffer_store_raw_impl(const array<T, N>& dst_thread_data,
+                                              int32x4_t dst_wave_buffer_resource,
+                                              index_t dst_thread_addr_offset,
+                                              index_t dst_wave_addr_offset,
+                                              index_t is_valid_element = 1)
 {
     constexpr index_t bytes = sizeof(T) * N;
     static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
                   "wrong! not supported by buffer_store instruction");
 
-    using type = typename vector_type<T, N>::type;
+    using type = array<T, N>;
     if constexpr(oob_conditional_check)
     {
         buffer_store_if<sizeof(type)>{}(dst_thread_data,
@@ -1505,22 +1506,21 @@ amd_buffer_store_raw_impl(const typename vector_type<T, N>::type dst_thread_data
 }
 
 template <typename T, index_t N>
-CK_TILE_DEVICE void
-amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
-                           int32x4_t dst_wave_buffer_resource,
-                           index_t dst_thread_addr_offset,
-                           index_t dst_wave_addr_offset)
+CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const array<T, N>& src_thread_data,
+                                               int32x4_t dst_wave_buffer_resource,
+                                               index_t dst_thread_addr_offset,
+                                               index_t dst_wave_addr_offset)
 {
-    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
-                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
+    static_assert((std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8)) ||
+                      (std::is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
                   "wrong! not implemented");
 
-    if constexpr(is_same<T, float>::value)
+    if constexpr(std::is_same<T, float>::value)
     {
         if constexpr(N == 1)
         {
-            llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data,
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(bit_cast<float>(src_thread_data),
                                                    dst_wave_buffer_resource,
                                                    dst_thread_addr_offset,
                                                    dst_wave_addr_offset,
@@ -1528,54 +1528,56 @@ amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_dat
         }
         else if constexpr(N == 2)
         {
-            vector_type<float, 2> tmp{src_thread_data};
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[number<0>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset,
-                                                   0);
-
-            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[number<1>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + sizeof(float),
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(float),
+                0);
         }
         else if constexpr(N == 4)
         {
-            vector_type<float, 4> tmp{src_thread_data};
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[number<0>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset,
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(float),
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[number<1>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + sizeof(float),
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<2>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 2 * sizeof(float),
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[number<2>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + 2 * sizeof(float),
-                                                   0);
-
-            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[number<3>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + 3 * sizeof(float),
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_add_fp32(
+                src_thread_data.template get_as<float>()[number<3>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 3 * sizeof(float),
+                0);
         }
     }
-    else if constexpr(is_same<T, half_t>::value)
+    else if constexpr(std::is_same<T, fp16_t>::value)
     {
         if constexpr(N == 2)
         {
-            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(src_thread_data,
+            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(bit_cast<fp16_t>(src_thread_data),
                                                      dst_wave_buffer_resource,
                                                      dst_thread_addr_offset,
                                                      dst_wave_addr_offset,
@@ -1583,34 +1585,32 @@ amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_dat
         }
         else if constexpr(N == 4)
         {
-            vector_type<half_t, 4> tmp{src_thread_data};
-
             static_for<0, 2, 1>{}([&](auto i) {
-                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
-                                                         dst_wave_buffer_resource,
-                                                         dst_thread_addr_offset,
-                                                         dst_wave_addr_offset + i * sizeof(half2_t),
-                                                         0);
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
+                    src_thread_data.template get_as<fp16x2_t>()[i],
+                    dst_wave_buffer_resource,
+                    dst_thread_addr_offset,
+                    dst_wave_addr_offset + i * sizeof(fp16x2_t),
+                    0);
             });
         }
         else if constexpr(N == 8)
         {
-            vector_type<half_t, 8> tmp{src_thread_data};
-
             static_for<0, 4, 1>{}([&](auto i) {
-                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
-                                                         dst_wave_buffer_resource,
-                                                         dst_thread_addr_offset,
-                                                         dst_wave_addr_offset + i * sizeof(half2_t),
-                                                         0);
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
+                    src_thread_data.template get_as<fp16x2_t>()[i],
+                    dst_wave_buffer_resource,
+                    dst_thread_addr_offset,
+                    dst_wave_addr_offset + i * sizeof(fp16x2_t),
+                    0);
             });
         }
     }
-    else if constexpr(is_same<T, int32_t>::value)
+    else if constexpr(std::is_same<T, int32_t>::value)
     {
         if constexpr(N == 1)
         {
-            llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data,
+            llvm_amdgcn_raw_buffer_atomic_add_i32(bit_cast<int32_t>(src_thread_data),
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
@@ -1618,65 +1618,66 @@ amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_dat
         }
         else if constexpr(N == 2)
         {
-            vector_type<int32_t, 2> tmp{src_thread_data};
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[number<0>{}],
-                                                  dst_wave_buffer_resource,
-                                                  dst_thread_addr_offset,
-                                                  dst_wave_addr_offset,
-                                                  0);
-
-            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[number<1>{}],
-                                                  dst_wave_buffer_resource,
-                                                  dst_thread_addr_offset,
-                                                  dst_wave_addr_offset + sizeof(int32_t),
-                                                  0);
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(int32_t),
+                0);
         }
         else if constexpr(N == 4)
         {
-            vector_type<int32_t, 4> tmp{src_thread_data};
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[number<0>{}],
-                                                  dst_wave_buffer_resource,
-                                                  dst_thread_addr_offset,
-                                                  dst_wave_addr_offset,
-                                                  0);
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(int32_t),
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[number<1>{}],
-                                                  dst_wave_buffer_resource,
-                                                  dst_thread_addr_offset,
-                                                  dst_wave_addr_offset + sizeof(int32_t),
-                                                  0);
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<2>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 2 * sizeof(int32_t),
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[number<2>{}],
-                                                  dst_wave_buffer_resource,
-                                                  dst_thread_addr_offset,
-                                                  dst_wave_addr_offset + 2 * sizeof(int32_t),
-                                                  0);
-
-            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[number<3>{}],
-                                                  dst_wave_buffer_resource,
-                                                  dst_thread_addr_offset,
-                                                  dst_wave_addr_offset + 3 * sizeof(int32_t),
-                                                  0);
+            llvm_amdgcn_raw_buffer_atomic_add_i32(
+                src_thread_data.template get_as<int32_t>()[number<3>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 3 * sizeof(int32_t),
+                0);
         }
     }
 }
 
 template <typename T, index_t N>
-CK_TILE_DEVICE void
-amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
-                           int32x4_t dst_wave_buffer_resource,
-                           index_t dst_thread_addr_offset,
-                           index_t dst_wave_addr_offset)
+CK_TILE_DEVICE void amd_buffer_atomic_max_impl(const array<T, N> src_thread_data,
+                                               int32x4_t dst_wave_buffer_resource,
+                                               index_t dst_thread_addr_offset,
+                                               index_t dst_wave_addr_offset)
 {
-    static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
+    static_assert((std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
                   "wrong! not implemented");
-    if constexpr(is_same<T, double>::value)
+    if constexpr(std::is_same<T, double>::value)
     {
         if constexpr(N == 1)
         {
-            llvm_amdgcn_raw_buffer_atomic_max_fp64(src_thread_data,
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(bit_cast<double>(src_thread_data),
                                                    dst_wave_buffer_resource,
                                                    dst_thread_addr_offset,
                                                    dst_wave_addr_offset,
@@ -1684,47 +1685,49 @@ amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_dat
         }
         else if constexpr(N == 2)
         {
-            vector_type<double, 2> tmp{src_thread_data};
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[number<0>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset,
-                                                   0);
-
-            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[number<1>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + sizeof(double),
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(double),
+                0);
         }
         else if constexpr(N == 4)
         {
-            vector_type<double, 4> tmp{src_thread_data};
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<0>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset,
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[number<0>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset,
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<1>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + sizeof(double),
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[number<1>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + sizeof(double),
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<2>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 2 * sizeof(double),
+                0);
 
-            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[number<2>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + 2 * sizeof(double),
-                                                   0);
-
-            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[number<3>{}],
-                                                   dst_wave_buffer_resource,
-                                                   dst_thread_addr_offset,
-                                                   dst_wave_addr_offset + 3 * sizeof(double),
-                                                   0);
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(
+                src_thread_data.template get_as<double>()[number<3>{}],
+                dst_wave_buffer_resource,
+                dst_thread_addr_offset,
+                dst_wave_addr_offset + 3 * sizeof(double),
+                0);
         }
     }
 }
@@ -1738,22 +1741,17 @@ template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
           bool oob_conditional_check          = true>
-CK_TILE_DEVICE typename vector_type_maker<T, N>::type::type
+CK_TILE_DEVICE array<T, N>
 amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
                                             index_t src_thread_element_offset,
                                             bool src_thread_element_valid,
                                             index_t src_element_space_size)
 {
     const int32x4_t src_wave_buffer_resource =
-        make_wave_buffer_resource(p_src_wave, src_element_space_size);
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
 
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
 
-    using vector_t = typename vector_type_maker<T, N>::type::type;
-    using scalar_t = typename scalar_type<vector_t>::type;
-
-    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
-
 #if CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
     uint32_t src_addr_shift = [&]() {
         if constexpr(oob_conditional_check)
@@ -1761,13 +1759,13 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
         else
             return 0;
     }();
-    return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+    return amd_buffer_load_impl<T, N, coherence>(
         src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
 #else
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    array<T, N> tmp =
+        amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
     if constexpr(oob_conditional_check)
-        return src_thread_element_valid ? tmp : vector_t(0);
+        return src_thread_element_valid ? tmp : array<T, N>{0};
     else
         return tmp;
 #endif
@@ -1781,7 +1779,7 @@ template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
           bool oob_conditional_check          = true>
-CK_TILE_DEVICE typename vector_type_maker<T, N>::type::type
+CK_TILE_DEVICE array<T, N>
 amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                         index_t src_thread_element_offset,
                                                         bool src_thread_element_valid,
@@ -1789,20 +1787,15 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                         T customized_value)
 {
     const int32x4_t src_wave_buffer_resource =
-        make_wave_buffer_resource(p_src_wave, src_element_space_size);
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
 
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
 
-    using vector_t = typename vector_type_maker<T, N>::type::type;
-    using scalar_t = typename scalar_type<vector_t>::type;
-
-    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
-
-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    array<T, N> tmp =
+        amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
 
     if constexpr(oob_conditional_check)
-        return src_thread_element_valid ? tmp : vector_t(customized_value);
+        return src_thread_element_valid ? tmp : array<T, N>{customized_value};
     else
         return tmp;
 }
@@ -1811,23 +1804,18 @@ template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
           bool oob_conditional_check          = true>
-CK_TILE_DEVICE void amd_buffer_load_raw(typename vector_type_maker<T, N>::type::type& dst,
+CK_TILE_DEVICE void amd_buffer_load_raw(array<T, N>& dst,
                                         const T* p_src_wave,
                                         index_t src_thread_element_offset,
                                         index_t src_element_space_size,
                                         index_t is_valid_element = 0)
 {
     const int32x4_t src_wave_buffer_resource =
-        make_wave_buffer_resource(p_src_wave, src_element_space_size);
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
 
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
 
-    using vector_t = typename vector_type_maker<T, N>::type::type;
-    using scalar_t = typename scalar_type<vector_t>::type;
-
-    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
-
-    amd_buffer_load_raw_impl<scalar_t, vector_size, coherence, oob_conditional_check>(
+    amd_buffer_load_raw_impl<T, N, coherence, oob_conditional_check>(
         dst, src_wave_buffer_resource, src_thread_addr_offset, 0, is_valid_element);
 }
 
@@ -1844,7 +1832,7 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob(T* smem,
                                                    index_t src_element_space_size)
 {
     const int32x4_t src_wave_buffer_resource =
-        make_wave_buffer_resource(p_src_wave, src_element_space_size);
+        make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
 
     index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
 
@@ -1860,22 +1848,17 @@ template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
           bool oob_conditional_check          = true>
-CK_TILE_DEVICE void
-amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
-                 T* p_dst_wave,
-                 const index_t dst_thread_element_offset,
-                 const bool dst_thread_element_valid,
-                 const index_t dst_element_space_size)
+CK_TILE_DEVICE void amd_buffer_store(const array<T, N>& src_thread_data,
+                                     T* p_dst_wave,
+                                     const index_t dst_thread_element_offset,
+                                     const bool dst_thread_element_valid,
+                                     const index_t dst_element_space_size)
 {
     const int32x4_t dst_wave_buffer_resource =
-        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
 
     index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
 
-    using vector_t                = typename vector_type_maker<T, N>::type::type;
-    using scalar_t                = typename scalar_type<vector_t>::type;
-    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
-
 #if CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
     uint32_t dst_addr_shift = [&]() {
         if constexpr(oob_conditional_check)
@@ -1883,20 +1866,20 @@ amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_d
         else
             return 0;
     }();
-    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+    amd_buffer_store_impl<T, N, coherence>(
         src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
 #else
     if constexpr(oob_conditional_check)
     {
         if(dst_thread_element_valid)
         {
-            amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+            amd_buffer_store_impl<T, N, coherence>(
                 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
         }
     }
     else
     {
-        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
+        amd_buffer_store_impl<T, N, coherence>(
             src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
     }
 #endif
@@ -1906,28 +1889,22 @@ template <typename T,
           index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
           bool oob_conditional_check          = true>
-CK_TILE_DEVICE void
-amd_buffer_store_raw(const typename vector_type_maker<T, N>::type::type src_thread_data,
-                     T* p_dst_wave,
-                     const index_t dst_thread_element_offset,
-                     const bool dst_thread_element_valid,
-                     const index_t dst_element_space_size)
+CK_TILE_DEVICE void amd_buffer_store_raw(const array<T, N>& src_thread_data,
+                                         T* p_dst_wave,
+                                         const index_t dst_thread_element_offset,
+                                         const bool dst_thread_element_valid,
+                                         const index_t dst_element_space_size)
 {
     const int32x4_t dst_wave_buffer_resource =
-        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
 
     index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
 
-    using vector_t                = typename vector_type_maker<T, N>::type::type;
-    using scalar_t                = typename scalar_type<vector_t>::type;
-    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
-
-    amd_buffer_store_raw_impl<scalar_t, vector_size, coherence, oob_conditional_check>(
-        src_thread_data,
-        dst_wave_buffer_resource,
-        dst_thread_addr_offset,
-        0,
-        dst_thread_element_valid);
+    amd_buffer_store_raw_impl<T, N, coherence, oob_conditional_check>(src_thread_data,
+                                                                      dst_wave_buffer_resource,
+                                                                      dst_thread_addr_offset,
+                                                                      0,
+                                                                      dst_thread_element_valid);
 }
 
 // buffer_atomic_add requires:
@@ -1935,31 +1912,26 @@ amd_buffer_store_raw(const typename vector_type_maker<T, N>::type::type src_thre
 //   2) p_dst_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
 template <typename T, index_t N>
-CK_TILE_DEVICE void
-amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thread_data,
-                      T* p_dst_wave,
-                      const index_t dst_thread_element_offset,
-                      const bool dst_thread_element_valid,
-                      const index_t dst_element_space_size)
+CK_TILE_DEVICE void amd_buffer_atomic_add(const array<T, N>& src_thread_data,
+                                          T* p_dst_wave,
+                                          const index_t dst_thread_element_offset,
+                                          const bool dst_thread_element_valid,
+                                          const index_t dst_element_space_size)
 {
     const int32x4_t dst_wave_buffer_resource =
-        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
 
     index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
 
-    using vector_t                = typename vector_type_maker<T, N>::type::type;
-    using scalar_t                = typename scalar_type<vector_t>::type;
-    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
-
 #if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
     uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
 
-    amd_buffer_atomic_add_impl<scalar_t, vector_size>(
+    amd_buffer_atomic_add_impl<T, N>(
         src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
 #else
     if(dst_thread_element_valid)
     {
-        amd_buffer_atomic_add_impl<scalar_t, vector_size>(
+        amd_buffer_atomic_add_impl<T, N>(
             src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
     }
 #endif
@@ -1970,31 +1942,26 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
 //   2) p_dst_wave must be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
 template <typename T, index_t N>
-CK_TILE_DEVICE void
-amd_buffer_atomic_max(const typename vector_type_maker<T, N>::type::type src_thread_data,
-                      T* p_dst_wave,
-                      const index_t dst_thread_element_offset,
-                      const bool dst_thread_element_valid,
-                      const index_t dst_element_space_size)
+CK_TILE_DEVICE void amd_buffer_atomic_max(const array<T, N>& src_thread_data,
+                                          T* p_dst_wave,
+                                          const index_t dst_thread_element_offset,
+                                          const bool dst_thread_element_valid,
+                                          const index_t dst_element_space_size)
 {
     const int32x4_t dst_wave_buffer_resource =
-        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
 
     index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
 
-    using vector_t                = typename vector_type_maker<T, N>::type::type;
-    using scalar_t                = typename scalar_type<vector_t>::type;
-    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
-
 #if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
     uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
 
-    amd_buffer_atomic_max_impl<scalar_t, vector_size>(
+    amd_buffer_atomic_max_impl<T, N>(
         src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
 #else
     if(dst_thread_element_valid)
     {
-        amd_buffer_atomic_max_impl<scalar_t, vector_size>(
+        amd_buffer_atomic_max_impl<T, N>(
             src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
     }
 #endif
@@ -2025,7 +1992,8 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
 
     const uint32_t* global_ptr =
         reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
-    const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size);
+    const int32x4_t src_resource =
+        make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T));
     const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
 
 #if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
new file mode 100644
index 0000000000..333168fd2a
--- /dev/null
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+// Address Space for AMDGCN
+// https://llvm.org/docs/AMDGPUUsage.html#address-space
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+
+namespace ck_tile {
+
+enum struct address_space_enum
+{
+    generic,
+    global,
+    lds,
+    sgpr,
+    vgpr,
+};
+
+enum struct memory_operation_enum
+{
+    set,
+    atomic_add,
+    atomic_max,
+    add
+};
+
+CK_TILE_HOST_DEVICE constexpr index_t get_warp_size()
+{
+    // warpSize is defined by HIP
+    return warpSize;
+}
+
+CK_TILE_DEVICE index_t get_grid_size() { return gridDim.x; }
+
+CK_TILE_DEVICE index_t get_block_size() { return blockDim.x; }
+
+// TODO: deprecate these
+CK_TILE_DEVICE index_t get_thread_local_1d_id() { return threadIdx.x; }
+
+CK_TILE_DEVICE index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
+
+CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }
+
+// Use these instead
+CK_TILE_DEVICE index_t get_lane_id() { return __lane_id(); }
+
+CK_TILE_DEVICE index_t get_warp_id()
+{
+    return __builtin_amdgcn_readfirstlane(threadIdx.x / get_warp_size());
+}
+
+CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; }
+
+CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; }
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/arch/utility.hpp b/include/ck_tile/core/arch/utility.hpp
new file mode 100644
index 0000000000..1ab2ba1002
--- /dev/null
+++ b/include/ck_tile/core/arch/utility.hpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+// Address Space for AMDGCN
+// https://llvm.org/docs/AMDGPUUsage.html#address-space
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+
+namespace ck_tile {
+
+// TODO: we have "memory" clobber here because this inline asm is used for async copy
+CK_TILE_DEVICE void m0_set_with_memory(index_t v)
+{
+    asm volatile("s_mov_b32 m0, %0" : : "s"(v) : "memory");
+}
+
+// NOTE: this is an immediate value
+CK_TILE_DEVICE void m0_inc_with_memory(index_t v)
+{
+    asm volatile("s_add_u32 m0, %0, m0" : : "n"(v) : "memory");
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp
index 55cca88c8c..b655ae0a6c 100644
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -120,3 +120,15 @@
 #ifndef CK_TILE_DEBUG_LOG
 #define CK_TILE_DEBUG_LOG 0
 #endif
+
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD -1
+#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) ||                          \
+    defined(__gfx942__) // for GPU code
+#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
+#elif defined(__gfx1030__) // for GPU code
+#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
+#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
+#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
+#endif
diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp
index fd910abac9..7752f31375 100644
--- a/include/ck_tile/core/container/array.hpp
+++ b/include/ck_tile/core/container/array.hpp
@@ -44,6 +44,19 @@ struct array
             data[i] = vlast;
         }
     }
+    CK_TILE_HOST_DEVICE explicit constexpr array(value_type c)
+    {
+        for(auto i = 0; i < size(); i++)
+            data[i] = c;
+    }
+    template <typename ArrayType>
+    CK_TILE_HOST_DEVICE constexpr array(const ArrayType& o)
+    {
+        static_assert(ArrayType::size() == size(), "wrong! size not the same");
+        for(auto i = 0; i < size(); i++)
+            data[i] = o.data[i];
+    }
+
     CK_TILE_HOST_DEVICE static constexpr auto size() { return N; }
     CK_TILE_HOST_DEVICE static constexpr bool is_static() { return is_static_v<value_type>; }
 
@@ -67,18 +80,18 @@ struct array
     CK_TILE_HOST_DEVICE constexpr const value_type& operator[](index_t i) const { return data[i]; }
     CK_TILE_HOST_DEVICE constexpr value_type& operator[](index_t i)             { return data[i]; }
     CK_TILE_HOST_DEVICE constexpr value_type& operator()(index_t i)             { return data[i]; }     // TODO: compatible
-
-    template <typename T>
-    CK_TILE_HOST_DEVICE constexpr auto operator=(const T& a)
+#if 0
+    template <typename ArrayType>
+    CK_TILE_HOST_DEVICE constexpr auto operator=(const ArrayType& a)
     {
-        static_assert(T::size() == size(), "wrong! size not the same");
+        static_assert(ArrayType::size() == size(), "wrong! size not the same");
         for(index_t i = 0; i < size(); ++i)
         {
             data[i] = a[i];
         }
         return *this;
     }
-
+#endif
     // type punning (strict aliasing) member functions for read/write
     // aliasing this array of type "T", "N" elements
     // as array of type "Tx", sizeof(T)*N/sizeof(Tx) elements
@@ -122,6 +135,17 @@ struct array<T, 0>
     CK_TILE_HOST_DEVICE void print() const { printf("array{size: 0, data: []}"); }
 };
 
+template <typename>
+struct vector_traits;
+
+// specialization for array
+template <typename T, index_t N>
+struct vector_traits<array<T, N>>
+{
+    using scalar_type                    = T;
+    static constexpr index_t vector_size = N;
+};
+
 template <typename T, typename... Ts>
 CK_TILE_HOST_DEVICE constexpr auto make_array(T&& x, Ts&&... xs)
 {
diff --git a/include/ck_tile/core/container/container_helper.hpp b/include/ck_tile/core/container/container_helper.hpp
index 88405f6fcb..eec15d2538 100644
--- a/include/ck_tile/core/container/container_helper.hpp
+++ b/include/ck_tile/core/container/container_helper.hpp
@@ -468,6 +468,7 @@ CK_TILE_HOST_DEVICE constexpr auto sequence_to_tuple_of_number(sequence<Is...>)
         number<Seq::size()>{});
 }
 
+#if 0
 #define TO_TUPLE_OF_SEQUENCE(a_of_b_impl, a_size, bs_sizes)             \
     [a_of_b_impl, a_size, bs_sizes] {                                   \
         return ck_tile::generate_tuple(                                 \
@@ -479,5 +480,21 @@ CK_TILE_HOST_DEVICE constexpr auto sequence_to_tuple_of_number(sequence<Is...>)
             },                                                          \
             ck_tile::number<a_size>{});                                 \
     }()
+#else
+// constexpr index_t can't be captured "-Wunused-lambda-capture"
+// TODO: this is ugly
+#define TO_TUPLE_OF_SEQUENCE(a_of_b_impl, a_size, bs_sizes)             \
+    [a_of_b_impl, bs_sizes] {                                   \
+        return ck_tile::generate_tuple(                                 \
+            [=](auto i) {                                               \
+                constexpr auto b_impl    = a_of_b_impl[i];              \
+                constexpr index_t b_size = bs_sizes[i];                 \
+                constexpr auto b         = TO_SEQUENCE(b_impl, b_size); \
+                return b;                                               \
+            },                                                          \
+            ck_tile::number<a_size>{});                                 \
+    }()
+#endif
+
 
 } // namespace ck_tile
diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp
index 15313b1b65..581e3b8d61 100644
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -67,13 +67,12 @@ struct sequence
     CK_TILE_HOST_DEVICE static constexpr auto get(number<I>)
     {
         static_assert(I < size(), "wrong! I too large");
-        return number<get(I)>{};
+        return number<get<I>()>{};
     }
 
     CK_TILE_HOST_DEVICE static constexpr index_t at(index_t I)
     {
         // the last dummy element is to prevent compiler complain about empty array, when mSize = 0
-        static_assert(I < size(), "wrong! I too large");
         const index_t mData[size() + 1] = {Is..., 0};
         return mData[I];
     }
@@ -89,7 +88,7 @@ struct sequence
     CK_TILE_HOST_DEVICE static constexpr auto at(number<I>)
     {
         static_assert(I < size(), "wrong! I too large");
-        return number<get(I)>{};
+        return number<get<I>()>{};
     }
 
     template <typename I>
diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp
index f95ddb5435..a47cf94811 100644
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -16,40 +16,48 @@ namespace ck_tile {
 
 namespace impl {
 
+// the place where content is stored
 template <index_t idx, typename T, bool is_empty = std::is_empty_v<T>>
-struct tuple_element
+struct tuple_object
 {
 };
 
 template <index_t idx, typename T>
-struct tuple_element<idx, T, true>
+struct tuple_object<idx, T, true>
 {
-    CK_TILE_HOST_DEVICE constexpr tuple_element() {}
-    CK_TILE_HOST_DEVICE constexpr tuple_element(const T&) {}
+    CK_TILE_HOST_DEVICE constexpr tuple_object() {}
+    CK_TILE_HOST_DEVICE constexpr tuple_object(const T&) {}
 };
 
 template <index_t idx, typename T>
-struct tuple_element<idx, T, false>
+struct tuple_object<idx, T, false>
 {
-    CK_TILE_HOST_DEVICE constexpr tuple_element() {}
-    CK_TILE_HOST_DEVICE constexpr tuple_element(const T& e) : element(e) {}
+    CK_TILE_HOST_DEVICE constexpr tuple_object() : element{} {}
+    CK_TILE_HOST_DEVICE constexpr tuple_object(const T& e) : element(e) {}
     T element;
 };
 
+// NOTE: we return a instance(not a reference) if content is empty
 template <std::size_t I, class T>
-CK_TILE_HOST_DEVICE constexpr T const& getv(tuple_element<I, T, false> const& x)
+CK_TILE_HOST_DEVICE constexpr T getv(const tuple_object<I, T, true>&)
+{
+    return {};
+}
+
+template <std::size_t I, class T>
+CK_TILE_HOST_DEVICE constexpr const T& getv(const tuple_object<I, T, false>& x)
 {
     return x.element;
 }
 
 template <std::size_t I, class T>
-CK_TILE_HOST_DEVICE constexpr T& getv(tuple_element<I, T, false>& x)
+CK_TILE_HOST_DEVICE constexpr T& getv(tuple_object<I, T, false>& x)
 {
     return x.element;
 }
 
 template <std::size_t I, class T>
-CK_TILE_HOST_DEVICE constexpr T&& getv(tuple_element<I, T, false>&& x)
+CK_TILE_HOST_DEVICE constexpr T&& getv(tuple_object<I, T, false>&& x)
 {
     return static_cast<T&&>(x.element);
 }
@@ -58,18 +66,18 @@ template <typename index_seq, typename... T>
 struct tuple_base;
 
 template <index_t... I, typename... T>
-struct tuple_base<sequence<I...>, T...> : public tuple_element<I, T>...
+struct tuple_base<sequence<I...>, T...> : tuple_object<I, T>...
 {
     CK_TILE_HOST_DEVICE constexpr tuple_base() {}
 
     template <class... U>
-    CK_TILE_HOST_DEVICE constexpr explicit tuple_base(U const&... u) : tuple_element<I, T>(u)...
+    CK_TILE_HOST_DEVICE constexpr explicit tuple_base(const U&... u) : tuple_object<I, T>(u)...
     {
     }
 
     template <class... U>
-    CK_TILE_HOST_DEVICE constexpr tuple_base(tuple_base<sequence<I...>, U...> const& u)
-        : tuple_element<I, T>(getv(static_cast<tuple_element<I, U> const&>(u)))...
+    CK_TILE_HOST_DEVICE constexpr tuple_base(const tuple_base<sequence<I...>, U...>& u)
+        : tuple_object<I, T>(getv(static_cast<const tuple_object<I, U>&>(u)))...
     {
     }
 };
@@ -84,15 +92,13 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
     CK_TILE_HOST_DEVICE constexpr tuple() {}
 
     template <class... U>
-    CK_TILE_HOST_DEVICE constexpr tuple(U const&... u)
-        : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>(u...)
+    CK_TILE_HOST_DEVICE constexpr tuple(const U&... u) : base(u...)
     {
     }
 
     template <class... U>
-    CK_TILE_HOST_DEVICE constexpr tuple(tuple<U...> const& u)
-        : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>(
-              static_cast<impl::tuple_base<U...> const&>(u))
+    CK_TILE_HOST_DEVICE constexpr tuple(const tuple<U...>& u)
+        : base(static_cast<const impl::tuple_base<make_index_sequence<sizeof...(U)>, U...>&>(u))
     {
     }
 
@@ -109,19 +115,19 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
 
 #define TP_COM_() static_assert(I < size(), "wrong! out of range")
     // clang-format off
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr const auto & get() const          { TP_COM_(); return impl::getv<I>(*this); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr const auto & get(number<I>) const { TP_COM_(); return get<I>(); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr auto & get()                      { TP_COM_(); return impl::getv<I>(*this); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr auto & get(number<I>)             { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() const          { TP_COM_(); return impl::getv<I>(*this); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) const { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get()                      { TP_COM_(); return impl::getv<I>(*this); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>)             { TP_COM_(); return get<I>(); }
 
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr const auto & at() const          { TP_COM_(); return impl::getv<I>(*this); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr const auto & at(number<I>) const { TP_COM_(); return get<I>(); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr auto & at()                      { TP_COM_(); return impl::getv<I>(*this); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr auto & at(number<I>)             { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at() const          { TP_COM_(); return impl::getv<I>(*this); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at(number<I>) const { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at()                      { TP_COM_(); return impl::getv<I>(*this); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at(number<I>)             { TP_COM_(); return get<I>(); }
 
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr auto & operator[](number<I>)             { TP_COM_(); return get<I>(); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr const auto & operator[](number<I>) const { TP_COM_(); return get<I>(); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr auto & operator()(number<I>)             { TP_COM_(); return get<I>(); }  // TODO: compatible
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) operator[](number<I>)             { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) operator[](number<I>) const { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) operator()(number<I>)             { TP_COM_(); return get<I>(); }  // TODO: compatible
     // clang-format on
 #undef TP_COM_
 };
@@ -250,15 +256,15 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tuples(F f, const X& x, const Y& y,
 
 // By default unroll to the flatten
 template <index_t Depth = 0, index_t MaxDepth = -1>
-CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple(const tuple<>& element)
+CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple(const tuple<>& t)
 {
-    return element;
+    return t;
 }
 
 template <index_t Depth = 0, index_t MaxDepth = -1, typename T>
-CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple(const T& element)
+CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple(const T& t)
 {
-    return make_tuple(element);
+    return make_tuple(t);
 }
 
 template <index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
@@ -334,7 +340,7 @@ CK_TILE_HOST_DEVICE constexpr auto to_array_of_array(tuple<Seqs...> t_of_s)
         index_t max_n1_ = 0;
 
         static_for<0, n0, 1>{}([&](auto i0) {
-            constexpr index_t n1 = t_of_s[i0].size()();
+            constexpr index_t n1 = t_of_s[i0].size();
 
             max_n1_ = max_n1_ < n1 ? n1 : max_n1_;
         });
@@ -345,7 +351,7 @@ CK_TILE_HOST_DEVICE constexpr auto to_array_of_array(tuple<Seqs...> t_of_s)
     array<array<index_t, max_n1>, n0> a_of_a{{-1}};
 
     static_for<0, n0, 1>{}([&](auto i0) {
-        constexpr index_t n1 = t_of_s[i0].size()();
+        constexpr index_t n1 = t_of_s[i0].size();
 
         static_for<0, n1, 1>{}([&](auto i1) { a_of_a(i0)(i1) = t_of_s[i0][i1]; });
     });
@@ -482,3 +488,60 @@ struct tuple_element<I, const ck_tile::tuple<Ts...>>
 };
 
 } // namespace std
+
+#if 1
+#define TO_TUPLE_OF_NUMBER(a, n)                                                               \
+    _Pragma("clang diagnostic push")                                                           \
+    _Pragma("clang diagnostic ignored \"-Wc++20-extensions\"")                                 \
+        [a]<ck_tile::index_t... IDX_IDX_>(ck_tile::sequence<IDX_IDX_...>)                     \
+    {                                                                                          \
+        return ck_tile::tuple<ck_tile::number<a[ck_tile::number<IDX_IDX_>{}]>...>{};          \
+    }                                                                                          \
+    (ck_tile::make_index_sequence<n>{})                                                        \
+    _Pragma("clang diagnostic pop")
+#else
+#define TO_TUPLE_OF_NUMBER(arr, n_)                                                              \
+    [&arr, n_] {                                                                                \
+        static_assert(arr.size() >= n_, "wrong! out of bound");                                  \
+                                                                                                \
+        static_assert(n_ < 7, "not implemented");                                                \
+                                                                                                \
+        if constexpr(n_ == 0)                                                                    \
+        {                                                                                       \
+            return ck_tile::tuple<>{};                                                               \
+        }                                                                                       \
+        else if constexpr(n_ == 1)                                                               \
+        {                                                                                       \
+        return ck_tile::tuple<number<arr[0]>>{};                                                 \
+        }                                                                                       \
+        else if constexpr(n_ == 2)                                                               \
+        {                                                                                       \
+            return ck_tile::tuple<number<arr[0]>, number<arr[1]>>{};                                 \
+        }                                                                                       \
+        else if constexpr(n_ == 3)                                                               \
+        {                                                                                       \
+            return ck_tile::tuple<number<arr[0]>, number<arr[1]>, number<arr[2]>>{};                 \
+        }                                                                                       \
+        else if constexpr(n_ == 4)                                                               \
+        {                                                                                       \
+            return ck_tile::tuple<number<arr[0]>, number<arr[1]>, number<arr[2]>, number<arr[3]>>{}; \
+        }                                                                                       \
+        else if constexpr(n_ == 5)                                                               \
+        {                                                                                       \
+            return ck_tile::tuple<number<arr[0]>,                                                    \
+                             number<arr[1]>,                                                    \
+                             number<arr[2]>,                                                    \
+                             number<arr[3]>,                                                    \
+                             number<arr[4]>>{};                                                 \
+        }                                                                                       \
+        else if constexpr(n_ == 6)                                                               \
+        {                                                                                       \
+            return ck_tile::tuple<number<arr[0]>,                                                    \
+                             number<arr[1]>,                                                    \
+                             number<arr[2]>,                                                    \
+                             number<arr[3]>,                                                    \
+                             number<arr[4]>,                                                    \
+                             number<arr[5]>>{};                                                 \
+        }                                                                                       \
+    }()
+#endif
diff --git a/include/ck_tile/core/numeric/bfloat16.hpp b/include/ck_tile/core/numeric/bfloat16.hpp
index b19aefc928..6fd433f005 100644
--- a/include/ck_tile/core/numeric/bfloat16.hpp
+++ b/include/ck_tile/core/numeric/bfloat16.hpp
@@ -20,7 +20,8 @@ enum class bf16_rounding_mode
     truncate,
 };
 
-template <bf16_rounding_mode rounding = CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT>
+template <bf16_rounding_mode rounding =
+              static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
 CK_TILE_HOST_DEVICE uint16_t float_to_bf16_raw(float f, constant<rounding> = {});
 
 CK_TILE_HOST_DEVICE
@@ -41,36 +42,42 @@ struct alignas(2) bfloat16_t
     }
 
     // constructor
-    bfloat16_t() = default;
+    constexpr bfloat16_t() : data() {}
 
     // construct from float
     CK_TILE_HOST_DEVICE
-    explicit bfloat16_t(const float& x) { data = float_to_bf16_raw(x); }
+    explicit constexpr bfloat16_t(const float& x) : data(float_to_bf16_raw(x)) {}
 
     // construct from int
     CK_TILE_HOST_DEVICE
-    explicit bfloat16_t(const int& x) { data = float_to_bf16_raw(static_cast<float>(x)); }
+    explicit constexpr bfloat16_t(const int& x) : data(float_to_bf16_raw(static_cast<float>(x))) {}
 
     // construct from unsigned int
     CK_TILE_HOST_DEVICE
-    explicit bfloat16_t(const unsigned int& x) { data = float_to_bf16_raw(static_cast<float>(x)); }
+    explicit constexpr bfloat16_t(const unsigned int& x)
+        : data(float_to_bf16_raw(static_cast<float>(x)))
+    {
+    }
 
     // cast to float
     CK_TILE_HOST_DEVICE
-    explicit operator float() const { return bf16_to_float_raw(data); }
+    explicit constexpr operator float() const { return bf16_to_float_raw(data); }
 
     // cast to int
     CK_TILE_HOST_DEVICE
-    explicit operator int() const { return static_cast<int>(bf16_to_float_raw(data)); }
+    explicit constexpr operator int() const { return static_cast<int>(bf16_to_float_raw(data)); }
 
     // internal access
     CK_TILE_HOST_DEVICE
-    raw_type& get() { return data; }
+    constexpr raw_type& get() { return data; }
 
     CK_TILE_HOST_DEVICE
-    raw_type get() const { return data; }
+    constexpr raw_type get() const { return data; }
 };
 
+using bf16_t     = bfloat16_t;
+using bf16_raw_t = typename bf16_t::raw_type;
+
 // round to nearest
 CK_TILE_HOST_DEVICE
 uint16_t float_to_bf16_rtn_raw(float f)
@@ -139,8 +146,8 @@ uint16_t float_to_bf16_truc_raw(float f)
     return uint16_t(u.int32 >> 16);
 }
 
-template <bf16_rounding_mode rounding = CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT>
-CK_TILE_HOST_DEVICE uint16_t float_to_bf16_raw(float f, constant<rounding> = {})
+template <bf16_rounding_mode rounding>
+CK_TILE_HOST_DEVICE uint16_t float_to_bf16_raw(float f, constant<rounding>)
 {
     if constexpr(rounding == bf16_rounding_mode::standard)
         return float_to_bf16_rtn_raw(f);
@@ -161,8 +168,9 @@ float bf16_to_float_raw(uint16_t x)
     return u.fp32;
 }
 
-template <bf16_rounding_mode rounding = CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT>
-CK_TILE_HOST_DEVICE bfloat16_t float_to_bf16(float f, constant<rounding> = {})
+template <bf16_rounding_mode rounding =
+              static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
+CK_TILE_HOST_DEVICE bfloat16_t float_to_bf16(float f, constant<rounding>)
 {
     return bfloat16_t::bit_cast(float_to_bf16_raw(f, constant<rounding>{}));
 }
@@ -170,14 +178,15 @@ CK_TILE_HOST_DEVICE bfloat16_t float_to_bf16(float f, constant<rounding> = {})
 CK_TILE_HOST_DEVICE
 float bf16_to_float(bfloat16_t x) { return static_cast<float>(x); }
 
-template <bf16_rounding_mode rounding = CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT>
+template <bf16_rounding_mode rounding =
+              static_cast<bf16_rounding_mode>(CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT)>
 CK_TILE_HOST_DEVICE bfloat16_t fp16_to_bf16(half_t f, constant<rounding> = {})
 {
     return bfloat16_t::bit_cast(float_to_bf16_raw(static_cast<float>(f), constant<rounding>{}));
 }
 
 CK_TILE_HOST_DEVICE
-float bf16_to_fp16(bfloat16_t x) { return float_to_fp16(static_cast<float>(x)); }
+half_t bf16_to_fp16(bfloat16_t x) { return float_to_fp16(static_cast<float>(x)); }
 
 template <class T>
 struct numeric_limits;
@@ -259,6 +268,4 @@ bfloat16_t exp2(bfloat16_t x) { return static_cast<bfloat16_t>(exp2f(static_cast
 CK_TILE_DEVICE
 bfloat16_t log(bfloat16_t x) { return static_cast<bfloat16_t>(__logf(static_cast<float>(x))); };
 
-using bf16_t = bfloat16_t;
-
 } // namespace ck_tile
diff --git a/include/ck_tile/core/numeric/float8.hpp b/include/ck_tile/core/numeric/float8.hpp
index 8ff6f06a19..11e971661b 100644
--- a/include/ck_tile/core/numeric/float8.hpp
+++ b/include/ck_tile/core/numeric/float8.hpp
@@ -7,6 +7,7 @@
 #include "ck_tile/core/utility/random.hpp"
 #include "ck_tile/core/numeric/arithmetic.hpp"
 #include "ck_tile/core/numeric/half.hpp"
+#include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/utility/limits.hpp"
 #include <stdint.h>
@@ -75,20 +76,19 @@ struct alignas(1) float8_e4m3_t
 
     // construct from float
     CK_TILE_HOST_DEVICE
-    explicit constexpr float8_e4m3_t(const float& x) { data = float_to_fp8_raw(x); }
+    explicit constexpr float8_e4m3_t(const float& x) : data(float_to_fp8_raw(x)) {}
 
     // construct from int
     CK_TILE_HOST_DEVICE
-    explicit constexpr float8_e4m3_t(const int& x)
+    explicit constexpr float8_e4m3_t(const int& x) : data(float_to_fp8_raw(static_cast<float>(x)))
     {
-        data = float_to_fp8_raw(static_cast<float>(x));
     }
 
     // construct from unsigned int
     CK_TILE_HOST_DEVICE
     explicit constexpr float8_e4m3_t(const unsigned int& x)
+        : data(float_to_fp8_raw(static_cast<float>(x)))
     {
-        data = float_to_fp8_raw(static_cast<float>(x));
     }
 
     // cast to float
@@ -106,6 +106,8 @@ struct alignas(1) float8_e4m3_t
     CK_TILE_HOST_DEVICE
     constexpr raw_type get() const { return data; }
 };
+using fp8_t     = float8_e4m3_t;
+using fp8_raw_t = typename fp8_t::raw_type;
 
 struct alignas(1) float8_e5m2_t
 {
@@ -132,25 +134,24 @@ struct alignas(1) float8_e5m2_t
 
     // construct from float
     CK_TILE_HOST_DEVICE
-    explicit constexpr float8_e5m2_t(const float& x) { data = float_to_bf8_raw(x); }
+    explicit constexpr float8_e5m2_t(const float& x) : data(float_to_bf8_raw(x)) {}
 
     // construct from int
     CK_TILE_HOST_DEVICE
-    explicit constexpr float8_e5m2_t(const int& x)
+    explicit constexpr float8_e5m2_t(const int& x) : data(float_to_bf8_raw(static_cast<float>(x)))
     {
-        data = float_to_bf8_raw(static_cast<float>(x));
     }
 
     // construct from unsigned int
     CK_TILE_HOST_DEVICE
     explicit constexpr float8_e5m2_t(const unsigned int& x)
+        : data(float_to_bf8_raw(static_cast<float>(x)))
     {
-        data = float_to_bf8_raw(static_cast<float>(x));
     }
 
     // cast to float
     CK_TILE_HOST_DEVICE
-    explicit constexpr constexpr operator float() const { return bf8_to_float_raw(data); }
+    explicit constexpr operator float() const { return bf8_to_float_raw(data); }
 
     // cast to int
     CK_TILE_HOST_DEVICE
@@ -163,6 +164,8 @@ struct alignas(1) float8_e5m2_t
     CK_TILE_HOST_DEVICE
     constexpr raw_type get() const { return data; }
 };
+using bf8_t     = float8_e5m2_t;
+using bf8_raw_t = typename bf8_t::raw_type;
 
 // below is sw fp8 conversion, not utilizing hw instruction
 namespace impl {
@@ -431,10 +434,10 @@ CK_TILE_HOST_DEVICE Y cast_from_f8(X x)
 }
 } // namespace impl
 
-CK_TILE_HOST_DEVICE uint8_t float_to_fp8_sr_raw(float x)
+CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_sr_raw(float x)
 {
     constexpr int seed = 42;
-    uint32_t rng       = prand_generator<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng       = prand_generator_t<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     float max_fp8 = 240.0f;
     x             = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
@@ -453,16 +456,18 @@ CK_TILE_HOST_DEVICE uint8_t float_to_fp8_sr_raw(float x)
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr fp8_rounding_mode rm   = fp8_rounding_mode::stochastic;
-    return impl::
-        cast_to_f8<float, fp8_t, negative_zero_nan, clip, (rm == fp8_rounding_mode::stochastic)>(
-            x, rng);
+    return bit_cast<fp8_raw_t>(impl::cast_to_f8<float,
+                                                fp8_t,
+                                                negative_zero_nan,
+                                                clip,
+                                                (rm == fp8_rounding_mode::stochastic)>(x, rng));
 #endif
 }
 
-CK_TILE_HOST_DEVICE uint8_t float_to_bf8_sr_raw(float x)
+CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_sr_raw(float x)
 {
     constexpr int seed = 42;
-    uint32_t rng       = prand_generator<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng       = prand_generator_t<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     union
     {
@@ -479,13 +484,15 @@ CK_TILE_HOST_DEVICE uint8_t float_to_bf8_sr_raw(float x)
     constexpr bool negative_zero_nan = true;
     constexpr bool clip              = true;
     constexpr fp8_rounding_mode rm   = fp8_rounding_mode::stochastic;
-    return impl::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == fp8_rounding_mode::stochastic)>(
-            x, rng);
+    return bit_cast<bf8_raw_t>(impl::cast_to_f8<float,
+                                                bf8_t,
+                                                negative_zero_nan,
+                                                clip,
+                                                (rm == fp8_rounding_mode::stochastic)>(x, rng));
 #endif
 }
 
-CK_TILE_HOST_DEVICE uint8_t float_to_fp8_rtn_raw(float x)
+CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_rtn_raw(float x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     float max_fp8 = 240.0f;
@@ -506,12 +513,14 @@ CK_TILE_HOST_DEVICE uint8_t float_to_fp8_rtn_raw(float x)
     constexpr bool clip              = true;
     constexpr fp8_rounding_mode rm   = fp8_rounding_mode::standard;
     constexpr uint32_t rng           = 0;
-    return impl::
-        cast_to_f8<float, fp8_t, negative_zero_nan, clip, (rm == fp8_rounding_mode::stochastic)>(
-            x, rng);
+    return bit_cast<fp8_raw_t>(impl::cast_to_f8<float,
+                                                fp8_t,
+                                                negative_zero_nan,
+                                                clip,
+                                                (rm == fp8_rounding_mode::stochastic)>(x, rng));
 #endif
 }
-CK_TILE_HOST_DEVICE uint8_t float_to_bf8_rtn_raw(float x)
+CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_rtn_raw(float x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     union
@@ -530,30 +539,32 @@ CK_TILE_HOST_DEVICE uint8_t float_to_bf8_rtn_raw(float x)
     constexpr bool clip              = true;
     constexpr fp8_rounding_mode rm   = fp8_rounding_mode::standard;
     constexpr uint32_t rng           = 0;
-    return impl::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == fp8_rounding_mode::stochastic)>(
-            x, rng);
+    return bit_cast<bf8_raw_t>(impl::cast_to_f8<float,
+                                                bf8_t,
+                                                negative_zero_nan,
+                                                clip,
+                                                (rm == fp8_rounding_mode::stochastic)>(x, rng));
 #endif
 }
 
 // clang-format off
-template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
-CK_TILE_HOST_DEVICE uint8_t float_to_fp8_raw(float x, constant<rounding> = {})
+template<fp8_rounding_mode rounding>
+CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_raw(float x, constant<rounding>)
 {
     if      constexpr (rounding == fp8_rounding_mode::standard)   return float_to_fp8_rtn_raw(x);
     else if constexpr (rounding == fp8_rounding_mode::stochastic) return float_to_fp8_sr_raw(x);
-    else return uint8_t{0};
+    else return fp8_raw_t{0};
 }
 
-template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
-CK_TILE_HOST_DEVICE uint8_t float_to_bf8_raw(float x, constant<rounding> = {})
+template<fp8_rounding_mode rounding>
+CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_raw(float x, constant<rounding>)
 {
     if      constexpr (rounding == fp8_rounding_mode::standard)   return float_to_bf8_rtn_raw(x);
     else if constexpr (rounding == fp8_rounding_mode::stochastic) return float_to_bf8_sr_raw(x);
-    else return uint8_t{0};
+    else return bf8_raw_t{0};
 }
 
-CK_TILE_HOST_DEVICE float fp8_to_float_raw(uint8_t x)
+CK_TILE_HOST_DEVICE float fp8_to_float_raw(fp8_raw_t x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     float fval;
@@ -563,11 +574,11 @@ CK_TILE_HOST_DEVICE float fp8_to_float_raw(uint8_t x)
     return fval;
 #else
     constexpr bool negative_zero_nan = true;
-    return impl::cast_from_f8<fp8_t, float, negative_zero_nan>(x);
+    return impl::cast_from_f8<fp8_t, float, negative_zero_nan>(fp8_t::bit_cast(x));
 #endif
 }
 
-CK_TILE_HOST_DEVICE float bf8_to_float_raw(uint8_t x)
+CK_TILE_HOST_DEVICE float bf8_to_float_raw(bf8_raw_t x)
 {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     float fval;
@@ -577,18 +588,18 @@ CK_TILE_HOST_DEVICE float bf8_to_float_raw(uint8_t x)
     return fval;
 #else
     constexpr bool negative_zero_nan = true;
-    return impl::cast_from_f8<bf8_t, float, negative_zero_nan>(x);
+    return impl::cast_from_f8<bf8_t, float, negative_zero_nan>(bf8_t::bit_cast(x));
 #endif
 }
 
-template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
-CK_TILE_HOST_DEVICE float8_e4m3_t float_to_fp8(float x, constant<rounding> = {})
+template<fp8_rounding_mode rounding>
+CK_TILE_HOST_DEVICE float8_e4m3_t float_to_fp8(float x, constant<rounding>)
 {
     return float8_e4m3_t::bit_cast(float_to_fp8_raw(x, constant<rounding>{}));
 }
 
-template<fp8_rounding_mode rounding = static_cast<fp8_rounding_mode>(CK_TILE_FLOAT_TO_FP8_DEFAULT)>
-CK_TILE_HOST_DEVICE float8_e5m2_t float_to_bf8(float x, constant<rounding> = {})
+template<fp8_rounding_mode rounding>
+CK_TILE_HOST_DEVICE float8_e5m2_t float_to_bf8(float x, constant<rounding>)
 {
     return float8_e5m2_t::bit_cast(float_to_bf8_raw(x, constant<rounding>{}));
 }
@@ -604,8 +615,6 @@ CK_TILE_HOST_DEVICE float bf8_to_float(float8_e5m2_t x)
 }
 
 // clang-format on
-using fp8_t = float8_e4m3_t;
-using bf8_t = float8_e5m2_t;
 
 template <typename T>
 struct numeric_utils;
diff --git a/include/ck_tile/core/numeric/half.hpp b/include/ck_tile/core/numeric/half.hpp
index 02cf05a7d1..b22f71c045 100644
--- a/include/ck_tile/core/numeric/half.hpp
+++ b/include/ck_tile/core/numeric/half.hpp
@@ -76,6 +76,9 @@ struct alignas(2) half_t
     constexpr raw_type get() const { return data; }
 };
 
+using fp16_t     = half_t;
+using fp16_raw_t = typename fp16_t::raw_type;
+
 // conversions
 CK_TILE_HOST_DEVICE
 float fp16_to_float_hip(const fp16_hip_t& x)
@@ -282,6 +285,4 @@ half_t exp2(half_t x) { return static_cast<half_t>(exp2f(static_cast<float>(x)))
 CK_TILE_DEVICE
 half_t log(half_t x) { return static_cast<half_t>(__logf(static_cast<float>(x))); };
 
-using fp16_t = half_t;
-
 } // namespace ck_tile
diff --git a/include/ck_tile/core/numeric/math.hpp b/include/ck_tile/core/numeric/math.hpp
index aa7c96b6e6..9615b979d5 100644
--- a/include/ck_tile/core/numeric/math.hpp
+++ b/include/ck_tile/core/numeric/math.hpp
@@ -147,6 +147,9 @@ CK_TILE_HOST_DEVICE constexpr T clamp(const T& x, const T& lowerbound, const T&
     return min(max(x, lowerbound), upperbound);
 }
 
+CK_TILE_HOST inline int clz(uint32_t x) { return __builtin_clz(x); }
+CK_TILE_DEVICE inline int clz(uint32_t x) { return __clz(x); }
+
 // greatest common divisor, aka highest common factor
 CK_TILE_HOST_DEVICE constexpr index_t gcd(index_t x, index_t y)
 {
@@ -222,7 +225,7 @@ struct less
 CK_TILE_HOST_DEVICE constexpr int32_t next_power_of_two(int32_t x)
 {
     // TODO: x need to be 2 ~ 0x7fffffff. 0, 1, or larger than 0x7fffffff will compile fail
-    return 1 << (32 - __builtin_clz(x - 1));
+    return 1 << (32 - clz(x - 1));
 }
 
 template <index_t X>
@@ -243,7 +246,7 @@ CK_TILE_HOST_DEVICE constexpr int32_t integer_log2_floor(int32_t x)
 {
     // TODO: x need to be 1 ~ 0x7fffffff
     // __builtin_clz will produce unexpected result if x is 0;
-    return 31 - __builtin_clz(x);
+    return 31 - clz(x);
 }
 
 CK_TILE_HOST_DEVICE constexpr bool is_power_of_two_integer(int32_t x)
diff --git a/include/ck_tile/core/numeric/type_convert.hpp b/include/ck_tile/core/numeric/type_convert.hpp
index 5eac399bf7..d64f3f4349 100644
--- a/include/ck_tile/core/numeric/type_convert.hpp
+++ b/include/ck_tile/core/numeric/type_convert.hpp
@@ -3,10 +3,22 @@
 
 #pragma once
 
-#include "ck_tile/core/config.hpp"
+#include <stdint.h>
+#include <tuple>
 #include <type_traits>
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/half.hpp"
+#include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/numeric/float8.hpp"
 
 namespace ck_tile {
+
+template <typename Y, typename X>
+CK_TILE_HOST_DEVICE constexpr remove_cvref_t<Y> type_convert(const X& x)
+{
+    return static_cast<Y>(x);
+}
+
 #if 0
 // Convert X to Y, both X and Y are non-const data types.
 template <typename Y,
@@ -15,11 +27,9 @@ template <typename Y,
 CK_TILE_HOST_DEVICE constexpr Y type_convert(X x)
 {
     static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
-
     return static_cast<Y>(x);
 }
 
-// TODO: const version never called, we may never need
 // Convert X to Y, either X or Y is a const data type.
 template <typename Y,
           typename X,
@@ -28,18 +38,29 @@ CK_TILE_HOST_DEVICE constexpr Y type_convert(X x)
 {
     static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
 
-    using NonConstY = std::remove_const_t<Y>;
-    using NonConstX = std::remove_const_t<X>;
-    return static_cast<Y>(type_convert<NonConstY, NonConstX>(x));
+    using non_const_y = std::remove_const_t<Y>;
+    using non_const_x = std::remove_const_t<X>;
+    return static_cast<Y>(type_convert<non_const_y, non_const_x>(x));
 }
-#else
-// compatible way to call conversion operator and constructor of each custom data type
-template <typename Y, typename X>
-CK_TILE_HOST_DEVICE constexpr Y type_convert(X x)
-{
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
 
-    return static_cast<Y>(x);
-}
+#define CK_TILE_TYPE_CONVERT(dtype_, stype_)                                           \
+    template <>                                                                        \
+    inline CK_TILE_HOST_DEVICE constexpr dtype_ type_convert<dtype_, stype_>(stype_ x) \
+    {                                                                                  \
+        return stype_##_to_##dtype_(x);                                                \
+    }
+
+CK_TILE_TYPE_CONVERT(float, fp16_t)
+CK_TILE_TYPE_CONVERT(float, bf16_t)
+CK_TILE_TYPE_CONVERT(float, fp8_t)
+CK_TILE_TYPE_CONVERT(float, bf8_t)
+
+CK_TILE_TYPE_CONVERT(fp16_t, float)
+CK_TILE_TYPE_CONVERT(bf16_t, float)
+CK_TILE_TYPE_CONVERT(fp8_t, float)
+CK_TILE_TYPE_CONVERT(bf8_t, float)
+
+#undef CK_TILE_TYPE_CONVERT
 #endif
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/numeric/vector_type.hpp b/include/ck_tile/core/numeric/vector_type.hpp
index 56baba0567..f20891296b 100644
--- a/include/ck_tile/core/numeric/vector_type.hpp
+++ b/include/ck_tile/core/numeric/vector_type.hpp
@@ -10,295 +10,112 @@
 #include "ck_tile/core/numeric/float8.hpp"
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
 
 namespace ck_tile {
 
-// TODO: the whole content of this file should consider deprecated!
+// we name this as ext_vector purposely, because clang ext_vector_type extention only accept literay
+// basic type to construct a ext_vector_type you must be very careful using this, or will have lot
+// of compiler errors e.g. struct A; using Ax2_t = A __attribute__((ext_vector_type(2)));  -> will
+// have compiler error
+namespace impl {
 template <typename T_, index_t N_>
-struct vector_type
+struct ext_vector
 {
     static constexpr index_t N = N_;
     using value_type           = T_;
     using type                 = value_type __attribute__((ext_vector_type(N))); // this is danguous
-
-    CK_HOST_DEVICE constexpr vector_type()
-    {
-        for(auto i = 0; i < N; i++)
-            data[i] = static_cast<value_type>(0);
-    }
-    CK_HOST_DEVICE constexpr vector_type(type v)
-    {
-        auto& r = reinterpret_cast<const array<value_type, N>&>(v);
-        for(auto i = 0; i < N; i++)
-            data[i] = r.get(i);
-    }
-
-    value_type data[N];
-    CK_HOST_DEVICE static constexpr auto size() { return N; }
-    CK_HOST_DEVICE auto& get() { return data; }
-    CK_HOST_DEVICE const auto& get() const { return data; }
-    CK_HOST_DEVICE auto& get(index_t i) { return data[i]; }
-    CK_HOST_DEVICE const auto& get(index_t i) const { return data[i]; }
-
-    template <index_t I>
-    CK_HOST_DEVICE auto& operator[](number<I>)
-    {
-        return data[I];
-    }
-    template <index_t I>
-    CK_HOST_DEVICE const auto& operator[](number<I>) const
-    {
-        return data[I];
-    }
-    template <index_t I>
-    CK_HOST_DEVICE auto& operator()(number<I>)
-    {
-        return data[I];
-    }
-
-    CK_HOST_DEVICE auto& at(index_t i) { return data[i]; }
-    CK_HOST_DEVICE const auto& at(index_t i) const { return data[i]; }
-    template <index_t I>
-    CK_HOST_DEVICE auto& at()
-    {
-        return data[I];
-    }
-    template <index_t I>
-    CK_HOST_DEVICE const auto& at() const
-    {
-        return data[I];
-    }
-    template <index_t I>
-    CK_HOST_DEVICE auto& at(number<I>)
-    {
-        return data[I];
-    }
-    template <index_t I>
-    CK_HOST_DEVICE const auto& at(number<I>) const
-    {
-        return data[I];
-    }
-
-#define _VT_COMMON_AS()                                      \
-    static_assert(sizeof(value_type) * N % sizeof(Tx) == 0); \
-    constexpr int vx = sizeof(value_type) * N / sizeof(Tx)
-
-    template <typename Tx>
-    CK_HOST_DEVICE auto& get_as()
-    {
-        _VT_COMMON_AS();
-        return reinterpret_cast<array<Tx, vx>&>(data);
-    }
-    template <typename Tx>
-    CK_HOST_DEVICE const auto& get_as() const
-    {
-        _VT_COMMON_AS();
-        return reinterpret_cast<const array<Tx, vx>&>(data);
-    }
-    template <typename Tx>
-    CK_HOST_DEVICE auto& get_as(index_t i)
-    {
-        _VT_COMMON_AS();
-        return reinterpret_cast<array<Tx, vx>&>(data).get(i);
-    }
-    template <typename Tx>
-    CK_HOST_DEVICE const auto& get_as(index_t i) const
-    {
-        _VT_COMMON_AS();
-        return reinterpret_cast<const array<Tx, vx>&>(data).get(i);
-    }
-#undef _VT_COMMON_AS
 };
+} // namespace impl
 
 template <typename T, index_t N>
-struct vector_type_maker
+using ext_vector_t = typename impl::ext_vector<T, N>::type;
+
+// by default, any type will result in a vector_size=1 with scalar_type=T traits.
+// ... unless we have other vector_traits specialization
+template <typename T>
+struct vector_traits
 {
-    using type = vector_type<T, N>;
-};
-
-template <typename T, index_t N0, index_t N1>
-struct vector_type_maker<T __attribute__((ext_vector_type(N1))), N0>
-{
-    using type = vector_type<T, N0 * N1>;
-};
-
-template <typename T, index_t N0, index_t N1>
-struct vector_type_maker<vector_type<T, N1>, N0>
-{
-    using type = vector_type<T, N0 * N1>;
+    using scalar_type                    = remove_cvref_t<T>;
+    static constexpr index_t vector_size = 1;
 };
 
+// specialization for ext_vector_type()
 template <typename T, index_t N>
-using vector_type_maker_t = typename vector_type_maker<T, N>::type;
-
-template <typename T, index_t N>
-CK_HOST_DEVICE constexpr auto make_vector_type(number<N>)
+struct vector_traits<T __attribute__((ext_vector_type(N)))>
 {
-    return typename vector_type_maker<T, N>::type{};
-}
-
-// scalar_type
-template <typename TV>
-struct scalar_type;
-
-// is_scalar_type
-template <typename TV>
-struct is_scalar_type
-{
-    static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
-};
-
-// has_same_scalar_type
-template <typename X, typename Y>
-using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                     typename scalar_type<remove_cvref_t<Y>>::type>;
-
-template <typename T, index_t N>
-struct scalar_type<T __attribute__((ext_vector_type(N)))>
-{
-    using type                           = T;
+    using scalar_type                    = T;
     static constexpr index_t vector_size = N;
 };
 
-template <typename T, index_t N>
-struct scalar_type<vector_type<T, N>>
-{
-    using type                           = T;
-    static constexpr index_t vector_size = N;
-};
-
-//
-template <>
-struct scalar_type<double>
-{
-    using type                           = double;
-    static constexpr index_t vector_size = 1;
-};
-
-template <>
-struct scalar_type<float>
-{
-    using type                           = float;
-    static constexpr index_t vector_size = 1;
-};
-
-template <>
-struct scalar_type<half_t>
-{
-    using type                           = half_t;
-    static constexpr index_t vector_size = 1;
-};
-
-template <>
-struct scalar_type<bhalf_t>
-{
-    using type                           = bhalf_t;
-    static constexpr index_t vector_size = 1;
-};
-
-template <>
-struct scalar_type<int64_t>
-{
-    using type                           = int64_t;
-    static constexpr index_t vector_size = 1;
-};
-
-template <>
-struct scalar_type<int32_t>
-{
-    using type                           = int32_t;
-    static constexpr index_t vector_size = 1;
-};
-
-template <>
-struct scalar_type<int8_t>
-{
-    using type                           = int8_t;
-    static constexpr index_t vector_size = 1;
-};
-
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-template <>
-struct scalar_type<int4_t>
-{
-    using type                           = int4_t;
-    static constexpr index_t vector_size = 1;
-};
-#endif
-
-template <>
-struct scalar_type<fp8_t>
-{
-    using type                           = fp8_t;
-    static constexpr index_t vector_size = 1;
-};
-
-template <>
-struct scalar_type<bf8_t>
-{
-    using type                           = bf8_t;
-    static constexpr index_t vector_size = 1;
-};
-
 // below are some pre-defines of ext_vector_type
+// attention! 2 vector type could be just the same type
 // fp64
-using double2_t = typename vector_type<double, 2>::type;
-using double4_t = typename vector_type<double, 4>::type;
+using fp64x2_t = double __attribute__((ext_vector_type(2)));
+using fp64x4_t = double __attribute__((ext_vector_type(4)));
 
 // fp32
-using float2_t  = typename vector_type<float, 2>::type;
-using float4_t  = typename vector_type<float, 4>::type;
-using float8_t  = typename vector_type<float, 8>::type;
-using float16_t = typename vector_type<float, 16>::type;
-using float32_t = typename vector_type<float, 32>::type;
-using float64_t = typename vector_type<float, 64>::type;
+using fp32x2_t  = float __attribute__((ext_vector_type(2)));
+using fp32x4_t  = float __attribute__((ext_vector_type(4)));
+using fp32x8_t  = float __attribute__((ext_vector_type(8)));
+using fp32x16_t = float __attribute__((ext_vector_type(16)));
+using fp32x32_t = float __attribute__((ext_vector_type(32)));
+using fp32x64_t = float __attribute__((ext_vector_type(64)));
 
 // fp16
-using half2_t  = typename vector_type<half_t, 2>::type;
-using half4_t  = typename vector_type<half_t, 4>::type;
-using half8_t  = typename vector_type<half_t, 8>::type;
-using half16_t = typename vector_type<half_t, 16>::type;
-using half32_t = typename vector_type<half_t, 32>::type;
-using half64_t = typename vector_type<half_t, 64>::type;
+using fp16x2_t  = fp16_raw_t __attribute__((ext_vector_type(2)));
+using fp16x4_t  = fp16_raw_t __attribute__((ext_vector_type(4)));
+using fp16x8_t  = fp16_raw_t __attribute__((ext_vector_type(8)));
+using fp16x16_t = fp16_raw_t __attribute__((ext_vector_type(16)));
+using fp16x32_t = fp16_raw_t __attribute__((ext_vector_type(32)));
+using fp16x64_t = fp16_raw_t __attribute__((ext_vector_type(64)));
 
 // bfp16
-using bhalf2_t  = typename vector_type<bhalf_t, 2>::type;
-using bhalf4_t  = typename vector_type<bhalf_t, 4>::type;
-using bhalf8_t  = typename vector_type<bhalf_t, 8>::type;
-using bhalf16_t = typename vector_type<bhalf_t, 16>::type;
-using bhalf32_t = typename vector_type<bhalf_t, 32>::type;
-using bhalf64_t = typename vector_type<bhalf_t, 64>::type;
+using bf16x2_t  = bf16_raw_t __attribute__((ext_vector_type(2)));
+using bf16x4_t  = bf16_raw_t __attribute__((ext_vector_type(4)));
+using bf16x8_t  = bf16_raw_t __attribute__((ext_vector_type(8)));
+using bf16x16_t = bf16_raw_t __attribute__((ext_vector_type(16)));
+using bf16x32_t = bf16_raw_t __attribute__((ext_vector_type(32)));
+using bf16x64_t = bf16_raw_t __attribute__((ext_vector_type(64)));
 
 // i32
-using int32x2_t  = typename vector_type<int32_t, 2>::type;
-using int32x4_t  = typename vector_type<int32_t, 4>::type;
-using int32x8_t  = typename vector_type<int32_t, 8>::type;
-using int32x16_t = typename vector_type<int32_t, 16>::type;
-using int32x32_t = typename vector_type<int32_t, 32>::type;
-using int32x64_t = typename vector_type<int32_t, 64>::type;
+using int32x2_t  = int32_t __attribute__((ext_vector_type(2)));
+using int32x4_t  = int32_t __attribute__((ext_vector_type(4)));
+using int32x8_t  = int32_t __attribute__((ext_vector_type(8)));
+using int32x16_t = int32_t __attribute__((ext_vector_type(16)));
+using int32x32_t = int32_t __attribute__((ext_vector_type(32)));
+using int32x64_t = int32_t __attribute__((ext_vector_type(64)));
+
+// i16
+using int16x2_t  = int16_t __attribute__((ext_vector_type(2)));
+using int16x4_t  = int16_t __attribute__((ext_vector_type(4)));
+using int16x8_t  = int16_t __attribute__((ext_vector_type(8)));
+using int16x16_t = int16_t __attribute__((ext_vector_type(16)));
+using int16x32_t = int16_t __attribute__((ext_vector_type(32)));
+using int16x64_t = int16_t __attribute__((ext_vector_type(64)));
 
 // i8
-using int8x2_t  = typename vector_type<int8_t, 2>::type;
-using int8x4_t  = typename vector_type<int8_t, 4>::type;
-using int8x8_t  = typename vector_type<int8_t, 8>::type;
-using int8x16_t = typename vector_type<int8_t, 16>::type;
-using int8x32_t = typename vector_type<int8_t, 32>::type;
-using int8x64_t = typename vector_type<int8_t, 64>::type;
+using int8x2_t  = int8_t __attribute((ext_vector_type(2)));
+using int8x4_t  = int8_t __attribute((ext_vector_type(4)));
+using int8x8_t  = int8_t __attribute((ext_vector_type(8)));
+using int8x16_t = int8_t __attribute((ext_vector_type(16)));
+using int8x32_t = int8_t __attribute((ext_vector_type(32)));
+using int8x64_t = int8_t __attribute((ext_vector_type(64)));
 
 // f8
-using fp8x2_t  = typename vector_type<fp8_t, 2>::type;
-using fp8x4_t  = typename vector_type<fp8_t, 4>::type;
-using fp8x8_t  = typename vector_type<fp8_t, 8>::type;
-using fp8x16_t = typename vector_type<fp8_t, 16>::type;
-using fp8x32_t = typename vector_type<fp8_t, 32>::type;
-using fp8x64_t = typename vector_type<fp8_t, 64>::type;
+using fp8x2_t  = fp8_raw_t __attribute((ext_vector_type(2)));
+using fp8x4_t  = fp8_raw_t __attribute((ext_vector_type(4)));
+using fp8x8_t  = fp8_raw_t __attribute((ext_vector_type(8)));
+using fp8x16_t = fp8_raw_t __attribute((ext_vector_type(16)));
+using fp8x32_t = fp8_raw_t __attribute((ext_vector_type(32)));
+using fp8x64_t = fp8_raw_t __attribute((ext_vector_type(64)));
 
 // bf8
-using bf8x2_t  = typename vector_type<bf8_t, 2>::type;
-using bf8x4_t  = typename vector_type<bf8_t, 4>::type;
-using bf8x8_t  = typename vector_type<bf8_t, 8>::type;
-using bf8x16_t = typename vector_type<bf8_t, 16>::type;
-using bf8x32_t = typename vector_type<bf8_t, 32>::type;
-using bf8x64_t = typename vector_type<bf8_t, 64>::type;
+using bf8x2_t  = bf8_raw_t __attribute((ext_vector_type(2)));
+using bf8x4_t  = bf8_raw_t __attribute((ext_vector_type(4)));
+using bf8x8_t  = bf8_raw_t __attribute((ext_vector_type(8)));
+using bf8x16_t = bf8_raw_t __attribute((ext_vector_type(16)));
+using bf8x32_t = bf8_raw_t __attribute((ext_vector_type(32)));
+using bf8x64_t = bf8_raw_t __attribute((ext_vector_type(64)));
 
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index eca5efdffb..efb4f2ad43 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "ck_tile/core/config.hpp"
-#include "ck_tile/core/arch/amd_address_space.hpp"
+#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
 #include "ck_tile/core/container/array.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
@@ -12,6 +12,7 @@
 #include "ck_tile/core/numeric/float8.hpp"
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
 
 namespace ck_tile {
 
@@ -22,13 +23,13 @@ namespace ck_tile {
 // FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
 //        transforms of tensor_view/Tensor
 // FIXME: amd_buffer_coherence_enum is only meaningful for buffer addressing. Need to split
-// BufferView definition for different memory address space (Global/GenericLds/Vgpr)
+// buffer_view definition for different memory address space (Global/GenericLds/Vgpr)
 template <address_space_enum BufferAddressSpace,
           typename T,
           typename BufferSizeType,
           bool InvalidElementUseNumericalZeroValue,
           amd_buffer_coherence_enum Coherence = amd_buffer_coherence_enum::coherence_default>
-struct BufferView;
+struct buffer_view;
 
 // Address Space: generic
 // T may be scalar or vector
@@ -82,17 +83,18 @@ struct buffer_view<address_space_enum::generic,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              bool oob_conditional_check          = true,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE constexpr auto
     get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -123,19 +125,20 @@ struct buffer_view<address_space_enum::generic,
     }
 
     // i is offset of T, not X. i should be aligned to X
-    template <InMemoryDataOperationEnum Op,
+    template <memory_operation_enum Op,
               typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
     {
-        if constexpr(Op == InMemoryDataOperationEnum::set)
+        if constexpr(Op == memory_operation_enum::set)
         {
             this->template set<X>(i, is_valid_element, x);
         }
-        // FIXME: remove InMemoryDataOperationEnum::Add
-        else if constexpr(Op == InMemoryDataOperationEnum::Add)
+        // FIXME: remove memory_operation_enum::add
+        else if constexpr(Op == memory_operation_enum::add)
         {
             auto tmp = this->template get<X>(i, is_valid_element);
             this->template set<X>(i, is_valid_element, x + tmp);
@@ -144,15 +147,16 @@ struct buffer_view<address_space_enum::generic,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -253,17 +257,18 @@ struct buffer_view<address_space_enum::global,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              bool oob_conditional_check          = true,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE constexpr auto
     get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -326,16 +331,17 @@ struct buffer_view<address_space_enum::global,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              bool oob_conditional_check          = true,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE constexpr auto
     get_raw(remove_cvref_t<X>& dst, index_t i, bool is_valid_element) const
     {
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -348,15 +354,16 @@ struct buffer_view<address_space_enum::global,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE constexpr auto
     async_get(remove_cvref_t<T>* smem, index_t i, bool /*is_valid_element*/) const
     {
         // X is vector of T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -368,27 +375,28 @@ struct buffer_view<address_space_enum::global,
     }
 
     // i is offset of T, not X. i should be aligned to X
-    template <InMemoryDataOperationEnum Op,
+    template <memory_operation_enum Op,
               typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
     {
-        if constexpr(Op == InMemoryDataOperationEnum::set)
+        if constexpr(Op == memory_operation_enum::set)
         {
             this->template set<X>(i, is_valid_element, x);
         }
-        else if constexpr(Op == InMemoryDataOperationEnum::atomic_add)
+        else if constexpr(Op == memory_operation_enum::atomic_add)
         {
             this->template atomic_add<X>(i, is_valid_element, x);
         }
-        else if constexpr(Op == InMemoryDataOperationEnum::atomic_max)
+        else if constexpr(Op == memory_operation_enum::atomic_max)
         {
             this->template atomic_max<X>(i, is_valid_element, x);
         }
-        // FIXME: remove InMemoryDataOperationEnum::Add
-        else if constexpr(Op == InMemoryDataOperationEnum::Add)
+        // FIXME: remove memory_operation_enum::add
+        else if constexpr(Op == memory_operation_enum::add)
         {
             auto tmp = this->template get<X>(i, is_valid_element);
             this->template set<X>(i, is_valid_element, x + tmp);
@@ -399,16 +407,17 @@ struct buffer_view<address_space_enum::global,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              bool oob_conditional_check          = true,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -443,16 +452,17 @@ struct buffer_view<address_space_enum::global,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              bool oob_conditional_check          = true,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void set_raw(index_t i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -463,17 +473,18 @@ struct buffer_view<address_space_enum::global,
     }
 
     template <typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void atomic_add(index_t i, bool is_valid_element, const X& x)
     {
-        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
+        using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
 
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -482,15 +493,16 @@ struct buffer_view<address_space_enum::global,
 
 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
         bool constexpr use_amd_buffer_addressing =
-            is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
-            is_same_v<remove_cvref_t<scalar_t>, float> ||
-            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
+            std::is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
+            std::is_same_v<remove_cvref_t<scalar_t>, float> ||
+            (std::is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
 #elif CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
-        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
+        bool constexpr use_amd_buffer_addressing =
+            std::is_same_v<remove_cvref_t<scalar_t>, int32_t>;
 #elif(!CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
         bool constexpr use_amd_buffer_addressing =
-            is_same_v<remove_cvref_t<scalar_t>, float> ||
-            (is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
+            std::is_same_v<remove_cvref_t<scalar_t>, float> ||
+            (std::is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
 #else
         bool constexpr use_amd_buffer_addressing = false;
 #endif
@@ -512,15 +524,16 @@ struct buffer_view<address_space_enum::global,
     }
 
     template <typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void atomic_max(index_t i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -528,8 +541,8 @@ struct buffer_view<address_space_enum::global,
         static_assert(get_address_space() == address_space_enum::global, "only support global mem");
 
 #if CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
-        using scalar_t                           = typename scalar_type<remove_cvref_t<T>>::type;
-        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, double>;
+        using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
+        bool constexpr use_amd_buffer_addressing = std::is_same_v<remove_cvref_t<scalar_t>, double>;
 #else
         bool constexpr use_amd_buffer_addressing = false;
 #endif
@@ -628,17 +641,18 @@ struct buffer_view<address_space_enum::lds,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              bool oob_conditional_check          = true,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE constexpr auto
     get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -669,19 +683,20 @@ struct buffer_view<address_space_enum::lds,
     }
 
     // i is offset of T, not X. i should be aligned to X
-    template <InMemoryDataOperationEnum Op,
+    template <memory_operation_enum Op,
               typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
     {
-        if constexpr(Op == InMemoryDataOperationEnum::set)
+        if constexpr(Op == memory_operation_enum::set)
         {
             this->template set<X>(i, is_valid_element, x);
         }
-        // FIXME: remove InMemoryDataOperationEnum::Add
-        else if constexpr(Op == InMemoryDataOperationEnum::Add)
+        // FIXME: remove memory_operation_enum::add
+        else if constexpr(Op == memory_operation_enum::add)
         {
             auto tmp = this->template get<X>(i, is_valid_element);
             this->template set<X>(i, is_valid_element, x + tmp);
@@ -690,15 +705,16 @@ struct buffer_view<address_space_enum::lds,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -709,7 +725,8 @@ struct buffer_view<address_space_enum::lds,
         bool constexpr workaround_int8_ds_write_issue = false;
 #endif
 
-        if constexpr(is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
+        if constexpr(std::is_same<typename vector_traits<remove_cvref_t<T>>::scalar_type,
+                                  int8_t>::value &&
                      workaround_int8_ds_write_issue)
         {
             if(is_valid_element)
@@ -718,83 +735,83 @@ struct buffer_view<address_space_enum::lds,
                 // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
                 // ds_write_b128
                 // TODO: remove this after compiler fix
-                static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
-                               is_same<remove_cvref_t<X>, int8_t>::value) ||
-                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   is_same<remove_cvref_t<X>, int8x2_t>::value) ||
-                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   is_same<remove_cvref_t<X>, int8x4_t>::value) ||
-                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   is_same<remove_cvref_t<X>, int8x8_t>::value) ||
-                                  (is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   is_same<remove_cvref_t<X>, int8x16_t>::value) ||
-                                  (is_same<remove_cvref_t<T>, int8x4_t>::value &&
-                                   is_same<remove_cvref_t<X>, int8x4_t>::value) ||
-                                  (is_same<remove_cvref_t<T>, int8x8_t>::value &&
-                                   is_same<remove_cvref_t<X>, int8x8_t>::value) ||
-                                  (is_same<remove_cvref_t<T>, int8x16_t>::value &&
-                                   is_same<remove_cvref_t<X>, int8x16_t>::value),
+                static_assert((std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                               std::is_same<remove_cvref_t<X>, int8_t>::value) ||
+                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                   std::is_same<remove_cvref_t<X>, int8x2_t>::value) ||
+                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                   std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
+                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                   std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
+                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                   std::is_same<remove_cvref_t<X>, int8x16_t>::value) ||
+                                  (std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
+                                   std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
+                                  (std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
+                                   std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
+                                  (std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
+                                   std::is_same<remove_cvref_t<X>, int8x16_t>::value),
                               "wrong! not implemented for this combination, please add "
                               "implementation");
 
-                if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
-                             is_same<remove_cvref_t<X>, int8_t>::value)
+                if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                             std::is_same<remove_cvref_t<X>, int8_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int8_t*>(&x);
                 }
-                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  is_same<remove_cvref_t<X>, int8x2_t>::value)
+                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                  std::is_same<remove_cvref_t<X>, int8x2_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int16_t*>(&x);
                 }
-                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
+                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32_t*>(&x);
                 }
-                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
+                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x2_t*>(&x);
                 }
-                else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
+                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
+                                  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x4_t*>(&x);
                 }
-                else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
-                                  is_same<remove_cvref_t<X>, int8x4_t>::value)
+                else if constexpr(std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
+                                  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32_t*>(&x);
                 }
-                else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
-                                  is_same<remove_cvref_t<X>, int8x8_t>::value)
+                else if constexpr(std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
+                                  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x2_t*>(&x);
                 }
-                else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
-                                  is_same<remove_cvref_t<X>, int8x16_t>::value)
+                else if constexpr(std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
+                                  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
@@ -899,17 +916,18 @@ struct buffer_view<address_space_enum::vgpr,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              bool oob_conditional_check          = true,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE constexpr auto
     get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -940,19 +958,20 @@ struct buffer_view<address_space_enum::vgpr,
     }
 
     // i is offset of T, not X. i should be aligned to X
-    template <InMemoryDataOperationEnum Op,
+    template <memory_operation_enum Op,
               typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
     {
-        if constexpr(Op == InMemoryDataOperationEnum::set)
+        if constexpr(Op == memory_operation_enum::set)
         {
             this->template set<X>(i, is_valid_element, x);
         }
-        // FIXME: remove InMemoryDataOperationEnum::Add
-        else if constexpr(Op == InMemoryDataOperationEnum::Add)
+        // FIXME: remove memory_operation_enum::add
+        else if constexpr(Op == memory_operation_enum::add)
         {
             auto tmp = this->template get<X>(i, is_valid_element);
             this->template set<X>(i, is_valid_element, x + tmp);
@@ -961,15 +980,16 @@ struct buffer_view<address_space_enum::vgpr,
 
     // i is offset of T, not X. i should be aligned to X
     template <typename X,
-              typename std::enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                              typename scalar_type<remove_cvref_t<T>>::type>::value,
-                                      bool>::type = false>
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
     CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
     {
         // X contains multiple T
-        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
 
-        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
 
         static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
                       "wrong! X should contain multiple T");
@@ -1029,7 +1049,7 @@ template <address_space_enum BufferAddressSpace,
           typename T,
           typename BufferSizeType,
           typename X,
-          typename std::enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value,
+          typename std::enable_if<std::is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value,
                                   bool>::type = false>
 CK_TILE_HOST_DEVICE constexpr auto
 make_buffer_view(T* p, BufferSizeType buffer_size, X invalid_element_value)
diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp
index 1d9c7c9c79..288a60602a 100644
--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
@@ -11,6 +11,9 @@
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
+#include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/tensor/null_tile_window.hpp"
+#include "ck_tile/core/tensor/null_tensor.hpp"
 
 namespace ck_tile {
 
@@ -65,13 +68,13 @@ CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0)
 }
 
 template <typename WindowLengths>
-CK_TILE_DEVICE auto load_tile(const NullTileWindow<WindowLengths>&)
+CK_TILE_DEVICE auto load_tile(const null_tile_window<WindowLengths>&)
 {
-    return NullTensor{};
+    return null_tensor{};
 }
 
 template <typename T, typename WindowLengths>
-CK_TILE_DEVICE auto load_tile_raw(T& /*null_tile*/, const NullTileWindow<WindowLengths>&)
+CK_TILE_DEVICE auto load_tile_raw(T& /*null_tile*/, const null_tile_window<WindowLengths>&)
 {
 }
 
diff --git a/include/ck_tile/core/tensor/null_tile_window.hpp b/include/ck_tile/core/tensor/null_tile_window.hpp
index ad7dd072dc..89806203ab 100644
--- a/include/ck_tile/core/tensor/null_tile_window.hpp
+++ b/include/ck_tile/core/tensor/null_tile_window.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
+#include "ck_tile/core/tensor/tensor_view.hpp"
 
 namespace ck_tile {
 
diff --git a/include/ck_tile/core/tensor/shuffle_tile.hpp b/include/ck_tile/core/tensor/shuffle_tile.hpp
index 43a8a38e89..edf3e6eebb 100644
--- a/include/ck_tile/core/tensor/shuffle_tile.hpp
+++ b/include/ck_tile/core/tensor/shuffle_tile.hpp
@@ -8,11 +8,13 @@
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/algorithm/space_filling_curve.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/container/statically_indexed_array.hpp"
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 #include "ck_tile/core/tensor/tile_elementwise.hpp"
+#include "ck_tile/core/utility/transpose_vectors.hpp"
 
 namespace ck_tile {
 namespace detail {
@@ -74,8 +76,8 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
     constexpr index_t num_vec_in  = vec_length_out;
     constexpr index_t num_vec_out = vec_length_in;
 
-    using InVec  = vector_type<DataType, vec_length_in>;
-    using OutVec = vector_type<DataType, vec_length_out>;
+    using InVec  = array<DataType, vec_length_in>;
+    using OutVec = array<DataType, vec_length_out>;
 
     using InVecType  = typename InVec::type;
     using OutVecType = typename OutVec::type;
@@ -114,7 +116,7 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
 
             constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
 
-            in_vectors(i).template AsType<InVecType>()(I0) =
+            in_vectors(i).template get_as<InVecType>()(I0) =
                 in_tensor.get_thread_buffer().template get_as<InVecType>(number<in_offset>{});
         });
 
@@ -134,7 +136,7 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
 
             out_tensor.get_thread_buffer().template set_as<OutVecType>(
                 number<out_offset / sizeof(OutVecType)>{},
-                out_vectors[i].template AsType<OutVecType>()[I0]);
+                out_vectors[i].template get_as<OutVecType>()[I0]);
         });
     });
 }
diff --git a/include/ck_tile/core/tensor/slice_tile.hpp b/include/ck_tile/core/tensor/slice_tile.hpp
index 35ef4ac405..59f94a2796 100644
--- a/include/ck_tile/core/tensor/slice_tile.hpp
+++ b/include/ck_tile/core/tensor/slice_tile.hpp
@@ -84,7 +84,7 @@ set_slice_tile(static_distributed_tensor<DstDataType_, DstStaticTileDistribution
     constexpr auto sliced_y_origins = sliced_dstr_yidx_ylen.template at<1>();
     constexpr auto sliced_y_lengths = sliced_dstr_yidx_ylen.template at<2>();
 
-    static_assert(is_same_v<decltype(sliced_dstr), DstDistribution>, "wrong!");
+    static_assert(std::is_same_v<decltype(sliced_dstr), DstDistribution>, "wrong!");
 
     dst_tile.SetSlicedThreadData(sliced_y_origins, sliced_y_lengths, src_tile.get_thread_buffer());
 }
diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
index 1f90ff2b95..0c9e0debb1 100644
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -11,6 +11,7 @@
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
 
 namespace ck_tile {
 
diff --git a/include/ck_tile/core/tensor/store_tile.hpp b/include/ck_tile/core/tensor/store_tile.hpp
index 6563f75a06..c12ad883d9 100644
--- a/include/ck_tile/core/tensor/store_tile.hpp
+++ b/include/ck_tile/core/tensor/store_tile.hpp
@@ -25,7 +25,7 @@ store_tile(tile_window_with_static_lengths<BottomTensorView_, WindowLengths_>& t
     using DataType = remove_cvref_t<typename BottomTensorView_::DataType>;
     using TileDstr = remove_cvref_t<TileDistribution_>;
 
-    static_assert(is_same_v<remove_cvref_t<DataType_>, DataType>, "wrong!");
+    static_assert(std::is_same_v<remove_cvref_t<DataType_>, DataType>, "wrong!");
 
     constexpr auto tile_dstr = TileDstr{};
 
@@ -48,7 +48,7 @@ store_tile_raw(tile_window_with_static_lengths<BottomTensorView_, WindowLengths_
     using DataType = remove_cvref_t<typename BottomTensorView_::DataType>;
     using TileDstr = remove_cvref_t<TileDistribution_>;
 
-    static_assert(is_same_v<remove_cvref_t<DataType_>, DataType>, "wrong!");
+    static_assert(std::is_same_v<remove_cvref_t<DataType_>, DataType>, "wrong!");
 
     constexpr auto tile_dstr = TileDstr{};
 
diff --git a/include/ck_tile/core/tensor/tensor_adaptor.hpp b/include/ck_tile/core/tensor/tensor_adaptor.hpp
index 726696e9d7..2a3ecd8f70 100644
--- a/include/ck_tile/core/tensor/tensor_adaptor.hpp
+++ b/include/ck_tile/core/tensor/tensor_adaptor.hpp
@@ -717,7 +717,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
         constexpr auto encoded_top_dims    = encoded_tensor_adaptor.template at<4>();             \
         constexpr index_t num_top_dim      = encoded_tensor_adaptor.template at<5>();             \
                                                                                                   \
-        constexpr auto trans = [&encoded_transforms, &num_transform]() {                          \
+        constexpr auto trans = [&encoded_transforms]() {                          \
             return generate_tuple(                                                                \
                 [&encoded_transforms](auto i) constexpr {                                         \
                     constexpr auto name        = encoded_transforms[i].template at<0>();          \
@@ -725,7 +725,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
                     constexpr auto num_low_dim = encoded_transforms[i].template at<2>();          \
                     constexpr auto num_up_dim  = encoded_transforms[i].template at<4>();          \
                                                                                                   \
-                    STATIC_ASSERT(name == cood_transform_enum::PassThrough ||                     \
+                    STATIC_ASSERT(name == cood_transform_enum::pass_through ||                    \
                                       name == cood_transform_enum::pad ||                         \
                                       name == cood_transform_enum::embed ||                       \
                                       name == cood_transform_enum::merge ||                       \
@@ -733,7 +733,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
                                       name == cood_transform_enum::replicate,                     \
                                   "");                                                            \
                                                                                                   \
-                    if constexpr(name == cood_transform_enum::PassThrough)                        \
+                    if constexpr(name == cood_transform_enum::pass_through)                       \
                     {                                                                             \
                         index_t pos  = 0;                                                         \
                         auto low_len = meta_data.template pop<index_t>(pos);                      \
@@ -841,7 +841,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
         constexpr auto encoded_top_dims    = encoded_tensor_adaptor.template at<4>();              \
         constexpr index_t num_top_dim      = encoded_tensor_adaptor.template at<5>();              \
                                                                                                    \
-        constexpr auto trans = [&encoded_transforms, &num_transform]() {                           \
+        constexpr auto trans = [&encoded_transforms]() {                           \
             return generate_tuple(                                                                 \
                 [&encoded_transforms](auto i) constexpr {                                          \
                     constexpr auto name        = encoded_transforms[i].template at<0>();           \
@@ -849,7 +849,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
                     constexpr auto num_low_dim = encoded_transforms[i].template at<2>();           \
                     constexpr auto num_up_dim  = encoded_transforms[i].template at<4>();           \
                                                                                                    \
-                    STATIC_ASSERT(name == cood_transform_enum::PassThrough ||                      \
+                    STATIC_ASSERT(name == cood_transform_enum::pass_through ||                     \
                                       name == cood_transform_enum::pad ||                          \
                                       name == cood_transform_enum::embed ||                        \
                                       name == cood_transform_enum::merge ||                        \
@@ -857,7 +857,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
                                       name == cood_transform_enum::replicate,                      \
                                   "");                                                             \
                                                                                                    \
-                    if constexpr(name == cood_transform_enum::PassThrough)                         \
+                    if constexpr(name == cood_transform_enum::pass_through)                        \
                     {                                                                              \
                         constexpr index_t low_len = meta_data.template get<index_t>(0);            \
                                                                                                    \
@@ -912,7 +912,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
                 number<num_transform>{});                                                          \
         }();                                                                                       \
                                                                                                    \
-        constexpr auto low_dim_idss = [&encoded_transforms, &num_transform]() {                    \
+        constexpr auto low_dim_idss = [&encoded_transforms]() {                    \
             return generate_tuple(                                                                 \
                 [&encoded_transforms](auto i) {                                                    \
                     constexpr auto num_low_dim = encoded_transforms[i].template at<2>();           \
@@ -923,7 +923,7 @@ CK_TILE_HOST_DEVICE constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
                 number<num_transform>());                                                          \
         }();                                                                                       \
                                                                                                    \
-        constexpr auto up_dim_idss = [&encoded_transforms, &num_transform] {                       \
+        constexpr auto up_dim_idss = [&encoded_transforms] {                       \
             return generate_tuple(                                                                 \
                 [&encoded_transforms](auto i) {                                                    \
                     constexpr auto num_up_dim = encoded_transforms[i].template at<4>();            \
diff --git a/include/ck_tile/core/tensor/tensor_descriptor.hpp b/include/ck_tile/core/tensor/tensor_descriptor.hpp
index 0ff9210e5f..aa9cf108c5 100644
--- a/include/ck_tile/core/tensor/tensor_descriptor.hpp
+++ b/include/ck_tile/core/tensor/tensor_descriptor.hpp
@@ -299,7 +299,7 @@ template <typename... Lengths,
 CK_TILE_HOST_DEVICE constexpr auto
 make_naive_tensor_descriptor_with_offset(const tuple<Lengths...>& lengths,
                                          const tuple<Strides...>& strides,
-                                         const offset& offset,
+                                         const offset& os,
                                          number<GuaranteedLastDimensionVectorLength> = number<-1>{},
                                          number<GuaranteedLastDimensionVectorStride> = number<-1>{})
 {
@@ -307,7 +307,7 @@ make_naive_tensor_descriptor_with_offset(const tuple<Lengths...>& lengths,
         const auto element_space_size = detail::calculate_element_space_size_impl(
             lengths, strides, number<0>{}, long_number<1>{});
 
-        const auto transforms = make_tuple(make_offset_transform(element_space_size, offset));
+        const auto transforms = make_tuple(make_offset_transform(element_space_size, os));
 
         constexpr auto low_dim_hidden_idss = make_tuple(sequence<0>{});
 
@@ -383,12 +383,12 @@ make_naive_tensor_descriptor_packed(const tuple<Lengths...>& lengths,
 
 template <typename... Lengths,
           typename... Strides,
-          typename offset,
+          typename Offset,
           index_t GuaranteedLastDimensionVectorLength                                   = -1,
           typename std::enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
 CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_descriptor_packed_with_offset(
     const tuple<Lengths...>& lengths,
-    const offset& offset,
+    const Offset& offset,
     number<GuaranteedLastDimensionVectorLength> = number<-1>{})
 {
     const auto desc_0 = [&]() {
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index 9a10ca3af3..e37bd806de 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -3,12 +3,15 @@
 
 #pragma once
 
+#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/algorithm/coordinate_transform.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/tensor/tensor_descriptor.hpp"
+#include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
 namespace ck_tile {
@@ -16,15 +19,16 @@ namespace ck_tile {
 template <typename BufferView_, typename TensorDesc_>
 struct tensor_view
 {
-    using BufferView  = remove_reference_t<BufferView_>;
-    using DataType    = typename BufferView::type;
+    using buffer_view = remove_reference_t<BufferView_>;
+    using DataType    = typename buffer_view::type;
     using TensorDesc  = remove_cvref_t<TensorDesc_>;
     using TensorIndex = array<index_t, TensorDesc::get_num_of_top_dimension()>;
     using TensorCoord = decltype(make_tensor_coordinate(TensorDesc{}, TensorIndex{}));
 
     CK_TILE_HOST_DEVICE constexpr tensor_view() = default;
 
-    CK_TILE_HOST_DEVICE constexpr tensor_view(const BufferView& buffer_view, const TensorDesc& desc)
+    CK_TILE_HOST_DEVICE constexpr tensor_view(const buffer_view& buffer_view,
+                                              const TensorDesc& desc)
         : buf_{buffer_view}, desc_{desc}
     {
     }
@@ -58,12 +62,12 @@ struct tensor_view
 #endif
     // X is vector of DataType.
     // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
-    template <
-        typename X,
-        bool oob_conditional_check          = true,
-        typename std::enable_if<is_same_v<typename scalar_type<remove_cvref_t<X>>::type,
-                                          typename scalar_type<remove_cvref_t<DataType>>::type>,
-                                bool>::type = false>
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
     CK_TILE_HOST_DEVICE constexpr remove_cvref_t<X>
     get_vectorized_elements(const TensorCoord& coord,
                             bool_constant<oob_conditional_check> = {}) const
@@ -76,12 +80,12 @@ struct tensor_view
 
     // X is vector of DataType.
     // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
-    template <
-        typename X,
-        bool oob_conditional_check          = true,
-        typename std::enable_if<is_same_v<typename scalar_type<remove_cvref_t<X>>::type,
-                                          typename scalar_type<remove_cvref_t<DataType>>::type>,
-                                bool>::type = false>
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
     CK_TILE_HOST_DEVICE void
     get_vectorized_elements_raw(remove_cvref_t<X>& dst,
                                 const TensorCoord& coord,
@@ -93,11 +97,11 @@ struct tensor_view
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord));
     }
 
-    template <
-        typename X,
-        typename std::enable_if<is_same_v<typename scalar_type<remove_cvref_t<X>>::type,
-                                          typename scalar_type<remove_cvref_t<DataType>>::type>,
-                                bool>::type = false>
+    template <typename X,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
     CK_TILE_HOST_DEVICE constexpr void async_get_vectorized_elements(remove_cvref_t<DataType>* smem,
                                                                      const TensorCoord& coord) const
     {
@@ -106,12 +110,12 @@ struct tensor_view
 
     // X is vector of DataType.
     // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
-    template <
-        typename X,
-        bool oob_conditional_check          = true,
-        typename std::enable_if<is_same_v<typename scalar_type<remove_cvref_t<X>>::type,
-                                          typename scalar_type<remove_cvref_t<DataType>>::type>,
-                                bool>::type = false>
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
     CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements(
         const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
     {
@@ -121,12 +125,12 @@ struct tensor_view
             x);
     }
 
-    template <
-        typename X,
-        bool oob_conditional_check          = true,
-        typename std::enable_if<is_same_v<typename scalar_type<remove_cvref_t<X>>::type,
-                                          typename scalar_type<remove_cvref_t<DataType>>::type>,
-                                bool>::type = false>
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false>
     CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements_raw(
         const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
     {
@@ -153,7 +157,7 @@ struct tensor_view
     }
 
     // member
-    BufferView buf_;
+    buffer_view buf_;
     TensorDesc desc_;
 };
 
@@ -162,7 +166,7 @@ struct null_tensor_view
 {
 };
 
-template <AddressSpaceEnum BufferAddressSpace = AddressSpaceEnum::Generic,
+template <address_space_enum BufferAddressSpace = address_space_enum::generic,
           typename DataType,
           typename... Ts>
 CK_TILE_HOST_DEVICE constexpr auto make_tensor_view(DataType* p,
@@ -173,7 +177,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_tensor_view(DataType* p,
     return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
 }
 
-template <AddressSpaceEnum BufferAddressSpace = AddressSpaceEnum::Generic,
+template <address_space_enum BufferAddressSpace = address_space_enum::generic,
           typename DataType,
           typename... Lengths,
           typename... Strides,
@@ -197,7 +201,7 @@ make_naive_tensor_view(DataType* p,
     return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
 }
 
-template <AddressSpaceEnum BufferAddressSpace = AddressSpaceEnum::Generic,
+template <address_space_enum BufferAddressSpace = address_space_enum::generic,
           typename DataType,
           typename... Lengths,
           index_t GuaranteedLastDimensionVectorLength = -1>
@@ -228,19 +232,19 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tensor_view(const OldTensorView& ol
                                                 NewLowerDimensionOldVisibleIdss{},
                                                 NewUpperDimensionNewVisibleIdss{});
 
-    return tensor_view<typename OldTensorView::BufferView, remove_cvref_t<decltype(new_desc)>>{
+    return tensor_view<typename OldTensorView::buffer_view, remove_cvref_t<decltype(new_desc)>>{
         old_tensor_view.buf_, new_desc};
 }
 
-template <typename tensor_view,
+template <typename TensorView,
           typename TileLengths, // tuple<...>
           typename DoPads>      // sequence<bool, bool, ...>
 CK_TILE_HOST_DEVICE constexpr auto
-pad_tensor_view(const tensor_view& tensor_view, const TileLengths& tile_lengths, DoPads)
+pad_tensor_view(const TensorView& tensor_view, const TileLengths& tile_lengths, DoPads)
 {
     constexpr index_t num_dim = DoPads::size();
 
-    static_assert(num_dim == TileLengths::size() && num_dim == tensor_view::get_num_of_dimension(),
+    static_assert(num_dim == TileLengths::size() && num_dim == TensorView::get_num_of_dimension(),
                   "wrong! inconsistent # of dimensions");
 
     // transforms
diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp
index 8a1a2e4810..38a02acb32 100644
--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -3,12 +3,14 @@
 
 #pragma once
 
+#include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/container/array.hpp"
 #include "ck_tile/core/container/sequence.hpp"
 #include "ck_tile/core/container/tuple.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
@@ -293,7 +295,9 @@ CK_TILE_HOST_DEVICE constexpr auto
                                 &hidden_dim_cnt,
                                 &rh_major_minor_to_hidden_ids,
                                 &rh_major_minor_to_hidden_lengths](auto idim_x) {
-        constexpr auto h_minor_lengths = tuple_element_t<idim_x, HsLengthss>{};
+        // typename HsLengthss::base{}.foo();
+        constexpr auto h_minor_lengths = HsLengthss{}.get(idim_x); //std::tuple_element_t<idim_x, HsLengthss>{};
+        // constexpr auto h_minor_lengths = impl::getv<idim_x>(HsLengthss{});
 
         constexpr index_t ndim_h_minor = h_minor_lengths.size();
 
diff --git a/include/ck_tile/core/tensor/tile_elementwise.hpp b/include/ck_tile/core/tensor/tile_elementwise.hpp
index e2b1f0c385..974cb2ee1e 100644
--- a/include/ck_tile/core/tensor/tile_elementwise.hpp
+++ b/include/ck_tile/core/tensor/tile_elementwise.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/container/tuple.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/null_tensor.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
@@ -18,7 +19,7 @@ namespace ck_tile {
 template <typename InOutElementFunc,
           typename... InOutDstrTensors,
           typename = std::enable_if_t<std::conjunction_v<
-              std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, NullTensor>>...>>>
+              std::negation<std::is_same<std::remove_const_t<InOutDstrTensors>, null_tensor>>...>>>
 CK_TILE_DEVICE void tile_elementwise_inout(const InOutElementFunc& inout_element_func,
                                            InOutDstrTensors&... inout_dstr_tensors)
 {
@@ -26,7 +27,7 @@ CK_TILE_DEVICE void tile_elementwise_inout(const InOutElementFunc& inout_element
     // static_assert(xxx);
 
     constexpr index_t thread_buffer_size =
-        type_pack_element<0, InOutDstrTensors...>::get_thread_buffer_size();
+        __type_pack_element<0, InOutDstrTensors...>::get_thread_buffer_size();
 
     static_for<0, thread_buffer_size, 1>{}(
         [&](auto i) { inout_element_func(inout_dstr_tensors.get_thread_buffer().at(i)...); });
@@ -35,7 +36,7 @@ CK_TILE_DEVICE void tile_elementwise_inout(const InOutElementFunc& inout_element
 template <typename InElementFunc,
           typename... InDstrTensors,
           typename = std::enable_if_t<
-              std::conjunction_v<std::negation<std::is_same<InDstrTensors, NullTensor>>...>>>
+              std::conjunction_v<std::negation<std::is_same<InDstrTensors, null_tensor>>...>>>
 CK_TILE_DEVICE auto tile_elementwise_in(const InElementFunc& in_element_func,
                                         const InDstrTensors&... in_dstr_tensors)
 {
@@ -43,10 +44,10 @@ CK_TILE_DEVICE auto tile_elementwise_in(const InElementFunc& in_element_func,
 
     // TODO: make sure all distributed tensors have same lengths and distribution
     // static_assert(xxx);
-    constexpr auto in_tile_dstr = type_pack_element<0, InDstrTensors...>::get_tile_distribution();
+    constexpr auto in_tile_dstr = __type_pack_element<0, InDstrTensors...>::get_tile_distribution();
 
     constexpr index_t thread_buffer_size =
-        type_pack_element<0, InDstrTensors...>::get_thread_buffer_size();
+        __type_pack_element<0, InDstrTensors...>::get_thread_buffer_size();
 
     auto out_dstr_tensor = make_static_distributed_tensor<OutDataType>(in_tile_dstr);
 
@@ -69,7 +70,7 @@ CK_TILE_DEVICE void set_tile(DstrTensors& dstr_tensor, const T& value)
 }
 
 template <typename T>
-CK_TILE_DEVICE void set_tile(NullTensor&, const T&)
+CK_TILE_DEVICE void set_tile(null_tensor&, const T&)
 {
 }
 
@@ -82,7 +83,7 @@ CK_TILE_DEVICE void set_tile(DstrTensors& dstr_tensor, number<v>)
         DstrTensors::get_thread_buffer_size() * sizeof(typename DstrTensors::DataType);
     if constexpr(v == 0 && tensor_bytes % 4 == 0)
     {
-        using dvec_t = static_buffer_c<index_t, tensor_bytes / 4>;
+        using dvec_t = array<index_t, tensor_bytes / 4>;
         auto& tensor = reinterpret_cast<dvec_t&>(dstr_tensor.get_thread_buffer());
         for(auto i = 0; i < tensor.size(); i++)
             tensor.get(i) = v;
@@ -96,7 +97,7 @@ CK_TILE_DEVICE void set_tile(DstrTensors& dstr_tensor, number<v>)
 }
 
 template <index_t v>
-CK_TILE_DEVICE void set_tile(NullTensor&, number<v>)
+CK_TILE_DEVICE void set_tile(null_tensor&, number<v>)
 {
 }
 
@@ -139,7 +140,7 @@ CK_TILE_DEVICE auto cast_tile_pk_fp8x4(const InDstrTensors& in_dstr_tensors)
             false); // false -> WORD0
 
         constexpr int32_t m0 = 0x05040100;
-        using vec_t          = typename vector_type<OutDataType, 4>::type;
+        using vec_t          = array<OutDataType, 4>;
 
         vec_t d = bit_cast<vec_t>(__builtin_amdgcn_perm(y, x, m0));
         out_dstr_tensor.get_thread_buffer().template set_as<vec_t>(number<i>{}, d);
@@ -157,9 +158,9 @@ CK_TILE_DEVICE auto cast_tile_pk_fp8x4(const InDstrTensors& in_dstr_tensors)
 template <typename DstType, typename SrcDstrTensors>
 CK_TILE_DEVICE auto cast_tile(const SrcDstrTensors& src_tensor)
 {
-    if constexpr((ck_tile::is_same_v<DstType, fp8_t> ||
-                  ck_tile::is_same_v<DstType, bf8_t>)&&ck_tile::
-                     is_same_v<typename SrcDstrTensors::DataType, float> &&
+    if constexpr((std::is_same_v<DstType, fp8_t> ||
+                  std::is_same_v<DstType, bf8_t>)&&std::is_same_v<typename SrcDstrTensors::DataType,
+                                                                  float> &&
                  (SrcDstrTensors::get_thread_buffer_size() % 4 == 0))
     {
         return cast_tile_pk_fp8x4<DstType, SrcDstrTensors>(src_tensor);
@@ -169,23 +170,23 @@ CK_TILE_DEVICE auto cast_tile(const SrcDstrTensors& src_tensor)
                                    src_tensor);
 }
 
-// no-op function for NullTensor arguments
+// no-op function for null_tensor arguments
 template <typename InOutElementFunc,
           typename... MaybeNullTensor,
           typename = std::enable_if_t<
-              std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, NullTensor>...>>>
+              std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
 CK_TILE_DEVICE void tile_elementwise_inout(const InOutElementFunc&, MaybeNullTensor&&...)
 {
 }
 
-// no-op function for NullTensor arguments
+// no-op function for null_tensor arguments
 template <typename InElementFunc,
           typename... MaybeNullTensor,
           typename = std::enable_if_t<
-              std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, NullTensor>...>>>
+              std::disjunction_v<std::is_same<remove_cvref_t<MaybeNullTensor>, null_tensor>...>>>
 CK_TILE_DEVICE auto tile_elementwise_in(const InElementFunc&, MaybeNullTensor&&...)
 {
-    return NullTensor{};
+    return null_tensor{};
 }
 
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index 80e483e6ad..76db527780 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -3,12 +3,15 @@
 
 #pragma once
 
+#include "ck_tile/core/arch/utility.hpp"
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/container/array.hpp"
 #include "ck_tile/core/container/sequence.hpp"
 #include "ck_tile/core/container/tuple.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
 #include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
@@ -85,8 +88,9 @@ struct tile_window_with_static_distribution
         static constexpr index_t ScalarPerVector =
             get_vector_dim_y_scalar_per_vector().template at<1>();
 
-        using vector_type_t = vector_type_maker_t<DataType, ScalarPerVector>;
-        using vector_t      = typename vector_type_t::type;
+        // using vector_type_t = vector_type_maker_t<DataType, ScalarPerVector>;
+        // using vector_t      = typename vector_type_t::type;
+        using vector_t = array<DataType, ScalarPerVector>;
 
         private:
         static constexpr auto scalars_per_access_ = [] {
@@ -275,9 +279,8 @@ struct tile_window_with_static_distribution
     {
         using Traits = load_store_traits;
 
-        using vector_type_t = typename Traits::vector_type_t;
-        using vector_t      = typename vector_type_t::type;
-        using SFC_Ys        = typename Traits::SFC_Ys;
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
 
         constexpr auto tile_dstr = TileDstr{};
 
@@ -300,8 +303,6 @@ struct tile_window_with_static_distribution
                     get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
                         bottom_tensor_thread_coord, bool_constant<oob_conditional_check>{});
 
-                const vector_type_t vec{vec_value};
-
                 // write into distributed tensor
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
                     constexpr auto idx_ys = generate_array(
@@ -315,7 +316,7 @@ struct tile_window_with_static_distribution
                         tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
 
                     dst_tensor.get_thread_buffer().template at<d>() =
-                        vec.template AsType<DataType>()[j];
+                        vec_value.template get_as<DataType>()[j];
                 });
 
                 // move thread coordinate
@@ -341,16 +342,17 @@ struct tile_window_with_static_distribution
     {
         using Traits = load_store_traits;
 
-        using vector_type_t = typename Traits::vector_type_t;
-        using vector_t      = typename vector_type_t::type;
-        using SFC_Ys        = typename Traits::SFC_Ys;
+        // using vector_type_t = typename Traits::vector_type_t;
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
         static constexpr index_t YElementSize =
             TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
         static_assert(YElementSize % Traits::ScalarPerVector == 0);
-        using vectorized_tbuf = StaticBuffer<address_space_enum::vgpr,
-                                             vector_t,
-                                             YElementSize / Traits::ScalarPerVector,
-                                             true>;
+        using vectorized_tbuf = array<vector_t, YElementSize / Traits::ScalarPerVector>;
+        // StaticBuffer<address_space_enum::vgpr,
+        //                                      vector_t,
+        //                                      YElementSize / Traits::ScalarPerVector,
+        //                                      true>;
 
         constexpr auto tile_dstr = TileDstr{};
 
@@ -426,9 +428,9 @@ struct tile_window_with_static_distribution
 
         using Traits = load_store_traits;
 
-        using vector_type_t = typename Traits::vector_type_t;
-        using vector_t      = typename vector_type_t::type;
-        using SFC_Ys        = typename Traits::SFC_Ys;
+        // using vector_type_t = typename Traits::vector_type_t;
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
 
         LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
 
@@ -468,9 +470,9 @@ struct tile_window_with_static_distribution
     {
         using Traits = load_store_traits;
 
-        using vector_type_t = typename Traits::vector_type_t;
-        using vector_t      = typename vector_type_t::type;
-        using SFC_Ys        = typename Traits::SFC_Ys;
+        // using vector_type_t = typename Traits::vector_type_t;
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
 
         constexpr auto tile_dstr = TileDstr{};
 
@@ -487,7 +489,8 @@ struct tile_window_with_static_distribution
                 constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
 
                 // read from distributed tensor
-                vector_type_t vec;
+                // vector_type_t vec;
+                vector_t vec_value;
 
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
                     constexpr auto idx_ys = generate_array(
@@ -500,11 +503,11 @@ struct tile_window_with_static_distribution
                     constexpr index_t d =
                         tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
 
-                    vec.template AsType<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j) =
                         dstr_tensor.get_thread_buffer().template at<d>();
                 });
 
-                const vector_t vec_value = vec.template AsType<vector_t>().template at<0>();
+                // const vector_t vec_value = vec.template get_as<vector_t>().template at<0>();
 
                 // write into bottom tensor
                 get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
@@ -530,9 +533,9 @@ struct tile_window_with_static_distribution
     {
         using Traits = load_store_traits;
 
-        using vector_type_t = typename Traits::vector_type_t;
-        using vector_t      = typename vector_type_t::type;
-        using SFC_Ys        = typename Traits::SFC_Ys;
+        // using vector_type_t = typename Traits::vector_type_t;
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
 
         constexpr auto tile_dstr                    = TileDstr{};
         static constexpr bool oob_conditional_check = true;
@@ -550,7 +553,8 @@ struct tile_window_with_static_distribution
                 constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
 
                 // read from distributed tensor
-                vector_type_t vec;
+                // vector_type_t vec;
+                vector_t vec_value;
 
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
                     constexpr auto idx_ys = generate_array(
@@ -563,11 +567,11 @@ struct tile_window_with_static_distribution
                     constexpr index_t d =
                         tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
 
-                    vec.template AsType<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j) =
                         dstr_tensor.get_thread_buffer().template at<d>();
                 });
 
-                const vector_t vec_value = vec.template AsType<vector_t>().template at<0>();
+                // const vector_t vec_value = vec.template get_as<vector_t>().template at<0>();
 
                 // write into bottom tensor
                 get_bottom_tensor_view()
diff --git a/include/ck_tile/core/utility/functional.hpp b/include/ck_tile/core/utility/functional.hpp
index c246c9c456..2cdce94063 100644
--- a/include/ck_tile/core/utility/functional.hpp
+++ b/include/ck_tile/core/utility/functional.hpp
@@ -191,4 +191,18 @@ CK_TILE_HOST_DEVICE constexpr auto unpack2(F&& f, X&& x, Y&& y)
         std::forward<F>(f), std::forward<X>(x), std::forward<Y>(y));
 }
 
+// z = predicate ? x : y
+template <bool predicate, typename X, typename Y>
+constexpr auto conditional_expr(X&& x, Y&& y)
+{
+    if constexpr(predicate)
+    {
+        return std::forward<X>(x);
+    }
+    else
+    {
+        return std::forward<Y>(y);
+    }
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/utility/to_sequence.hpp b/include/ck_tile/core/utility/to_sequence.hpp
index 1d2c73073d..4db6cfd4a0 100644
--- a/include/ck_tile/core/utility/to_sequence.hpp
+++ b/include/ck_tile/core/utility/to_sequence.hpp
@@ -9,12 +9,13 @@
 // clang happen to support this feature (__cpp_generic_lambdas >= 201707) in c++17 mode
 #define TO_SEQUENCE(a, n)                                                                      \
     _Pragma("clang diagnostic push")                                                           \
-        _Pragma("clang diagnostic ignored \"-Wc++20-extensions\"")[a]<ck_tile::index_t... Is>( \
-            ck_tile::sequence<Is...>)                                                          \
+    _Pragma("clang diagnostic ignored \"-Wc++20-extensions\"")                                 \
+        [a]<ck_tile::index_t... IDX_IDX_>(ck_tile::sequence<IDX_IDX_...>)                                  \
     {                                                                                          \
-        return ck_tile::sequence<a.at(ck_tile::number<Is>{})...>{};                            \
+        return ck_tile::sequence<a.at(ck_tile::number<IDX_IDX_>{})...>{};           \
     }                                                                                          \
-    (make_index_sequence<n>{}) _Pragma("clang diagnostic pop")
+    (ck_tile::make_index_sequence<n>{});                                                       \
+    _Pragma("clang diagnostic pop")
 
 #else
 // Macro function
diff --git a/include/ck_tile/core/utility/transpose_vectors.hpp b/include/ck_tile/core/utility/transpose_vectors.hpp
new file mode 100644
index 0000000000..acd5dd7b1d
--- /dev/null
+++ b/include/ck_tile/core/utility/transpose_vectors.hpp
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/container/array.hpp"
+#include "ck_tile/core/utility/bit_cast.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+
+namespace ck_tile {
+
+// S: scalar type (or it can be non-scalar type)
+// NX: # of vector before transpose
+// NY: # of vector after transpose
+// we got [NX, NY] amount of S data to be transposed into [NY, NX] amount of S data
+template <typename S_, index_t NX, index_t NY>
+struct transpose_vectors
+{
+    static constexpr index_t s_per_x = NY;
+    static constexpr index_t s_per_y = NX;
+
+    using S = remove_cvref_t<S_>;
+
+    using VX = array<S, s_per_x>;
+    using VY = array<S, s_per_y>;
+
+    CK_TILE_DEVICE void operator()(const array<VX, NX>& vx_tuple, array<VY, NY>& vy_tuple)
+    {
+        constexpr auto I1 = number<1>{};
+        constexpr auto I2 = number<2>{};
+        constexpr auto I3 = number<3>{};
+        constexpr auto I4 = number<4>{};
+
+        if constexpr(sizeof(S) == 2)
+        {
+            static_assert((NX % 2 == 0 && NY % 2 == 0), "wrong!");
+
+            using S2 = array<S, 2>; // typename array<S, 2>::type;
+
+            // loop over 2x2 tile and transpose data from vx_tuple into vy_tuple
+            static_for<0, NY, 2>{}([&](auto iy) {
+                static_for<0, NX, 2>{}([&](auto ix) {
+                    // 2 16bitx2 data from vx_tuple to be transposed
+                    const int32_t x_s2_0 =
+                        bit_cast<int32_t>(vx_tuple[ix].template get_as<S2>()[iy / I2]);
+                    const int32_t x_s2_1 =
+                        bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S2>()[iy / I2]);
+
+                    constexpr int32_t m0 = 0x05040100;
+                    constexpr int32_t m1 = 0x07060302;
+
+                    // transpose 2x2 16bit
+                    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
+                    //                   -- -- -- --     -- -- -- --      -  -  -  -
+                    //             index  7  6  5  4      3  2  1  0     33 77 44 88
+                    // index is reversed because of little endianness (least significant bits first)
+                    const int32_t y_s2_0 = __builtin_amdgcn_perm(x_s2_1, x_s2_0, m0);
+                    const int32_t y_s2_1 = __builtin_amdgcn_perm(x_s2_1, x_s2_0, m1);
+
+                    // 2 16bitx2 data after transposed
+                    vy_tuple(iy).template get_as<S2>()(ix / I2)      = bit_cast<S2>(y_s2_0);
+                    vy_tuple(iy + I1).template get_as<S2>()(ix / I2) = bit_cast<S2>(y_s2_1);
+                });
+            });
+        }
+        else if constexpr(sizeof(S) == 1)
+        {
+            static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+
+            using S4 = array<S, 4>; // typename array<S, 4>::type;
+
+            // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
+            static_for<0, NY, 4>{}([&](auto iy) {
+                static_for<0, NX, 4>{}([&](auto ix) {
+                    // 4 int8x4 data from vx_tuple
+                    const int32_t x_s4_0 =
+                        bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
+                    const int32_t x_s4_1 =
+                        bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
+                    const int32_t x_s4_2 =
+                        bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
+                    const int32_t x_s4_3 =
+                        bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);
+
+                    // transpose
+                    int32_t t_s4_0, t_s4_1;
+                    int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;
+
+                    constexpr int32_t m0 = 0x05010400;
+                    constexpr int32_t m1 = 0x05040100;
+                    constexpr int32_t m2 = 0x07060302;
+                    constexpr int32_t m3 = 0x07030602;
+
+                    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
+                    //                   -- -- -- --     -- -- -- --      -  -  -  -
+                    //             index  7  6  5  4      3  2  1  0     33 77 44 88
+                    // index is reversed because of little endianness (least significant bits first)
+                    t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
+                    t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
+                    y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
+                    y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
+                    t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
+                    t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
+                    y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
+                    y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
+
+                    // 4 int8x4 data from vy_tuple
+                    vy_tuple(iy).template get_as<S4>()(ix / I4)      = bit_cast<S4>(y_s4_0);
+                    vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
+                    vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
+                    vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
+                });
+            });
+        }
+        else
+        {
+            static_assert(false, "not implemented");
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/utility/type_convert.hpp b/include/ck_tile/core/utility/type_convert.hpp
deleted file mode 100644
index 4bc3393fd9..0000000000
--- a/include/ck_tile/core/utility/type_convert.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <stdint.h>
-#include <tuple>
-#include <type_traits>
-#include "ck_tile/core/config.hpp"
-#include "ck_tile/core/numeric/half.hpp"
-#include "ck_tile/core/numeric/bfloat16.hpp"
-#include "ck_tile/core/numeric/float8.hpp"
-
-namespace ck_tile {
-
-// Convert X to Y, both X and Y are non-const data types.
-template <typename Y,
-          typename X,
-          std::enable_if_t<!(std::is_const_v<Y> || std::is_const_v<X>), bool> = false>
-CK_TILE_HOST_DEVICE constexpr Y type_convert(X x)
-{
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
-
-    return static_cast<Y>(x);
-}
-
-// Convert X to Y, either X or Y is a const data type.
-template <typename Y,
-          typename X,
-          std::enable_if_t<std::is_const_v<Y> || std::is_const_v<X>, bool> = false>
-CK_TILE_HOST_DEVICE constexpr Y type_convert(X x)
-{
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
-
-    using non_const_y = std::remove_const_t<Y>;
-    using non_const_x = std::remove_const_t<X>;
-    return static_cast<Y>(type_convert<non_const_y, non_const_x>(x));
-}
-
-#define CK_TILE_TYPE_CONVERT(dtype_, stype_)                                           \
-    template <>                                                                        \
-    inline CK_TILE_HOST_DEVICE constexpr dtype_ type_convert<dtype_, stype_>(stype_ x) \
-    {                                                                                  \
-        return stype_##_to_##dtype_(x);                                                \
-    }
-
-CK_TILE_TYPE_CONVERT(float, fp16_t)
-CK_TILE_TYPE_CONVERT(float, bf16_t)
-CK_TILE_TYPE_CONVERT(float, fp8_t)
-CK_TILE_TYPE_CONVERT(float, bf8_t)
-
-CK_TILE_TYPE_CONVERT(fp16_t, float)
-CK_TILE_TYPE_CONVERT(bf16_t, float)
-CK_TILE_TYPE_CONVERT(fp8_t, float)
-CK_TILE_TYPE_CONVERT(bf8_t, float)
-
-} // namespace ck_tile
diff --git a/include/ck_tile/core/utility/type_traits.hpp b/include/ck_tile/core/utility/type_traits.hpp
index 7cdc3d2a28..8b8b01a2ac 100644
--- a/include/ck_tile/core/utility/type_traits.hpp
+++ b/include/ck_tile/core/utility/type_traits.hpp
@@ -69,4 +69,18 @@ struct nonesuch
 template <template <class...> class Op, class... Args>
 using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t;
 
+// FIXME: do we need this anymore?
+template <
+    typename PY,
+    typename PX,
+    typename std::enable_if<std::is_pointer_v<PY> && std::is_pointer_v<PX>, bool>::type = false>
+CK_TILE_HOST_DEVICE PY c_style_pointer_cast(PX p_x)
+{
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wcast-align"
+    return (PY)p_x; // NOLINT(old-style-cast, cast-align)
+#pragma clang diagnostic pop
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/host/check_err.hpp b/include/ck_tile/host/check_err.hpp
index 5d39548602..38a345f311 100644
--- a/include/ck_tile/host/check_err.hpp
+++ b/include/ck_tile/host/check_err.hpp
@@ -97,7 +97,7 @@ check_err(const Range& out,
 template <typename Range, typename RefRange>
 typename std::enable_if<
     std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
-        std::is_same_v<ranges::range_value_t<Range>, bhalf_t>,
+        std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
     bool>::type
 check_err(const Range& out,
           const RefRange& ref,
@@ -123,7 +123,7 @@ check_err(const Range& out,
     bool res{true};
     int err_count = 0;
     double err    = 0;
-    // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
+    // TODO: This is a hack. We should have proper specialization for bf16_t data type.
     double max_err = std::numeric_limits<float>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
@@ -214,7 +214,7 @@ check_err(const Range& out,
 template <typename Range, typename RefRange>
 std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                   std::is_integral_v<ranges::range_value_t<Range>> &&
-                  !std::is_same_v<ranges::range_value_t<Range>, bhalf_t>)
+                  !std::is_same_v<ranges::range_value_t<Range>, bf16_t>)
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
                      || std::is_same_v<ranges::range_value_t<Range>, int4_t>
 #endif
diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp
index 00f62de934..bba96822b1 100644
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -147,8 +147,8 @@ float launch_and_time_kernel_with_preprocess(const stream_config& s,
 #endif
 }
 
-template <int MaxThreadPerBlock = CK_MAX_THREAD_PER_BLOCK,
-          int MinBlockPerCu     = CK_MIN_BLOCK_PER_CU,
+template <int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK,
+          int MinBlockPerCu     = CK_TILE_MIN_BLOCK_PER_CU,
           typename KernelImpl,
           typename... Args>
 float launch_kernel(const stream_config& s,
diff --git a/include/ck_tile/ops/common/README.md b/include/ck_tile/ops/common/README.md
new file mode 100644
index 0000000000..bc5b0be599
--- /dev/null
+++ b/include/ck_tile/ops/common/README.md
@@ -0,0 +1,4 @@
+## common
+this folder is designed not to be included directly by use, e.g. if use include `ck_tile/ops/fmha.hpp`, then everything under `common` should also be included.
+
+to achieve this we will duplicate the header include path under `common` to other module under `ops/*` inside remod.py. for internal developer, you can also include `ck_tile/ops/common.hpp` for convenience. (and so does external users...)
diff --git a/include/ck_tile/ops/epilogue.hpp b/include/ck_tile/ops/epilogue.hpp
index 497f6d1504..ab399dbf7a 100644
--- a/include/ck_tile/ops/epilogue.hpp
+++ b/include/ck_tile/ops/epilogue.hpp
@@ -4,4 +4,5 @@
 #pragma once
 
 #include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
 
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index 6813a7a971..f886d470d5 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -17,4 +17,5 @@
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp"
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
 
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
index 9685bd2da5..fff48d532a 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
 #include <string>
 #include <type_traits>
 
@@ -50,7 +51,7 @@ struct FmhaFwdKernel
     template <typename T> struct t2s;
     template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
     template <> struct t2s<ck_tile::half_t> { static constexpr const char * name = "fp16"; };
-    template <> struct t2s<ck_tile::bhalf_t> { static constexpr const char * name = "bf16"; };
+    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
     template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
     template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
     // clang-format on
@@ -79,7 +80,7 @@ struct FmhaFwdKernel
             "r" + _TS_(gbr::at(ck_tile::number<0>{})) + "x" + _TS_(gbr::at(ck_tile::number<1>{})) + "x" + _TS_(gbr::at(ck_tile::number<2>{})) + "_" + 
             "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
-            "v" + (ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
+            "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
             (kHasBias ? "_bias" : "") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kStoreLSE ? "_lse" : "" );
         #undef _SS_
         #undef _TS_
@@ -407,7 +408,7 @@ struct FmhaFwdKernel
 
             batch_offset_q = query_start * kargs.stride_q;
             batch_offset_k = key_start * kargs.stride_k;
-            if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             {
                 batch_offset_v = key_start * kargs.stride_v;
             }
@@ -519,7 +520,7 @@ struct FmhaFwdKernel
                 sequence<kPadSeqLenK, kPadHeadDimQ>{});
         }();
         const auto v_dram = [&]() {
-            if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             {
                 const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                     v_ptr,
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
index 5a1f1b0520..90bd69956c 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
@@ -49,12 +49,12 @@ struct BlockFmhaPipelineProblem
     static constexpr bool kStoreLSE      = Traits::kStoreLSE;
     static constexpr index_t kBlockPerCu = Traits::kBlockPerCu;
     static constexpr bool kIsFp8 =
-        (is_same_v<QDataType, fp8_t> || is_same_v<QDataType, bf8_t>)&&(
-            is_same_v<KDataType, fp8_t> ||
-            is_same_v<KDataType, bf8_t>)&&(is_same_v<VDataType, fp8_t> ||
-                                           is_same_v<VDataType, bf8_t>)&&is_same_v<SaccDataType,
-                                                                                   float> &&
-        is_same_v<OaccDataType, float>;
+        (std::is_same_v<QDataType, fp8_t> || std::is_same_v<QDataType, bf8_t>)&&(
+            std::is_same_v<KDataType, fp8_t> ||
+            std::is_same_v<KDataType, bf8_t>)&&(std::is_same_v<VDataType, fp8_t> ||
+                                                std::is_same_v<VDataType, bf8_t>)&&std::
+            is_same_v<SaccDataType, float> &&
+        std::is_same_v<OaccDataType, float>;
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
index f8294a4fdb..1a959e2d74 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
@@ -56,7 +56,7 @@ struct BlockFmhaPipelineQRKSVS
     static constexpr index_t kAlignmentK =
         kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
     static constexpr index_t kAlignmentV = []() {
-        if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             return kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
         else
             return kPadSeqLenK ? 1 : Policy::template GetAlignmentV<Problem>();
@@ -127,9 +127,9 @@ struct BlockFmhaPipelineQRKSVS
                void* smem_ptr) const
     {
         static_assert(
-            is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
-                is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
-                is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
             "wrong!");
 
         static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
@@ -442,7 +442,7 @@ struct BlockFmhaPipelineQRKSVS
             });
 
             block_sync_lds();
-            if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             {
                 auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                     Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
@@ -471,8 +471,7 @@ struct BlockFmhaPipelineQRKSVS
                                p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
                            v_lds_window);
                     block_sync_lds();
-                    if constexpr(ck_tile::is_same_v<VLayout,
-                                                    ck_tile::tensor_layout::gemm::RowMajor>)
+                    if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
                     {
                         auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                             Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
index 2b31c50373..c138bca883 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async_default_policy.hpp"
 
 namespace ck_tile {
@@ -59,7 +59,7 @@ struct BlockFmhaPipelineQRKSVSAsync
     static constexpr index_t kAlignmentQ = Policy::template GetAlignmentQ<Problem>();
     static constexpr index_t kAlignmentK = Policy::template GetAlignmentK<Problem>();
     static constexpr index_t kAlignmentV = []() {
-        if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             return Policy::template GetAlignmentV<Problem>();
         else
             return kPadSeqLenK ? 1 : Policy::template GetAlignmentV<Problem>();
@@ -138,9 +138,9 @@ struct BlockFmhaPipelineQRKSVSAsync
                void* smem_ptr) const
     {
         static_assert(
-            is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
-                is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
-                is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
             "wrong!");
 
         static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
@@ -415,7 +415,7 @@ struct BlockFmhaPipelineQRKSVSAsync
 
             __builtin_amdgcn_sched_barrier(0x7F);
             // store & prefetch next v, after the max reduction
-            if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             {
                 auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                     Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
@@ -539,8 +539,7 @@ struct BlockFmhaPipelineQRKSVSAsync
                                sequence<(LdsSeq.at(number<k0_loops + i_k1>{})) * kN1, 0>{},
                                sequence<(LdsSeq.at(number<k0_loops + i_k1>{}) + 1) * kN1, kK1>{}));
 
-                    if constexpr(ck_tile::is_same_v<VLayout,
-                                                    ck_tile::tensor_layout::gemm::RowMajor>)
+                    if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
                     {
                         auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                             Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
index 433873d241..c9290f7ac9 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
@@ -56,7 +56,7 @@ struct BlockFmhaPipelineQRKSVSFp8
     static constexpr index_t kAlignmentK =
         kPadHeadDimQ ? 1 : Policy::template GetAlignmentK<Problem>();
     static constexpr index_t kAlignmentV = []() {
-        if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             return kPadHeadDimV ? 1 : Policy::template GetAlignmentV<Problem>();
         else
             return kPadSeqLenK ? 1 : Policy::template GetAlignmentV<Problem>();
@@ -119,9 +119,9 @@ struct BlockFmhaPipelineQRKSVSFp8
                void* smem_ptr) const
     {
         static_assert(
-            is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
-                is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
-                is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
             "wrong!");
 
         static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
@@ -425,7 +425,7 @@ struct BlockFmhaPipelineQRKSVSFp8
             });
 
             block_sync_lds();
-            if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             {
                 auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                     Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
@@ -453,8 +453,7 @@ struct BlockFmhaPipelineQRKSVSFp8
                                p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
                            v_lds_window);
                     block_sync_lds();
-                    if constexpr(ck_tile::is_same_v<VLayout,
-                                                    ck_tile::tensor_layout::gemm::RowMajor>)
+                    if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
                     {
                         auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                             Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
index 5b74861133..1cff2a24c3 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
@@ -114,9 +114,9 @@ struct BlockFmhaPipelineQSKSVS
                void* smem_ptr) const
     {
         static_assert(
-            is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
-                is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
-                is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
+            std::is_same_v<QDataType, remove_cvref_t<typename QDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<KDataType, remove_cvref_t<typename KDramBlockWindowTmp::DataType>> &&
+                std::is_same_v<VDataType, remove_cvref_t<typename VDramBlockWindowTmp::DataType>>,
             "wrong!");
 
         static_assert(kM0 == QDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
@@ -434,7 +434,7 @@ struct BlockFmhaPipelineQSKSVS
             });
 
             block_sync_lds();
-            if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
             {
                 auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                     Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
@@ -463,8 +463,7 @@ struct BlockFmhaPipelineQSKSVS
                                p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
                            v_lds_window);
                     block_sync_lds();
-                    if constexpr(ck_tile::is_same_v<VLayout,
-                                                    ck_tile::tensor_layout::gemm::RowMajor>)
+                    if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
                     {
                         auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
                             Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index 9630bd9251..ed30642ff3 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -4,7 +4,11 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/gemm/pipeline/block_gemm_pipeline_problem.hpp"
+#include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 
 // TODO: remove this
 #define K_LDS_LOAD_USE_OFFSET_TRANSFORM 0
@@ -76,24 +80,24 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
                                                    Problem::BlockFmhaShape::kK0>>;
 
         constexpr auto warp_gemm = []() {
-            if constexpr(is_same_v<typename Problem::QDataType, half_t> &&
-                         is_same_v<typename Problem::KDataType, half_t> &&
-                         is_same_v<typename Problem::SaccDataType, float>)
+            if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
+                         std::is_same_v<typename Problem::KDataType, half_t> &&
+                         std::is_same_v<typename Problem::SaccDataType, float>)
             {
-                return warp::WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution{};
+                return WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution{};
             }
-            else if constexpr(is_same_v<typename Problem::QDataType, bhalf_t> &&
-                              is_same_v<typename Problem::KDataType, bhalf_t> &&
-                              is_same_v<typename Problem::SaccDataType, float>)
+            else if constexpr(std::is_same_v<typename Problem::QDataType, bf16_t> &&
+                              std::is_same_v<typename Problem::KDataType, bf16_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
             {
-                return warp::WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution{};
+                return WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution{};
             }
             else if constexpr(Problem::kIsFp8)
             {
                 constexpr index_t swizzle_factor = 4; // TODO: hard coded here
-                return warp::WarpGemmImpl<
-                    warp::WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-                        warp::WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<
+                return WarpGemmImpl<
+                    WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+                        WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<
                             typename Problem::QDataType,
                             typename Problem::KDataType>,
                         2,
@@ -201,24 +205,24 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ false>
                                                    Problem::BlockFmhaShape::kK0>>;
 
         constexpr auto warp_gemm = []() {
-            if constexpr(is_same_v<typename Problem::QDataType, half_t> &&
-                         is_same_v<typename Problem::KDataType, half_t> &&
-                         is_same_v<typename Problem::SaccDataType, float>)
+            if constexpr(std::is_same_v<typename Problem::QDataType, half_t> &&
+                         std::is_same_v<typename Problem::KDataType, half_t> &&
+                         std::is_same_v<typename Problem::SaccDataType, float>)
             {
-                return warp::WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution{};
+                return WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution{};
             }
-            else if constexpr(is_same_v<typename Problem::QDataType, bhalf_t> &&
-                              is_same_v<typename Problem::KDataType, bhalf_t> &&
-                              is_same_v<typename Problem::SaccDataType, float>)
+            else if constexpr(std::is_same_v<typename Problem::QDataType, bf16_t> &&
+                              std::is_same_v<typename Problem::KDataType, bf16_t> &&
+                              std::is_same_v<typename Problem::SaccDataType, float>)
             {
-                return warp::WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution{};
+                return WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution{};
             }
             else if constexpr(Problem::kIsFp8)
             {
                 constexpr index_t swizzle_factor = 4; // TODO: hard coded here
-                return warp::WarpGemmImpl<
-                    warp::WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
-                        warp::WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<
+                return WarpGemmImpl<
+                    WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
+                        WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<
                             typename Problem::QDataType,
                             typename Problem::KDataType>,
                         2,
@@ -337,7 +341,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     {
         using VLayout   = remove_cvref_t<typename Problem::BlockFmhaShape::VLayout>;
         using VDataType = remove_cvref_t<typename Problem::VDataType>;
-        if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
             constexpr index_t kBlockSize   = Problem::kBlockSize;
             constexpr index_t kNPerBlock   = Problem::BlockFmhaShape::kN1;
@@ -762,7 +766,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN1;
         constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
 
-        if constexpr(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
             constexpr index_t N1 = GetAlignmentV<Problem>();
             constexpr index_t N0 = kNPerBlock / N1; // P
@@ -857,7 +861,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     {
         // This descriptor only used when V layout is seqlen * hdim
         using VLayout = remove_cvref_t<typename Problem::BlockFmhaShape::VLayout>;
-        static_assert(ck_tile::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>);
+        static_assert(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>);
         constexpr index_t kBlockSize = Problem::kBlockSize;
         constexpr index_t kNPerBlock = Problem::BlockFmhaShape::kN1;
         constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK1;
@@ -914,15 +918,15 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         auto warp_gemm = [&]() {
             if constexpr(Problem::kIsFp8)
             {
-                return warp::WarpGemmImpl<
-                    warp::WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
-                        warp::WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<
+                return WarpGemmImpl<
+                    WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
+                        WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<
                             typename Problem::PDataType,
                             typename Problem::VDataType>,
                         2>>{};
                 // return
-                // warp::WarpGemmImpl<warp::WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
-                //         warp::WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<typename
+                // WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution_SwizzleB<
+                //         WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<typename
                 //         Problem::PDataType, typename Problem::VDataType>>>{};
             }
             else
diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp
index 857305964f..c7ebcf9606 100644
--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -28,3 +28,4 @@
 #include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp
index 5d900d6f7c..c6eafd238b 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp
@@ -44,9 +44,9 @@ struct BlockGemmARegBGmemCRegV1
                                    void* smem_ptr) const
     {
         static_assert(
-            is_same_v<ADataType, remove_cv_t<typename ABlockTensor::DataType>> &&
-                is_same_v<BDataType, remove_cv_t<typename BBlockGmemWindowTmp::DataType>> &&
-                is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
+            std::is_same_v<ADataType, remove_cv_t<typename ABlockTensor::DataType>> &&
+                std::is_same_v<BDataType, remove_cv_t<typename BBlockGmemWindowTmp::DataType>> &&
+                std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
             "wrong!");
 
         constexpr index_t MPerBlock = ABlockTensor{}.get_lengths()[number<0>{}];
@@ -90,9 +90,10 @@ struct BlockGemmARegBGmemCRegV1
                                    const BBlockGmemWindowTmp& b_block_gmem_window_tmp,
                                    void* smem_ptr) const
     {
-        static_assert(is_same_v<ADataType, remove_cv_t<typename ABlockTensor::DataType>> &&
-                          is_same_v<BDataType, remove_cv_t<typename BBlockGmemWindowTmp::DataType>>,
-                      "wrong!");
+        static_assert(
+            std::is_same_v<ADataType, remove_cv_t<typename ABlockTensor::DataType>> &&
+                std::is_same_v<BDataType, remove_cv_t<typename BBlockGmemWindowTmp::DataType>>,
+            "wrong!");
 
         constexpr index_t MPerBlock = ABlockTensor{}.get_lengths()[number<0>{}];
         constexpr index_t NPerBlock = BBlockGmemWindowTmp{}.get_window_lengths()[number<0>{}];
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp
index 9e1f529426..dc24204895 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp
@@ -28,10 +28,11 @@ struct BlockGemmARegBSmemCRegV1
                                    const ABlockTensorTmp& a_block_tensor_tmp,
                                    const BBlockWindowTmp& b_block_window_tmp) const
     {
-        static_assert(is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
-                          is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>> &&
-                          is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
-                      "wrong!");
+        static_assert(
+            std::is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>> &&
+                std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
+            "wrong!");
 
         constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}];
         constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
@@ -126,9 +127,9 @@ struct BlockGemmARegBSmemCRegV1
 
         // check C-block-distribution
         static_assert(
-            is_same_v<remove_cvref_t<decltype(c_block_dstr_encode)>,
-                      remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
-                                                  .get_static_tile_distribution_encoding())>>,
+            std::is_same_v<remove_cvref_t<decltype(c_block_dstr_encode)>,
+                           remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
+                                                       .get_static_tile_distribution_encoding())>>,
             "wrong!");
 
         using AWarpDstr = typename WG::AWarpDstr;
@@ -184,9 +185,10 @@ struct BlockGemmARegBSmemCRegV1
     CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp,
                                    const BBlockWindowTmp& b_block_window_tmp) const
     {
-        static_assert(is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
-                          is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>>,
-                      "wrong!");
+        static_assert(
+            std::is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>>,
+            "wrong!");
 
         constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}];
         constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp
index d19d167cfc..22fc5a92f1 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp
@@ -14,9 +14,9 @@ struct BlockGemmARegBSmemCRegV1DefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
     {
-        if constexpr(is_same_v<typename Problem::ADataType, half_t> &&
-                     is_same_v<typename Problem::BDataType, half_t> &&
-                     is_same_v<typename Problem::CDataType, float>)
+        if constexpr(std::is_same_v<typename Problem::ADataType, half_t> &&
+                     std::is_same_v<typename Problem::BDataType, half_t> &&
+                     std::is_same_v<typename Problem::CDataType, float>)
         {
 #if 0
             constexpr index_t kBlockSize = Problem::kBlockSize;
@@ -43,9 +43,9 @@ struct BlockGemmARegBSmemCRegV1DefaultPolicy
             return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
 #endif
         }
-        else if constexpr(is_same_v<typename Problem::ADataType, bhalf_t> &&
-                          is_same_v<typename Problem::BDataType, bhalf_t> &&
-                          is_same_v<typename Problem::CDataType, float>)
+        else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
+                          std::is_same_v<typename Problem::BDataType, bf16_t> &&
+                          std::is_same_v<typename Problem::CDataType, float>)
         {
             return make_tuple(WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution{}, 4, 1);
         }
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
index c03379cc83..32ef2b51de 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp
@@ -28,10 +28,11 @@ struct BlockGemmARegBSmemCRegV2
                                    const ABlockTensorTmp& a_block_tensor_tmp,
                                    const BBlockWindowTmp& b_block_window_tmp) const
     {
-        static_assert(is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
-                          is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>> &&
-                          is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
-                      "wrong!");
+        static_assert(
+            std::is_same_v<ADataType, remove_cv_t<typename ABlockTensorTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cv_t<typename BBlockWindowTmp::DataType>> &&
+                std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
+            "wrong!");
 
         constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}];
         constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
@@ -126,9 +127,9 @@ struct BlockGemmARegBSmemCRegV2
 
         // check C-block-distribution
         static_assert(
-            is_same_v<remove_cvref_t<decltype(c_block_dstr_encode)>,
-                      remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
-                                                  .get_static_tile_distribution_encoding())>>,
+            std::is_same_v<remove_cvref_t<decltype(c_block_dstr_encode)>,
+                           remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
+                                                       .get_static_tile_distribution_encoding())>>,
             "wrong!");
 
         using AWarpDstr = typename WG::AWarpDstr;
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
index de93c7dad4..e5156a5970 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
@@ -28,9 +28,9 @@ struct BlockGemmASmemBSmemCRegV1
                                    const ABlockWindowTmp& a_block_window_tmp,
                                    const BBlockWindowTmp& b_block_window_tmp) const
     {
-        static_assert(is_same_v<ADataType, typename ABlockWindowTmp::DataType> &&
-                          is_same_v<BDataType, typename BBlockWindowTmp::DataType> &&
-                          is_same_v<CDataType, typename CBlockTensor::DataType>,
+        static_assert(std::is_same_v<ADataType, typename ABlockWindowTmp::DataType> &&
+                          std::is_same_v<BDataType, typename BBlockWindowTmp::DataType> &&
+                          std::is_same_v<CDataType, typename CBlockTensor::DataType>,
                       "wrong!");
 
         constexpr index_t MPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<0>{}];
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
index 60a25549ed..87d7ce040c 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
@@ -14,9 +14,9 @@ struct BlockGemmASmemBSmemCRegV1DefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
     {
-        if constexpr(is_same_v<typename Problem::ADataType, half_t> &&
-                     is_same_v<typename Problem::BDataType, half_t> &&
-                     is_same_v<typename Problem::CDataType, float>)
+        if constexpr(std::is_same_v<typename Problem::ADataType, half_t> &&
+                     std::is_same_v<typename Problem::BDataType, half_t> &&
+                     std::is_same_v<typename Problem::CDataType, float>)
         {
 #if 0
             constexpr index_t kBlockSize = Problem::kBlockSize;
@@ -42,9 +42,9 @@ struct BlockGemmASmemBSmemCRegV1DefaultPolicy
             return make_tuple(WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution{}, 4, 1);
 #endif
         }
-        else if constexpr(is_same_v<typename Problem::ADataType, bhalf_t> &&
-                          is_same_v<typename Problem::BDataType, bhalf_t> &&
-                          is_same_v<typename Problem::CDataType, float>)
+        else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
+                          std::is_same_v<typename Problem::BDataType, bf16_t> &&
+                          std::is_same_v<typename Problem::CDataType, float>)
         {
             return make_tuple(WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution{}, 4, 1);
         }
diff --git a/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp
index f916cc31f8..29b03e2828 100644
--- a/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -47,8 +47,8 @@ struct BlockGemmPipelineAGmemBGmemCRegV1
                                         void* p_smem) const
     {
         static_assert(
-            is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
-                is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
             "wrong!");
 
         static_assert(kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
diff --git a/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp
index 3d1b9d4596..ef0611dd60 100644
--- a/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/block_gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -47,8 +47,8 @@ struct BlockGemmPipelineAGmemBGmemCRegV2
                                         void* p_smem) const
     {
         static_assert(
-            is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
-                is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
             "wrong!");
 
         static_assert(kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
index 705af423c4..e25dc98bbd 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -4,6 +4,8 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
 
 namespace ck_tile {
 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
index f7d048a015..79ed1b6e15 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
 
 namespace ck_tile {
 
@@ -74,8 +75,8 @@ struct WarpGemmAtrributeMfmaIterateK
     using BDataType = typename Impl::BDataType;
     using CDataType = typename Impl::CDataType;
 
-    using AVecType = typename vector_type_maker<typename Impl::AVecType, kKIter>::type::type;
-    using BVecType = typename vector_type_maker<typename Impl::BVecType, kKIter>::type::type;
+    using AVecType = array<ADataType, Impl::AVecType::size() * kKIter>;
+    using BVecType = array<BDataType, Impl::BVecType::size() * kKIter>;
     using CVecType = typename Impl::CVecType;
 
     static constexpr index_t kM = Impl::kM;
@@ -111,33 +112,27 @@ struct WarpGemmAtrributeMfmaIterateK
     CK_TILE_DEVICE void
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
-        const auto a_vector = typename vector_type_maker<AVecType, 1>::type{a_vec};
-        const auto b_vector = typename vector_type_maker<BVecType, 1>::type{b_vec};
-
         static_for<0, kKIter, 1>{}([&](auto iKIter) {
             Impl{}(c_vec,
-                   a_vector.template AsType<typename Impl::AVecType>()[iKIter],
-                   b_vector.template AsType<typename Impl::BVecType>()[iKIter]);
+                   a_vec.template get_as<typename Impl::AVecType>()[iKIter],
+                   b_vec.template get_as<typename Impl::BVecType>()[iKIter]);
         });
     }
 
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        const auto a_vector = typename vector_type_maker<AVecType, 1>::type{a_vec};
-        const auto b_vector = typename vector_type_maker<BVecType, 1>::type{b_vec};
-
         constexpr auto I0 = number<0>{};
 
         // c = a * b
-        auto c_vec = Impl{}(a_vector.template AsType<typename Impl::AVecType>()[I0],
-                            b_vector.template AsType<typename Impl::BVecType>()[I0]);
+        auto c_vec = Impl{}(a_vec.template get_as<typename Impl::AVecType>()[I0],
+                            b_vec.template get_as<typename Impl::BVecType>()[I0]);
 
         // c += a * b
         static_for<1, kKIter, 1>{}([&](auto iKIter) {
             Impl{}(c_vec,
-                   a_vector.template AsType<typename Impl::AVecType>()[iKIter],
-                   b_vector.template AsType<typename Impl::BVecType>()[iKIter]);
+                   a_vec.template get_as<typename Impl::AVecType>()[iKIter],
+                   b_vec.template get_as<typename Impl::BVecType>()[iKIter]);
         });
 
         return c_vec;
@@ -274,8 +269,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
     using BDataType = typename Impl::ADataType;
     using CDataType = typename Impl::CDataType;
 
-    using AVecType = typename vector_type_maker<typename Impl::BVecType, kKIter>::type::type;
-    using BVecType = typename vector_type_maker<typename Impl::AVecType, kKIter>::type::type;
+    using AVecType = array<ADataType, Impl::AVecType::size() * kKIter>;
+    using BVecType = array<BDataType, Impl::BVecType::size() * kKIter>;
     using CVecType = typename Impl::CVecType;
 
     static constexpr index_t kM = Impl::kN;
@@ -311,34 +306,27 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
     CK_TILE_DEVICE void
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
-        const auto a_vector = typename vector_type_maker<AVecType, 1>::type{a_vec};
-
-        const auto b_vector = typename vector_type_maker<BVecType, 1>::type{b_vec};
-
         // swap A and B, value and type
         static_for<0, kKIter, 1>{}([&](auto iKIter) {
             Impl{}(c_vec,
-                   b_vector.template AsType<typename Impl::AVecType>()[iKIter],
-                   a_vector.template AsType<typename Impl::BVecType>()[iKIter]);
+                   a_vec.template get_as<typename Impl::AVecType>()[iKIter],
+                   b_vec.template get_as<typename Impl::BVecType>()[iKIter]);
         });
     }
 
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        const auto a_vector = typename vector_type_maker<AVecType, 1>::type{a_vec};
-        const auto b_vector = typename vector_type_maker<BVecType, 1>::type{b_vec};
-
         constexpr auto I0 = number<0>{};
 
         // swap A and B, value and type
-        auto c_vec = Impl{}(b_vector.template AsType<typename Impl::AVecType>()[I0],
-                            a_vector.template AsType<typename Impl::BVecType>()[I0]);
+        auto c_vec = Impl{}(a_vec.template get_as<typename Impl::AVecType>()[I0],
+                            b_vec.template get_as<typename Impl::BVecType>()[I0]);
 
         static_for<1, kKIter, 1>{}([&](auto iKIter) {
             Impl{}(c_vec,
-                   b_vector.template AsType<typename Impl::AVecType>()[iKIter],
-                   a_vector.template AsType<typename Impl::BVecType>()[iKIter]);
+                   a_vec.template get_as<typename Impl::AVecType>()[iKIter],
+                   b_vec.template get_as<typename Impl::BVecType>()[iKIter]);
         });
 
         return c_vec;
@@ -355,8 +343,8 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
     using BDataType = typename Impl::ADataType;
     using CDataType = typename Impl::CDataType;
 
-    using AVecType = typename vector_type_maker<typename Impl::BVecType, kKIter>::type::type;
-    using BVecType = typename vector_type_maker<typename Impl::AVecType, kKIter>::type::type;
+    using AVecType = array<ADataType, Impl::AVecType::size() * kKIter>;
+    using BVecType = array<BDataType, Impl::BVecType::size() * kKIter>;
     using CVecType = typename Impl::CVecType;
 
     static constexpr index_t kM      = Impl::kN;
@@ -418,34 +406,27 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
     CK_TILE_DEVICE void
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
-        const auto a_vector = typename vector_type_maker<AVecType, 1>::type{a_vec};
-
-        const auto b_vector = typename vector_type_maker<BVecType, 1>::type{b_vec};
-
         // swap A and B, value and type
         static_for<0, kKIter, 1>{}([&](auto iKIter) {
             Impl{}(c_vec,
-                   b_vector.template AsType<typename Impl::AVecType>()[iKIter],
-                   a_vector.template AsType<typename Impl::BVecType>()[iKIter]);
+                   b_vec.template get_as<typename Impl::AVecType>()[iKIter],
+                   a_vec.template get_as<typename Impl::BVecType>()[iKIter]);
         });
     }
 
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        const auto a_vector = typename vector_type_maker<AVecType, 1>::type{a_vec};
-        const auto b_vector = typename vector_type_maker<BVecType, 1>::type{b_vec};
-
         constexpr auto I0 = number<0>{};
 
         // swap A and B, value and type
-        auto c_vec = Impl{}(b_vector.template AsType<typename Impl::AVecType>()[I0],
-                            a_vector.template AsType<typename Impl::BVecType>()[I0]);
+        auto c_vec = Impl{}(b_vec.template get_as<typename Impl::AVecType>()[I0],
+                            a_vec.template get_as<typename Impl::BVecType>()[I0]);
 
         static_for<1, kKIter, 1>{}([&](auto iKIter) {
             Impl{}(c_vec,
-                   b_vector.template AsType<typename Impl::AVecType>()[iKIter],
-                   a_vector.template AsType<typename Impl::BVecType>()[iKIter]);
+                   b_vec.template get_as<typename Impl::AVecType>()[iKIter],
+                   a_vec.template get_as<typename Impl::BVecType>()[iKIter]);
         });
 
         return c_vec;
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
index d67c396651..ecc4165e09 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -10,13 +10,13 @@ namespace ck_tile {
 // FP16
 struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
 {
-    using ADataType = half_t;
-    using BDataType = half_t;
+    using ADataType = fp16_t;
+    using BDataType = fp16_t;
     using CDataType = float;
 
-    using AVecType = typename vector_type<half_t, 4>::type;
-    using BVecType = typename vector_type<half_t, 4>::type;
-    using CVecType = typename vector_type<float, 16>::type;
+    using AVecType = array<fp16_t, 4>;
+    using BVecType = array<fp16_t, 4>;
+    using CVecType = array<float, 16>;
 
     static constexpr index_t kM = 32;
     static constexpr index_t kN = 32;
@@ -36,25 +36,37 @@ struct WarpGemmAttributeMfmaImplF16F16F32M32N32K8
     CK_TILE_DEVICE void
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
-        c_vec = __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, c_vec, 0, 0, 0);
+        c_vec.template get_as<fp32x16_t>()[number<0>{}] =
+            __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                 b_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                 c_vec.template get_as<fp32x16_t>()[number<0>{}],
+                                                 0,
+                                                 0,
+                                                 0);
     }
 
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        return __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec, b_vec, CVecType{0.f}, 0, 0, 0);
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_32x32x8f16(a_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                 b_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                 fp32x16_t{0.f},
+                                                 0,
+                                                 0,
+                                                 0));
     }
 };
 
 struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
 {
-    using ADataType = half_t;
-    using BDataType = half_t;
+    using ADataType = fp16_t;
+    using BDataType = fp16_t;
     using CDataType = float;
 
-    using AVecType = typename vector_type<half_t, 4>::type;
-    using BVecType = typename vector_type<half_t, 4>::type;
-    using CVecType = typename vector_type<float, 4>::type;
+    using AVecType = array<fp16_t, 4>;
+    using BVecType = array<fp16_t, 4>;
+    using CVecType = array<float, 4>;
 
     static constexpr index_t kM = 16;
     static constexpr index_t kN = 16;
@@ -74,26 +86,38 @@ struct WarpGemmAttributeMfmaImplF16F16F32M16N16K16
     CK_TILE_DEVICE void
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
-        c_vec = __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, c_vec, 0, 0, 0);
+        c_vec.template get_as<fp32x4_t>()[number<0>{}] =
+            __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                  b_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                  c_vec.template get_as<fp32x4_t>()[number<0>{}],
+                                                  0,
+                                                  0,
+                                                  0);
     }
 
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        return __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec, b_vec, CVecType{0.f}, 0, 0, 0);
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_16x16x16f16(a_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                  b_vec.template get_as<fp16x4_t>()[number<0>{}],
+                                                  fp32x4_t{0.f},
+                                                  0,
+                                                  0,
+                                                  0));
     }
 };
 
 // Bf16
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
 {
-    using ADataType = bhalf_t;
-    using BDataType = bhalf_t;
+    using ADataType = bf16_t;
+    using BDataType = bf16_t;
     using CDataType = float;
 
-    using AVecType = typename vector_type<bhalf_t, 4>::type;
-    using BVecType = typename vector_type<bhalf_t, 4>::type;
-    using CVecType = typename vector_type<float, 16>::type;
+    using AVecType = array<bf16_t, 4>;
+    using BVecType = array<bf16_t, 4>;
+    using CVecType = array<float, 16>;
 
     static constexpr index_t kM = 32;
     static constexpr index_t kN = 32;
@@ -113,25 +137,37 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8
     CK_TILE_DEVICE void
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
-        c_vec = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+        c_vec.template get_as<fp32x16_t>()[number<0>{}] = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(
+            a_vec.template get_as<bf16x4_t>()[number<0>{}],
+            b_vec.template get_as<bf16x4_t>()[number<0>{}],
+            c_vec.template get_as<fp32x16_t>()[number<0>{}],
+            0,
+            0,
+            0);
     }
 
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        return __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec, b_vec, CVecType{0.f}, 0, 0, 0);
+        return bit_cast<CVecType>(
+            __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a_vec.template get_as<bf16x4_t>()[number<0>{}],
+                                                     b_vec.template get_as<bf16x4_t>()[number<0>{}],
+                                                     fp32x16_t{0.f},
+                                                     0,
+                                                     0,
+                                                     0));
     }
 };
 
 struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
 {
-    using ADataType = bhalf_t;
-    using BDataType = bhalf_t;
+    using ADataType = bf16_t;
+    using BDataType = bf16_t;
     using CDataType = float;
 
-    using AVecType = typename vector_type<bhalf_t, 4>::type;
-    using BVecType = typename vector_type<bhalf_t, 4>::type;
-    using CVecType = typename vector_type<float, 4>::type;
+    using AVecType = array<bf16_t, 4>;
+    using BVecType = array<bf16_t, 4>;
+    using CVecType = array<float, 4>;
 
     static constexpr index_t kM = 16;
     static constexpr index_t kN = 16;
@@ -151,13 +187,25 @@ struct WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16
     CK_TILE_DEVICE void
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
-        c_vec = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, c_vec, 0, 0, 0);
+        c_vec.template get_as<fp32x4_t>()[number<0>{}] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
+            a_vec.template get_as<bf16x4_t>()[number<0>{}],
+            b_vec.template get_as<bf16x4_t>()[number<0>{}],
+            c_vec.template get_as<fp32x4_t>()[number<0>{}],
+            0,
+            0,
+            0);
     }
 
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        return __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a_vec, b_vec, CVecType{0.f}, 0, 0, 0);
+        return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
+            a_vec.template get_as<bf16x4_t>()[number<0>{}],
+            b_vec.template get_as<bf16x4_t>()[number<0>{}],
+            fp32x4_t{0.f},
+            0,
+            0,
+            0));
     }
 };
 
@@ -169,9 +217,9 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
     using BDataType = BType_;
     using CDataType = float;
 
-    using AVecType = typename vector_type<ADataType, 8>::type;
-    using BVecType = typename vector_type<BDataType, 8>::type;
-    using CVecType = typename vector_type<CDataType, 16>::type;
+    using AVecType = array<ADataType, 8>;
+    using BVecType = array<BDataType, 8>;
+    using CVecType = array<CDataType, 16>;
 
     static constexpr index_t kM = 32;
     static constexpr index_t kN = 32;
@@ -192,27 +240,49 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
     operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
     {
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-        if constexpr(is_same_v<ADataType, fp8_t> && is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(is_same_v<ADataType, fp8_t> && is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(is_same_v<ADataType, bf8_t> && is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
-        else if constexpr(is_same_v<ADataType, bf8_t> && is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            c_vec.template get_as<fp32x16_t>()[number<0>{}] =
+                __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
+                    bit_cast<long>(a_vec),
+                    bit_cast<long>(b_vec),
+                    c_vec.template get_as<fp32x16_t>()[number<0>{}],
+                    0,
+                    0,
+                    0);
+        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            c_vec.template get_as<fp32x16_t>()[number<0>{}] =
+                __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
+                    bit_cast<long>(a_vec),
+                    bit_cast<long>(b_vec),
+                    c_vec.template get_as<fp32x16_t>()[number<0>{}],
+                    0,
+                    0,
+                    0);
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            c_vec.template get_as<fp32x16_t>()[number<0>{}] =
+                __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
+                    bit_cast<long>(a_vec),
+                    bit_cast<long>(b_vec),
+                    c_vec.template get_as<fp32x16_t>()[number<0>{}],
+                    0,
+                    0,
+                    0);
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            c_vec.template get_as<fp32x16_t>()[number<0>{}] =
+                __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
+                    bit_cast<long>(a_vec),
+                    bit_cast<long>(b_vec),
+                    c_vec.template get_as<fp32x16_t>()[number<0>{}],
+                    0,
+                    0,
+                    0);
 #else
-        vector_type<ADataType, 8> a_(a_vec);
-        vector_type<BDataType, 8> b_(b_vec);
-
         static_for<0, 8, 1>{}([&](auto k) {
-            float a_f32 = type_convert<float>(a_.template AsType<ADataType>()[number<k>{}]);
-            float b_f32 = type_convert<float>(b_.template AsType<BDataType>()[number<k>{}]);
+            float a_f32 = type_convert<float>(a_vec.template get_as<ADataType>()[number<k>{}]);
+            float b_f32 = type_convert<float>(b_vec.template get_as<BDataType>()[number<k>{}]);
 
-            c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
+            c_vec.template get_as<fp32x16_t>()[number<0>{}] = __builtin_amdgcn_mfma_f32_32x32x2f32(
+                a_f32, b_f32, c_vec.template get_as<fp32x16_t>()[number<0>{}], 0, 0, 0);
         });
 #endif
     }
@@ -220,18 +290,18 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base
     // c_vec = a_vec * b_vec
     CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
     {
-        if constexpr(is_same_v<ADataType, fp8_t> && is_same_v<BDataType, fp8_t>)
-            return __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0);
-        else if constexpr(is_same_v<ADataType, fp8_t> && is_same_v<BDataType, bf8_t>)
-            return __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0);
-        else if constexpr(is_same_v<ADataType, bf8_t> && is_same_v<BDataType, fp8_t>)
-            return __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0);
-        else if constexpr(is_same_v<ADataType, bf8_t> && is_same_v<BDataType, bf8_t>)
-            return __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
-                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0);
+        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
+        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
+            return bit_cast<CVecType>(__builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), CVecType{0.f}, 0, 0, 0));
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
index 5309794b23..8d12130308 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
 
 namespace ck_tile {
 
@@ -29,14 +30,14 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution; };
 
 // bf16
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
-template<> struct WarpGemmMfmaDispatcher<ck_tile::bhalf_t, ck_tile::bhalf_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 16, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, false> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 16, 16, 32, true> { using Type = WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution; };
 
 // fp8
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
index 94d0e02931..02c8812d49 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_impl.hpp
@@ -33,9 +33,9 @@ struct WarpGemmImpl
 
     CK_TILE_DEVICE void operator()(CWarpTensor& c, const AWarpTensor& a, const BWarpTensor& b) const
     {
-        using AVec = typename vector_type<ADataType, AWarpTensor::get_thread_buffer_size()>::type;
-        using BVec = typename vector_type<BDataType, BWarpTensor::get_thread_buffer_size()>::type;
-        using CVec = typename vector_type<CDataType, CWarpTensor::get_thread_buffer_size()>::type;
+        using AVec = array<ADataType, AWarpTensor::get_thread_buffer_size()>;
+        using BVec = array<BDataType, BWarpTensor::get_thread_buffer_size()>;
+        using CVec = array<CDataType, CWarpTensor::get_thread_buffer_size()>;
 
         constexpr auto I0 = number<0>{};
 
@@ -53,9 +53,9 @@ struct WarpGemmImpl
     {
         CWarpTensor c;
 
-        using AVec = typename vector_type<ADataType, AWarpTensor::get_thread_buffer_size()>::type;
-        using BVec = typename vector_type<BDataType, BWarpTensor::get_thread_buffer_size()>::type;
-        using CVec = typename vector_type<CDataType, CWarpTensor::get_thread_buffer_size()>::type;
+        using AVec = array<ADataType, AWarpTensor::get_thread_buffer_size()>;
+        using BVec = array<BDataType, BWarpTensor::get_thread_buffer_size()>;
+        using CVec = array<CDataType, CWarpTensor::get_thread_buffer_size()>;
 
         constexpr auto I0 = number<0>{};
 
diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp
index 59d761fb17..a5ba745d29 100644
--- a/include/ck_tile/ops/reduce.hpp
+++ b/include/ck_tile/ops/reduce.hpp
@@ -4,3 +4,4 @@
 #pragma once
 
 #include "ck_tile/ops/reduce/block/block_reduce.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index 7c8a9d4398..176b870ec9 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -188,7 +188,7 @@ CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
     using InDataType  = typename InDistributedTensor_::DataType;
     using AccDataType = remove_cvref_t<AccDataType_>;
 
-    static_assert(is_same_v<InDataType, remove_cvref_t<InDataType_>>, "wrong!");
+    static_assert(std::is_same_v<InDataType, remove_cvref_t<InDataType_>>, "wrong!");
 
     // declare acc_tensor
     constexpr auto acc_dstr =
diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py
index fee4d594bd..033d7d4d24 100644
--- a/include/ck_tile/remod.py
+++ b/include/ck_tile/remod.py
@@ -2,9 +2,11 @@ import pathlib
 from pathlib import Path
 import subprocess
 import os
+import copy
 
 NS = 'ck_tile'
 OPS = 'ops'
+OPS_COMMON = 'common' # common header will be duplicated into ops/* other module
 
 HEADER_COMMON = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
@@ -54,6 +56,15 @@ class submodule_t:
                     f.write(f'#include \"{header_path}\"\n')
                 f.write('\n')
         # print(self.m)
+        # restructure common
+        for k, v in self.m.items():
+            if k == OPS and OPS_COMMON in v.keys():
+                common_list = copy.deepcopy(v[OPS_COMMON])
+                # v.pop(OPS_COMMON)
+                for km in v.keys():
+                    if km != OPS_COMMON:
+                        v[km].extend(common_list)
+
         for k, v in self.m.items():
             if k == OPS:
                 for km, kv in v.items():