This commit is contained in:
Ding, Yi
2026-03-11 23:03:20 -04:00
commit e6cd3f1e3f
6330 changed files with 1132789 additions and 0 deletions

View File

@@ -0,0 +1,240 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <iomanip>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <unordered_map>
#include <vector>
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
namespace ck_tile {
/*
* A host-side utility: a command-line argument parser accepting arguments of the form
*   -[key0]=[value0,value1,value2]   (a single key with a comma-separated value list)
* or
*   -[key0]=[value0] -[key1]=[value1] ...
*/
class ArgParser
{
public:
class Arg
{
public:
std::string name;
std::string value;
std::string help_text;
};
ArgParser() {}
ArgParser& insert(const std::string& _name,
const std::string& _default_value,
const std::string& _help_text)
{
Arg in;
in.name = _name;
in.value = _default_value;
in.help_text = _help_text;
if(input_map.count(_name) != 0)
{
printf("arg:%s already exist\n", _name.c_str());
}
else
{
input_map[_name] = in;
keys.push_back(_name);
}
return *this;
}
void print() const
{
// find max key length
std::string::size_type max_key_length = 11;
for(auto& key : keys)
{
if(max_key_length < key.length())
{
max_key_length = key.length();
}
}
printf("args:\n");
for(auto& key : keys)
{
auto value = input_map.at(key);
std::vector<std::string> help_text_lines;
size_t pos = 0;
for(size_t next_pos = value.help_text.find('\n', pos); next_pos != std::string::npos;)
{
help_text_lines.push_back(std::string(value.help_text.begin() + pos,
value.help_text.begin() + next_pos++));
pos = next_pos;
next_pos = value.help_text.find('\n', pos);
}
help_text_lines.push_back(
std::string(value.help_text.begin() + pos, value.help_text.end()));
std::string default_value = std::string("(default:") + value.value + std::string(")");
std::cout << std::setw(1 + max_key_length - value.name.length()) << "-" << key
<< std::setw(4) << " " << help_text_lines[0] << " " << default_value
<< std::endl;
for(auto help_next_line = std::next(help_text_lines.begin());
help_next_line != help_text_lines.end();
++help_next_line)
{
std::cout << std::setw(1 + max_key_length + 4) << " " << *help_next_line
<< std::endl;
}
}
}
bool parse(int argc, char* argv[], int start_index = 1)
{
if(argc < start_index)
{
printf("not enough args\n");
return false;
}
for(int i = start_index; i < argc; i++)
{
char* cur_arg = argv[i];
if(cur_arg[0] != '-')
{
printf("illegal input\n");
print();
return false;
}
else
{
std::string text(cur_arg + 1);
if(text == "?")
{
print();
return false;
}
auto pos = text.find('=');
if(pos == std::string::npos)
{
printf("arg should be [key]=[value] pair, here:%s\n", text.c_str());
return false;
}
if(pos >= (text.size() - 1))
{
printf("cant find value after \"=\", here:%s\n", text.c_str());
return false;
}
auto key = text.substr(0, pos);
auto value = text.substr(pos + 1);
if(input_map.count(key) == 0)
{
printf("no such arg:%s\n", key.c_str());
return false;
}
input_map[key].value = value;
}
}
return true;
}
std::string get_str(const std::string& name) const
{
std::string value = input_map.at(name).value;
return value;
}
int get_int(const std::string& name) const
{
int value = atoi(input_map.at(name).value.c_str());
return value;
}
uint32_t get_uint32(const std::string& name) const
{
uint32_t value = strtoul(input_map.at(name).value.c_str(), nullptr, 10);
return value;
}
uint64_t get_uint64(const std::string& name) const
{
uint64_t value = strtoull(input_map.at(name).value.c_str(), nullptr, 10);
return value;
}
bool get_bool(const std::string& name) const
{
auto v = input_map.at(name).value;
if(v.compare("t") == 0 || v.compare("true") == 0)
return true;
if(v.compare("f") == 0 || v.compare("false") == 0)
return false;
int value = atoi(v.c_str());
return value != 0;
}
float get_float(const std::string& name) const
{
double value = atof(input_map.at(name).value.c_str());
return static_cast<float>(value);
}
double get_double(const std::string& name) const
{
double value = atof(input_map.at(name).value.c_str());
return value;
}
std::vector<std::string> get_string_vec(const std::string& name,
const std::string& delimiter = ",") const
{
if(get_str(name).empty())
{
return {};
}
std::string s = get_str(name);
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while((pos = s.find(delimiter)) != std::string::npos)
{
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}
std::vector<int> get_int_vec(const std::string& name, const std::string& delimiter = ",") const
{
if(get_str(name).empty())
{
return {};
}
const std::vector<std::string> args = get_string_vec(name, delimiter);
std::vector<int> tokens;
tokens.reserve(args.size());
for(const std::string& token : args)
{
int value = atoi(token.c_str());
tokens.push_back(value);
}
return tokens;
}
private:
std::unordered_map<std::string, Arg> input_map;
std::vector<std::string> keys;
};
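// Usage sketch (illustrative; the keys and flags below are hypothetical, inside main(int argc, char* argv[])):
//   ck_tile::ArgParser parser;
//   parser.insert("b", "1", "batch size").insert("v", "1", "run CPU verification (0/1)");
//   if(!parser.parse(argc, argv))
//       return -1;
//   const int  batch  = parser.get_int("b");
//   const bool verify = parser.get_bool("v");
// Invoked as: ./example -b=64 -v=1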
} // namespace ck_tile
#pragma clang diagnostic pop

View File

@@ -0,0 +1,782 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
#include "ck_tile/core.hpp"
#include "ck_tile/host/ranges.hpp"
namespace ck_tile {
/** @brief Maximum number of error values to display when checking errors */
constexpr int ERROR_DETAIL_LIMIT = 16;
/** @brief 8-bit floating point type */
using F8 = ck_tile::fp8_t;
/** @brief 8-bit brain floating point type */
using BF8 = ck_tile::bf8_t;
/** @brief 16-bit floating point (half precision) type */
using F16 = ck_tile::half_t;
/** @brief 16-bit brain floating point type */
using BF16 = ck_tile::bf16_t;
/** @brief 32-bit floating point (single precision) type */
using F32 = float;
/** @brief 8-bit signed integer type */
using I8 = int8_t;
/** @brief 32-bit signed integer type */
using I32 = int32_t;
/**
* @brief Calculate relative error threshold for numerical comparisons
*
* Calculates the relative error threshold based on the mantissa bits and characteristics
* of the data types involved in the computation.
*
* @tparam ComputeDataType Type used for computation
* @tparam OutDataType Type used for output
* @tparam AccDataType Type used for accumulation (defaults to ComputeDataType)
* @param number_of_accumulations Number of accumulation operations performed
* @return Relative error threshold based on data type characteristics
*/
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_relative_threshold(const int number_of_accumulations = 1)
{
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the relative threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
output_error = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the relative threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error = std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
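// Worked example (a sketch, assuming numeric_traits reports 10 mantissa bits for half_t and 23 for
// float, and a hypothetical GEMM K-dimension of 4096 accumulation steps):
//   double rtol = get_relative_threshold<half_t, half_t, float>(4096);
// evaluates to max(midway 2^-10 * 0.5 = 2^-11, acc 2^-23 * 0.5 * 4096 = 2^-12) = 2^-11 ~= 4.9e-4,
// i.e. the fp16 rounding term dominates the fp32 accumulation term here.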
/**
* @brief Calculate absolute error threshold for numerical comparisons
*
* Calculates the absolute error threshold based on the maximum possible value and
* the characteristics of the data types involved in the computation.
*
* @tparam ComputeDataType Type used for computation
* @tparam OutDataType Type used for output
* @tparam AccDataType Type used for accumulation (defaults to ComputeDataType)
* @param max_possible_num Maximum possible value in the computation
* @param number_of_accumulations Number of accumulation operations performed
* @return Absolute error threshold based on data type characteristics and maximum value
*/
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
CK_TILE_HOST double get_absolute_threshold(const double max_possible_num,
const int number_of_accumulations = 1)
{
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
// Use discrete exponent (floor of log2) to match actual floating-point exponent levels
// This ensures ULP calculation matches the discrete precision levels of FP representation
const int discrete_expo = static_cast<int>(std::floor(std::log2(std::abs(max_possible_num))));
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, discrete_expo - numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
// Use full ULP (1.0) instead of half ULP (0.5) for output_error to account for
// hardware vs software conversion differences (e.g., hardware __bf16 vs software
// float_to_bf16 can differ by up to 1 ULP at tie cases)
output_error = std::pow(2, discrete_expo - numeric_traits<OutDataType>::mant) * 1.0;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error = std::pow(2, discrete_expo - numeric_traits<AccDataType>::mant) * 0.5 *
number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
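// Worked example (a sketch, same mantissa-bit assumptions as above): with max_possible_num = 1.0
// the discrete exponent is 0, so for fp16 compute/output and fp32 accumulation over 4096 steps,
//   double atol = get_absolute_threshold<half_t, half_t, float>(1.0, 4096);
// evaluates to max(compute 2^-11, output 2^-10, acc 2^-12) = 2^-10 ~= 9.8e-4; the full-ULP output
// conversion term dominates.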
/**
* @brief Stream operator overload for vector output
*
* Provides a formatted string representation of a vector, useful for debugging and logging.
*
* @tparam T Type of vector elements
* @param os Output stream
* @param v Vector to output
* @return Reference to the output stream
*/
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
using size_type = typename std::vector<T>::size_type;
os << "[";
for(size_type idx = 0; idx < v.size(); ++idx)
{
if(0 < idx)
{
os << ", ";
}
os << v[idx];
}
return os << "]";
}
/**
* @brief Check for size mismatch between output and reference ranges
*
* Verifies that the output and reference ranges are the same size.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if sizes mismatch
* @return True if sizes mismatch, false otherwise
*/
template <typename Range, typename RefRange>
CK_TILE_HOST bool check_size_mismatch(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!")
{
if(out.size() != ref.size())
{
std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl;
return true;
}
return false;
}
/**
* @brief Report error statistics for numerical comparisons
*
* Outputs statistics about numerical comparison errors including count and maximum error.
*
* @param err_count Number of errors found
* @param max_err Maximum error value encountered
* @param total_size Total number of elements compared
*/
CK_TILE_HOST void report_error_stats(int err_count, double max_err, std::size_t total_size)
{
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(total_size) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
/**
* @brief Check errors between floating point ranges using the specified tolerances.
*
* Compares two ranges of floating point values within specified relative and absolute tolerances.
* This overload handles standard floating point types except half precision floating point.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_floating_point_v<ranges::range_value_t<Range>> &&
!std::is_same_v<ranges::range_value_t<Range>, half_t>,
bool>::type CK_TILE_HOST
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5,
double atol = 3e-6,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<double>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = *std::next(std::begin(out), i);
const double r = *std::next(std::begin(ref), i);
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
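// Usage sketch (illustrative; the data below is hypothetical, with tolerances taken from the
// threshold helpers above):
//   std::vector<float> ref(1024, 1.0f);
//   std::vector<float> out(1024, 1.0f + 1e-6f);
//   const double rtol = get_relative_threshold<float, float>(1024);
//   const double atol = get_absolute_threshold<float, float>(/*max_possible_num=*/1.0, 1024);
//   const bool pass = check_err(out, ref, "Error: Incorrect results!", rtol, atol);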
/**
* @brief Check errors between floating point ranges using the specified tolerances
*
* Compares two ranges of brain floating point values within specified relative and absolute
* tolerances.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, bf16_t>,
bool>::type CK_TILE_HOST
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
// TODO: This is a hack. We should have proper specialization for bf16_t data type.
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
/**
* @brief Check errors between half precision floating point ranges
*
* Compares two ranges of half precision floating point values within specified tolerances.
* This specialization handles the specific requirements and characteristics of half precision
* floating point comparisons.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
typename std::enable_if<
std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, half_t>,
bool>::type CK_TILE_HOST
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = static_cast<double>(std::numeric_limits<ranges::range_value_t<Range>>::min());
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
/**
* @brief Check errors between integer ranges
*
* Compares two ranges of integer values with an absolute tolerance.
* This specialization handles integer types and optionally int4_t when the
* experimental bit int extension is enabled.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param atol Absolute tolerance
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_integral_v<ranges::range_value_t<Range>> &&
!std::is_same_v<ranges::range_value_t<Range>, bf16_t>)
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|| std::is_same_v<ranges::range_value_t<Range>, int4_t>
#endif
,
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double atol = 0)
{
if(check_size_mismatch(out, ref, msg))
return false;
bool res{true};
int err_count = 0;
int64_t err = 0;
int64_t max_err = std::numeric_limits<int64_t>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const int64_t o = *std::next(std::begin(out), i);
const int64_t r = *std::next(std::begin(ref), i);
err = std::abs(o - r);
if(err > atol)
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
<< std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, static_cast<double>(max_err), ref.size());
}
return res;
}
/**
* @brief Check errors between FP8 ranges
*
* Specialized comparison for 8-bit floating point values that takes into account
* the unique characteristics and limitations of FP8 arithmetic, including
* rounding point distances and special handling of infinity values.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param max_rounding_point_distance Maximum allowed distance between rounding points
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, fp8_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
unsigned max_rounding_point_distance = 1,
double atol = 1e-1,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
static const auto get_rounding_point_distance = [](fp8_t o, fp8_t r) -> unsigned {
static const auto get_sign_bit = [](fp8_t v) -> bool {
return 0x80 & bit_cast<uint8_t>(v);
};
if(get_sign_bit(o) ^ get_sign_bit(r))
{
return std::numeric_limits<unsigned>::max();
}
else
{
return std::abs(bit_cast<int8_t>(o) - bit_cast<int8_t>(r));
}
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const fp8_t o_fp8 = *std::next(std::begin(out), i);
const fp8_t r_fp8 = *std::next(std::begin(ref), i);
const double o_fp64 = type_convert<float>(o_fp8);
const double r_fp64 = type_convert<float>(r_fp8);
err = std::abs(o_fp64 - r_fp64);
if(!(less_equal<double>{}(err, atol) ||
get_rounding_point_distance(o_fp8, r_fp8) <= max_rounding_point_distance) ||
is_infinity_error(o_fp64, r_fp64))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o_fp64 << " != " << r_fp64 << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
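// Note on the fp8 criterion above: with max_rounding_point_distance = 1, an output whose raw fp8
// encoding is an immediate neighbour of the reference encoding (same sign, encodings one apart) is
// accepted even when |out - ref| exceeds atol; near the top of the fp8 range adjacent representable
// values are far apart, so a pure atol check would be overly strict there.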
/**
* @brief Check errors between BF8 ranges
*
* Specialized comparison for 8-bit brain floating point values that considers
* the specific numerical properties and error characteristics of the BF8 format.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @param rtol Relative tolerance
* @param atol Absolute tolerance
* @param allow_infinity_ref Whether to allow infinity in reference values
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3,
bool allow_infinity_ref = false)
{
if(check_size_mismatch(out, ref, msg))
return false;
const auto is_infinity_error = [=](auto o, auto r) {
const bool either_not_finite = !std::isfinite(o) || !std::isfinite(r);
const bool both_infinite_and_same =
std::isinf(o) && std::isinf(r) && (bit_cast<uint64_t>(o) == bit_cast<uint64_t>(r));
return either_not_finite && !(allow_infinity_ref && both_infinite_and_same);
};
bool res{true};
int err_count = 0;
double err = 0;
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i)
{
const double o = type_convert<float>(*std::next(std::begin(out), i));
const double r = type_convert<float>(*std::next(std::begin(ref), i));
err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || is_infinity_error(o, r))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < ERROR_DETAIL_LIMIT)
{
std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
}
res = false;
}
}
if(!res)
{
report_error_stats(err_count, max_err, ref.size());
}
return res;
}
/**
* @brief Check errors between pk_fp4_t ranges
*
* Compares two ranges of pk_fp4_t without tolerance.
* This specialization handles ck_tile::pk_fp4_t type.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, pk_fp4_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double = 0)
{
if(check_size_mismatch(out, ref, msg))
return false;
int err_count = 0;
auto update_err = [&](pk_fp4_raw_t o, pk_fp4_raw_t r, std::size_t index) {
if(o != r)
{
std::cerr << msg << " out[" << index << "] != ref[" << index
<< "]: " << type_convert<float>(pk_fp4_t{o})
<< " != " << type_convert<float>(pk_fp4_t{r}) << std::endl;
++err_count;
}
};
for(std::size_t i = 0; i < ref.size(); ++i)
{
const pk_fp4_t o = *std::next(std::begin(out), i);
const pk_fp4_t r = *std::next(std::begin(ref), i);
update_err(o._unpack(number<0>{}), r._unpack(number<0>{}), i * 2);
update_err(o._unpack(number<1>{}), r._unpack(number<1>{}), i * 2 + 1);
}
if(err_count > 0)
{
report_error_stats(err_count, numeric<pk_fp4_t>::max(), ref.size());
}
return err_count == 0;
}
/**
* @brief Check errors between pk_fp6x16_t ranges
*
* Compares two ranges of pk_fp6x16_t without tolerance.
* This specialization handles ck_tile::pk_fp6x16_t type.
*
* @tparam Range Type of output range
* @tparam RefRange Type of reference range
* @param out Output range to check
* @param ref Reference range to check against
* @param msg Error message to display if check fails
* @return True if check passes, false otherwise
*/
template <typename Range, typename RefRange>
std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
std::is_same_v<ranges::range_value_t<Range>, pk_fp6x16_t>),
bool>
CK_TILE_HOST check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double = 0)
{
if(check_size_mismatch(out, ref, msg))
return false;
int err_count = 0;
float max_err = 0.0f;
auto update_err = [&](float o, float r, std::size_t index) {
if(std::fabs(o - r) > 1e-8)
{
std::cerr << msg << " out[" << index << "] != ref[" << index << "]: " << o
<< " != " << r << std::endl;
++err_count;
max_err = std::max(max_err, std::fabs(o - r));
}
};
for(std::size_t i = 0; i < ref.size(); ++i)
{
const pk_fp6x16_t o = *std::next(std::begin(out), i);
const pk_fp6x16_t r = *std::next(std::begin(ref), i);
for(std::size_t j = 0; j < numeric_traits<pk_fp6x16_t>::PackedSize; j++)
{
update_err(o.unpack(j), r.unpack(j), i * numeric_traits<pk_fp6x16_t>::PackedSize + j);
}
}
if(err_count > 0)
{
report_error_stats(err_count, max_err, ref.size());
}
return err_count == 0;
}
} // namespace ck_tile

View File

@@ -0,0 +1,123 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
namespace ck_tile {
template <typename T>
struct IsCharArray : std::false_type
{
};
template <std::size_t N>
struct IsCharArray<char[N]> : std::true_type
{
};
template <std::size_t N>
struct IsCharArray<const char[N]> : std::true_type
{
};
template <std::size_t N>
struct IsCharArray<char (&)[N]> : std::true_type
{
};
template <std::size_t N>
struct IsCharArray<const char (&)[N]> : std::true_type
{
};
template <typename... Ts>
inline constexpr bool AllConvertibleToStringView =
((std::is_convertible_v<Ts, std::string_view> || IsCharArray<Ts>::value ||
std::is_same_v<Ts, char>) &&
...);
template <typename... Ts>
[[nodiscard]] auto
concat(const Ts&... xs) -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
{
using ::operator<<;
thread_local std::ostringstream oss;
oss.str("");
(oss << ... << xs);
return oss.str();
}
template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(char (&)[N]) noexcept
{
return N;
}
template <std::size_t N>
[[nodiscard]] constexpr inline std::size_t getSize(const char (&)[N]) noexcept
{
return N;
}
[[nodiscard]] constexpr inline std::size_t getSize(const char* s) noexcept
{
const char* end = s;
while(*end++ != 0) {}
return end - s - 1;
}
[[nodiscard]] constexpr inline std::size_t getSize(const char&) noexcept { return 1; }
[[nodiscard]] inline std::size_t getSize(const std::string& s) noexcept { return s.size(); }
[[nodiscard]] constexpr inline std::size_t getSize(const std::string_view& s) noexcept
{
return s.size();
}
template <typename... Ts>
auto concatInto(std::string& result,
const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
{
const std::size_t space = (1 + ... + getSize(xs));
result.reserve(result.size() + space);
((result += xs), ...);
}
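// Usage sketch (illustrative): build a kernel name by appending in place, reserving space up front.
//   std::string name{"gemm"};
//   concatInto(name, "_", "fp16", "_rcr");   // name == "gemm_fp16_rcr"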
template <typename... Ts>
[[nodiscard]] auto
concat(const Ts&... xs) -> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
{
std::string result;
concatInto(result, xs...);
return result;
}
// Function for types convertible to std::string_view
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
-> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string>
{
std::string result;
result += first;
((result += sep, result += rest), ...);
return result;
}
// Function for other types
template <typename Sep, typename First, typename... Rest>
[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
-> std::enable_if_t<!AllConvertibleToStringView<First, Rest...>, std::string>
{
using ::operator<<;
thread_local std::ostringstream oss;
oss.str("");
oss << first;
((oss << sep << rest), ...);
return oss.str();
}
} // namespace ck_tile

View File

@@ -0,0 +1,236 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
namespace conv {
namespace detail {
template <typename OldLayout>
CK_TILE_HOST std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
{
using namespace ck_tile::tensor_layout::convolution;
if constexpr(is_any_of<OldLayout, GNCW, GKCX, GNKW>::value)
{
return {0, 1, 2, 3};
}
else if constexpr(is_any_of<OldLayout, GNCHW, GKCYX, GNKHW>::value)
{
return {0, 1, 2, 3, 4};
}
else if constexpr(is_any_of<OldLayout, GNCDHW, GKCZYX, GNKDHW>::value)
{
return {0, 1, 2, 3, 4, 5};
}
    else if constexpr(is_any_of<OldLayout, GNWC, GKXC, GNWK>::value)
{
return {0, 1, 3, 2};
}
else if constexpr(is_any_of<OldLayout, GNHWC, GKYXC, GNHWK>::value)
{
return {0, 1, 4, 2, 3};
}
else if constexpr(is_any_of<OldLayout, GNDHWC, GKZYXC, GNDHWK>::value)
{
return {0, 1, 5, 2, 3, 4};
}
else if constexpr(is_any_of<OldLayout, NWGC, KXGC, NWGK>::value)
{
return {2, 0, 3, 1};
}
else if constexpr(is_any_of<OldLayout, NHWGC, KYXGC, NHWGK>::value)
{
return {3, 0, 4, 1, 2};
}
else if constexpr(is_any_of<OldLayout, NDHWGC, KZYXGC, NDHWGK>::value)
{
return {4, 0, 5, 1, 2, 3};
}
else
{
printf("%s\n", __func__);
throw std::runtime_error("wrong! unsupported layout");
}
}
} // namespace detail
// make a tensor descriptor for a packed input tensor, with dimensions ordered as G, N, C,
// <spatial lengths> regardless of the physical layout
template <typename InLayout>
CK_TILE_HOST HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<InLayout, GNCW, GNCHW, GNCDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<InLayout, GNWC, GNHWC, GNDHWC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<InLayout, NWGC, NHWGC, NDHWGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", InLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<InLayout>());
}
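// Usage sketch (illustrative; assumes `param` describes a 2D convolution stored as NHWGC):
//   auto in_desc = make_input_host_tensor_descriptor_g_n_c_wis_packed<
//       ck_tile::tensor_layout::convolution::NHWGC>(param);
//   // in_desc lengths are ordered [G, N, C, Hi, Wi] while its strides still reflect the
//   // physical NHWGC memory layout.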
// make a tensor descriptor for a packed weight tensor, with dimensions ordered as G, K, C,
// <spatial lengths> regardless of the physical layout
template <typename WeiLayout>
CK_TILE_HOST HostTensorDescriptor
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<WeiLayout, KXC, KYXC, KZYXC>::value)
{
if(param.G_ != 1)
{
throw std::runtime_error("wrong! G != 1");
}
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, GKCX, GKCYX, GKCZYX>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.end(),
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, GKXC, GKYXC, GKZYXC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, KXGC, KYXGC, KZYXGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.C_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", WeiLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<WeiLayout>());
}
// make a tensor descriptor for a packed output tensor, with dimensions ordered as G, N, K,
// <spatial lengths> regardless of the physical layout
template <typename OutLayout>
CK_TILE_HOST HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<OutLayout, GNKW, GNKHW, GNKDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.end(),
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(is_any_of<OutLayout, GNWK, GNHWK, GNDHWK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.begin() + 2,
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<OutLayout, NWGK, NHWGK, NDHWGK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_)};
physical_lengths.insert(physical_lengths.begin() + 1,
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else
{
printf("%s\n", __func__);
printf("%s\n", OutLayout::name);
throw std::runtime_error("wrong! unsupported layout");
}
return transpose_host_tensor_descriptor_given_new2old(
HostTensorDescriptor(physical_lengths),
detail::get_layout_transpose_gnchw_to_old<OutLayout>());
}
} // namespace conv
} // namespace ck_tile

View File

@@ -0,0 +1,277 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <numeric>
#include <iterator>
#include <vector>
namespace ck_tile {
namespace conv {
struct ConvParam
{
ConvParam(ck_tile::index_t n_dim,
ck_tile::index_t group_count,
ck_tile::index_t n_batch,
ck_tile::index_t n_out_channels,
ck_tile::index_t n_in_channels,
const std::vector<ck_tile::index_t>& filters_len,
const std::vector<ck_tile::index_t>& input_len,
const std::vector<ck_tile::index_t>& strides,
const std::vector<ck_tile::index_t>& dilations,
const std::vector<ck_tile::index_t>& left_pads,
const std::vector<ck_tile::index_t>& right_pads)
: num_dim_spatial_(static_cast<ck_tile::long_index_t>(n_dim)),
G_(static_cast<ck_tile::long_index_t>(group_count)),
N_(static_cast<ck_tile::long_index_t>(n_batch)),
K_(static_cast<ck_tile::long_index_t>(n_out_channels)),
C_(static_cast<ck_tile::long_index_t>(n_in_channels)),
filter_spatial_lengths_(num_dim_spatial_),
input_spatial_lengths_(num_dim_spatial_),
output_spatial_lengths_(num_dim_spatial_),
conv_filter_strides_(num_dim_spatial_),
conv_filter_dilations_(num_dim_spatial_),
input_left_pads_(num_dim_spatial_),
input_right_pads_(num_dim_spatial_)
{
        if(static_cast<ck_tile::index_t>(filters_len.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(input_len.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(strides.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(dilations.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(left_pads.size()) != num_dim_spatial_ ||
           static_cast<ck_tile::index_t>(right_pads.size()) != num_dim_spatial_)
{
throw(std::runtime_error(
"ConvParam::ConvParam: "
"parameter size is different from number of declared dimensions!"));
}
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
{
filter_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(filters_len[i]);
input_spatial_lengths_[i] = static_cast<ck_tile::long_index_t>(input_len[i]);
conv_filter_strides_[i] = static_cast<ck_tile::long_index_t>(strides[i]);
conv_filter_dilations_[i] = static_cast<ck_tile::long_index_t>(dilations[i]);
input_left_pads_[i] = static_cast<ck_tile::long_index_t>(left_pads[i]);
input_right_pads_[i] = static_cast<ck_tile::long_index_t>(right_pads[i]);
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
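// e.g. X = 3, conv_dilation_w = 1, Wi = 28, in_left_pad_w = in_right_pad_w = 1, conv_stride_w = 1:
// XEff = 3 and Wo = (28 + 1 + 1 - 3) / 1 + 1 = 28, i.e. a "same"-size output.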
const ck_tile::long_index_t x_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
output_spatial_lengths_[i] =
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
conv_filter_strides_[i] +
1;
}
}
ConvParam(ck_tile::long_index_t n_dim,
ck_tile::long_index_t group_count,
ck_tile::long_index_t n_batch,
ck_tile::long_index_t n_out_channels,
ck_tile::long_index_t n_in_channels,
const std::vector<ck_tile::long_index_t>& filters_len,
const std::vector<ck_tile::long_index_t>& input_len,
const std::vector<ck_tile::long_index_t>& strides,
const std::vector<ck_tile::long_index_t>& dilations,
const std::vector<ck_tile::long_index_t>& left_pads,
const std::vector<ck_tile::long_index_t>& right_pads)
: num_dim_spatial_(n_dim),
G_(group_count),
N_(n_batch),
K_(n_out_channels),
C_(n_in_channels),
filter_spatial_lengths_(filters_len),
input_spatial_lengths_(input_len),
output_spatial_lengths_(num_dim_spatial_),
conv_filter_strides_(strides),
conv_filter_dilations_(dilations),
input_left_pads_(left_pads),
input_right_pads_(right_pads)
{
if(static_cast<ck_tile::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
static_cast<ck_tile::index_t>(input_right_pads_.size()) != num_dim_spatial_)
{
throw(std::runtime_error(
"ConvParam::ConvParam: "
"parameter size is different from number of declared dimensions!"));
}
for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i)
{
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck_tile::long_index_t x_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
output_spatial_lengths_[i] =
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
conv_filter_strides_[i] +
1;
}
}
ck_tile::long_index_t num_dim_spatial_;
ck_tile::long_index_t G_;
ck_tile::long_index_t N_;
ck_tile::long_index_t K_;
ck_tile::long_index_t C_;
std::vector<ck_tile::long_index_t> filter_spatial_lengths_;
std::vector<ck_tile::long_index_t> input_spatial_lengths_;
std::vector<ck_tile::long_index_t> output_spatial_lengths_;
std::vector<ck_tile::long_index_t> conv_filter_strides_;
std::vector<ck_tile::long_index_t> conv_filter_dilations_;
std::vector<ck_tile::long_index_t> input_left_pads_;
std::vector<ck_tile::long_index_t> input_right_pads_;
std::vector<ck_tile::long_index_t> GetOutputSpatialLengths() const
{
return output_spatial_lengths_;
}
std::size_t GetFlops() const
{
// 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
return static_cast<std::size_t>(2) * G_ * N_ * K_ * C_ *
std::accumulate(std::begin(output_spatial_lengths_),
std::next(std::begin(output_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>()) *
std::accumulate(std::begin(filter_spatial_lengths_),
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>());
}
template <typename InDataType>
std::size_t GetInputByte() const
{
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
return sizeof(InDataType) *
(G_ * N_ * C_ *
std::accumulate(std::begin(input_spatial_lengths_),
std::next(std::begin(input_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>()));
}
template <typename WeiDataType>
std::size_t GetWeightByte() const
{
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
return sizeof(WeiDataType) *
(G_ * K_ * C_ *
std::accumulate(std::begin(filter_spatial_lengths_),
std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_),
1,
std::multiplies<>()));
}
template <typename OutDataType>
std::size_t GetOutputByte() const
{
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
return sizeof(OutDataType) * (G_ * N_ * K_ *
std::accumulate(std::begin(output_spatial_lengths_),
std::end(output_spatial_lengths_),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()));
}
template <typename InDataType, typename WeiDataType, typename OutDataType>
std::size_t GetByte() const
{
return GetInputByte<InDataType>() + GetWeightByte<WeiDataType>() +
GetOutputByte<OutDataType>();
}
};
CK_TILE_HOST std::string get_conv_param_parser_helper_msg()
{
std::string msg;
msg += "Following arguments (depending on number of spatial dims):\n"
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
return msg;
}
CK_TILE_HOST ck_tile::conv::ConvParam
parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
{
const ck_tile::long_index_t G = std::stol(argv[arg_idx++]);
const ck_tile::long_index_t N = std::stol(argv[arg_idx++]);
const ck_tile::long_index_t K = std::stol(argv[arg_idx++]);
const ck_tile::long_index_t C = std::stol(argv[arg_idx++]);
std::vector<ck_tile::long_index_t> filter_spatial_lengths(num_dim_spatial);
std::vector<ck_tile::long_index_t> input_spatial_lengths(num_dim_spatial);
std::vector<ck_tile::long_index_t> conv_filter_strides(num_dim_spatial);
std::vector<ck_tile::long_index_t> conv_filter_dilations(num_dim_spatial);
std::vector<ck_tile::long_index_t> input_left_pads(num_dim_spatial);
std::vector<ck_tile::long_index_t> input_right_pads(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
filter_spatial_lengths[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_spatial_lengths[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
conv_filter_strides[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
conv_filter_dilations[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_left_pads[i] = std::stol(argv[arg_idx++]);
}
for(int i = 0; i < num_dim_spatial; ++i)
{
input_right_pads[i] = std::stol(argv[arg_idx++]);
}
return ck_tile::conv::ConvParam{num_dim_spatial,
G,
N,
K,
C,
filter_spatial_lengths,
input_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
}
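// Usage sketch (illustrative): for a 2D problem the arguments starting at `arg_idx` are read as
//   G N K C  Y X  Hi Wi  Sy Sx  Dy Dx  LeftPy LeftPx  RightPy RightPx
// e.g. the hypothetical argument list "1 32 64 16  3 3  28 28  1 1  1 1  1 1  1 1" parsed via
//   auto param = parse_conv_param(/*num_dim_spatial=*/2, arg_idx, argv);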
} // namespace conv
} // namespace ck_tile

View File

@@ -0,0 +1,195 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime.h>
#include <stdint.h>
#include <stdexcept>
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <typename T>
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
{
for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
{
p[i] = x;
}
}
/**
* @brief Manages device memory allocation and host-device data transfers
*
* DeviceMem encapsulates GPU memory management operations using HIP runtime API.
* It provides functionality for allocating device memory, transferring data between
* host and device, and performing basic memory operations.
*
* Key features:
* - Automatic memory allocation and deallocation
* - Host-to-device and device-to-host data transfers
* - Memory initialization operations
* - Integration with HostTensor for simplified data handling
*
* Usage example:
* ```
 * // Allocate device memory sized for a host tensor
 * HostTensor<float> AHostData({256});
 * DeviceMem d_mem(AHostData.get_element_space_size_in_bytes());
 *
 * // Transfer data to device
 * d_mem.ToDevice(AHostData.data());
*
* // Retrieve data from device
* HostTensor<float> ResultHostTensor({256});
* d_mem.FromDevice(ResultHostTensor.data());
* ```
*/
struct DeviceMem
{
DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
if(mMemSize != 0)
{
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
else
{
mpDeviceBuf = nullptr;
}
}
template <typename T>
DeviceMem(const HostTensor<T>& t) : mMemSize(t.get_element_space_size_in_bytes())
{
if(mMemSize != 0)
{
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
else
{
mpDeviceBuf = nullptr;
}
ToDevice(t.data());
}
void Realloc(std::size_t mem_size)
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
}
mMemSize = mem_size;
if(mMemSize != 0)
{
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
else
{
mpDeviceBuf = nullptr;
}
}
void* GetDeviceBuffer() const { return mpDeviceBuf; }
std::size_t GetBufferSize() const { return mMemSize; }
void ToDevice(const void* p) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
// else
// {
// throw std::runtime_error("ToDevice with an empty pointer");
// }
}
void ToDevice(const void* p, const std::size_t cpySize) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
}
}
void FromDevice(void* p) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
// else
// {
// throw std::runtime_error("FromDevice with an empty pointer");
// }
}
void FromDevice(void* p, const std::size_t cpySize) const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
}
}
// construct a host tensor with type T
template <typename T>
HostTensor<T> ToHost(std::size_t cpySize)
{
// TODO: host tensor could be slightly larger than the device tensor
// we just copy all data from GPU buffer
std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T);
HostTensor<T> h_({host_elements});
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
}
return h_;
}
template <typename T>
HostTensor<T> ToHost()
{
return ToHost<T>(mMemSize);
}
void SetZero() const
{
if(mpDeviceBuf)
{
HIP_CHECK_ERROR(hipMemset(mpDeviceBuf, 0, mMemSize));
}
}
template <typename T>
void SetValue(T x) const
{
if(mpDeviceBuf)
{
if(mMemSize % sizeof(T) != 0)
{
throw std::runtime_error("wrong! not entire DeviceMem will be set");
}
// TODO: call a gpu kernel to set the value (?)
set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
}
}
~DeviceMem()
{
if(mpDeviceBuf)
{
try
{
HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
}
catch(std::runtime_error& re)
{
std::cerr << re.what() << std::endl;
}
}
}
void* mpDeviceBuf; ///< pointer to device buffer
std::size_t mMemSize; ///< size of device buffer in bytes
};
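// Additional usage sketch (illustrative): zero-initialise a buffer on the device, then read it back.
//   DeviceMem acc_buf(1024 * sizeof(float));
//   acc_buf.SetValue(0.f);                       // launches set_buffer_value<float>
//   HostTensor<float> acc_host = acc_buf.ToHost<float>();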
} // namespace ck_tile

View File

@@ -0,0 +1,89 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#ifndef __HIPCC_RTC__
#include <string>
#include <string_view>
#include <hip/hip_runtime.h>
namespace ck_tile {
constexpr unsigned int fnv1a_hash(std::string_view str, unsigned int h = 2166136261u)
{
return str.empty() ? h
: fnv1a_hash(str.substr(1),
(h ^ static_cast<unsigned char>(str.front())) * 16777619u);
}
inline std::string get_device_name()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return std::string();
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return std::string();
}
const std::string raw_name(props.gcnArchName);
const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
switch(fnv1a_hash(name))
{
// https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
case fnv1a_hash("Ellesmere"):
case fnv1a_hash("Baffin"):
case fnv1a_hash("RacerX"):
case fnv1a_hash("Polaris10"):
case fnv1a_hash("Polaris11"):
case fnv1a_hash("Tonga"):
case fnv1a_hash("Fiji"):
case fnv1a_hash("gfx800"):
case fnv1a_hash("gfx802"):
case fnv1a_hash("gfx804"): return "gfx803";
case fnv1a_hash("Vega10"):
case fnv1a_hash("gfx901"): return "gfx900";
case fnv1a_hash("10.3.0 Sienna_Cichlid 18"): return "gfx1030";
default: return name;
}
}
inline bool is_gfx11_supported()
{
return get_device_name() == "gfx1100" || get_device_name() == "gfx1101" ||
get_device_name() == "gfx1102" || get_device_name() == "gfx1103" ||
get_device_name() == "gfx1150" || get_device_name() == "gfx1151" ||
get_device_name() == "gfx1152" || get_device_name() == "gfx1153";
}
inline bool is_gfx12_supported()
{
return get_device_name() == "gfx1200" || get_device_name() == "gfx1201";
}
inline bool is_gfx95_supported() { return get_device_name() == "gfx950"; }
inline size_t get_num_cus()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return 0;
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return 0;
}
return static_cast<size_t>(props.multiProcessorCount);
}
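// Usage sketch (illustrative; the variable names are hypothetical):
//   const std::string arch = get_device_name();   // e.g. "gfx942" or "gfx1100"
//   const bool has_wmma    = is_gfx11_supported() || is_gfx12_supported();
//   const size_t cu_count  = get_num_cus();       // useful as a grid-sizing hint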
} // namespace ck_tile
#endif

View File

@@ -0,0 +1,549 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cmath>
#include <iterator>
#include <optional>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <utility>
#include <unordered_set>
#include "ck_tile/core.hpp"
#include "ck_tile/host/joinable_thread.hpp"
namespace ck_tile {
/**
* @brief Functor for filling a range with randomly generated values from a uniform distribution.
*
* This struct provides functionality to fill iterators or ranges with random values
* generated from a uniform distribution. It supports both single-threaded and
* multi-threaded operation.
*
* @tparam T The target type for the generated values.
*
* @note The multi-threaded implementation is not guaranteed to provide perfectly
* distributed values across threads.
*
* @example
*
* // Direct usage without creating a separate variable:
* ck_tile::FillUniformDistribution<>{-1.f, 1.f}(a_host_tensor);
*/
template <typename T = void>
struct FillUniformDistribution
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
if(first == last)
return;
using T_iter = std::decay_t<decltype(*first)>;
static_assert(std::is_same_v<T, T_iter> || std::is_void_v<T>,
"Iterator value type must match template type T");
constexpr auto PackedSize = numeric_traits<T_iter>::PackedSize;
const auto total = static_cast<size_t>(std::distance(first, last));
const auto total_bytes = total * sizeof(T_iter);
// max 80 threads; at least 2MB per thread
const size_t available_cpu_cores = get_available_cpu_cores();
constexpr uint64_t MAX_THREAD_COUNT = 80;
const size_t num_thread = min(
MAX_THREAD_COUNT, available_cpu_cores, integer_divide_ceil(total_bytes, 0x200000UL));
constexpr size_t BLOCK_BYTES = 64;
constexpr size_t BLOCK_SIZE = BLOCK_BYTES / sizeof(T_iter);
const size_t num_blocks = integer_divide_ceil(total_bytes, BLOCK_BYTES);
const size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread);
// use minstd_rand for better performance on discard()
std::minstd_rand gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::uniform_real_distribution<float> dis(a_, b_);
std::vector<joinable_thread> threads;
threads.reserve(num_thread - 1); // last job run in the main thread
for(int it = num_thread - 1; it >= 0; --it)
{
const size_t ib_begin = it * blocks_per_thread;
const size_t ib_end = min(ib_begin + blocks_per_thread, num_blocks);
auto job = [=]() {
auto g_ = gen; // copy
auto d_ = dis; // copy
g_.discard(ib_begin * BLOCK_SIZE * PackedSize);
auto t_fn = [&]() {
if constexpr(PackedSize == 2)
return type_convert<T_iter>(fp32x2_t{d_(g_), d_(g_)});
else
return type_convert<T_iter>(d_(g_));
};
size_t ib = ib_begin;
for(; ib < ib_end - 1; ++ib) // full blocks
static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) {
constexpr size_t iw = iw_.value;
*(first + ib * BLOCK_SIZE + iw) = t_fn();
});
for(size_t iw = 0; iw < BLOCK_SIZE; ++iw) // last block
if(ib * BLOCK_SIZE + iw < total)
*(first + ib * BLOCK_SIZE + iw) = t_fn();
};
if(it > 0)
threads.emplace_back(std::move(job));
else
job(); // last job run in the main thread
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <>
struct FillUniformDistribution<ck_tile::pk_int4_t>
{
float a_{-8.f}; // same type as primary template so that
// `FillUniformDistribution<Type>{-5.0f, 5.0f}` works for all types
float b_{7.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
if(a_ < -8.0f || b_ > 7.0f)
{
throw std::runtime_error(
"a_ or b_ of FillUniformDistribution<ck_tile::pk_int4_t> is out of range.");
}
int min_value = static_cast<int>(a_);
int max_value = static_cast<int>(b_);
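// Lookup table of packed int4 bytes: entry i packs the signed 4-bit value (i - 8) into both
// nibbles (e.g. index 0 -> 0x88 == {-8, -8}, index 15 -> 0x77 == {7, 7}).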
constexpr auto int4_array = std::array<uint8_t, 16>{0x88,
0x99,
0xaa,
0xbb,
0xcc,
0xdd,
0xee,
0xff,
0x00,
0x11,
0x22,
0x33,
0x44,
0x55,
0x66,
0x77};
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
// both bounds of uniform_int_distribution are inclusive, so the upper bound is max - min
std::uniform_int_distribution<std::int32_t> dis(0, max_value - min_value);
while(first != last)
{
int randomInt = dis(gen);
*first = int4_array[randomInt + (min_value + 8)];
++first;
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
namespace impl {
// clang-format off
template<index_t bytes> struct RawIntegerType_ {};
template<> struct RawIntegerType_<1> { using type = uint8_t;};
template<> struct RawIntegerType_<2> { using type = uint16_t;};
template<> struct RawIntegerType_<4> { using type = uint32_t;};
template<> struct RawIntegerType_<8> { using type = uint64_t;};
// clang-format on
template <typename T>
using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type;
} // namespace impl
// Note: this struct is intentionally stateful (non-const operator()) so that the values it
// generates stay unique across repeated calls
template <typename T>
struct FillUniformDistribution_Unique
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
std::mt19937 gen_{};
std::unordered_set<impl::RawIntegerType<T>> set_{};
FillUniformDistribution_Unique(float a = -5.f,
float b = 5.f,
std::optional<uint32_t> seed = {11939})
: a_(a),
b_(b),
seed_(seed),
gen_{seed_.has_value() ? *seed_ : std::random_device{}()},
set_{}
{
}
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last)
{
std::mt19937& gen = gen_;
std::uniform_real_distribution<float> dis(a_, b_);
auto& set = set_;
std::generate(first, last, [&dis, &gen, &set]() {
T v = static_cast<T>(0);
do
{
v = ck_tile::type_convert<T>(dis(gen));
} while(set.count(bit_cast<impl::RawIntegerType<T>>(v)) == 1);
set.insert(bit_cast<impl::RawIntegerType<T>>(v));
return v;
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range)
-> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
void clear() { set_.clear(); }
};
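// Illustrative usage sketch (not part of the library): the functor remembers the values it has
// already produced, so repeated calls keep yielding distinct values until clear() is called.
//   FillUniformDistribution_Unique<float> fill{-1.f, 1.f};
//   std::vector<float> x(8), y(8);
//   fill(x);      // 8 distinct values
//   fill(y);      // 8 further values, also distinct from those already written to x
//   fill.clear(); // forget the history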
template <typename T>
struct FillNormalDistribution
{
float mean_{0.f};
float variance_{1.f};
std::optional<uint32_t> seed_{11939};
// ATTENTION: the threaded path does not guarantee an identical value distribution across threads
bool threaded = false;
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
if(threaded)
{
uint32_t num_thread = std::thread::hardware_concurrency();
auto total = static_cast<std::size_t>(std::distance(first, last));
auto work_per_thread = static_cast<std::size_t>((total + num_thread - 1) / num_thread);
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, total);
auto thread_f = [this, total, iw_begin, iw_end, &first] {
if(iw_begin > total || iw_end > total)
return;
// need to make each thread unique, add an offset to current seed
std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin)
: std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() {
return ck_tile::type_convert<T>(dis(gen));
});
};
threads[it] = joinable_thread(thread_f);
}
}
else
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
}
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillNormalDistribution&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
// However this produced segfaults in std::mt19937 that looked like an infinite loop.
// template <typename T>
// struct FillUniformDistributionIntegerValue
// {
// int a_{-5};
// int b_{5};
//
// template <typename ForwardIter>
// void operator()(ForwardIter first, ForwardIter last) const
// {
// std::mt19937 gen(11939);
// std::uniform_int_distribution<int> dis(a_, b_);
// std::generate(
// first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(dis(gen)); });
// }
// };
// Workaround for uniform_int_distribution not working as expected. See note above.
template <typename T>
struct FillUniformDistributionIntegerValue
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::uniform_real_distribution<float> dis(a_, b_);
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillUniformDistributionIntegerValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
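// Illustrative usage sketch (not part of the library): despite the float members, the values
// written are whole numbers, because each sample is rounded before conversion.
//   std::vector<int> v(16);
//   FillUniformDistributionIntegerValue<int>{-3.f, 3.f}(v); // values in {-3, ..., 3}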
template <typename T>
struct FillNormalDistributionIntegerValue
{
float mean_{0.f};
float variance_{1.f};
std::optional<uint32_t> seed_{11939};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}());
std::normal_distribution<float> dis(mean_, std::sqrt(variance_));
std::generate(
first, last, [&dis, &gen]() { return ck_tile::type_convert<T>(std::round(dis(gen))); });
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillNormalDistributionIntegerValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T>
struct FillMonotonicSeq
{
T init_value_{0};
T step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, *this, n = init_value_]() mutable {
auto tmp = n;
if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
{
n.data += step_.data;
}
else
{
n += step_;
}
return tmp;
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillMonotonicSeq&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
template <typename T, bool IsAscending = true>
struct FillStepRange
{
float start_value_{0};
float end_value_{3};
float step_{1};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::generate(first, last, [=, *this, n = start_value_]() mutable {
auto tmp = n;
n += step_;
if constexpr(IsAscending)
{
if(n > end_value_)
n = start_value_;
}
else
{
if(n < end_value_)
n = start_value_;
}
return type_convert<T>(tmp);
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillStepRange&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
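// Illustrative example (ascending case): FillStepRange<float>{0.f, 3.f, 1.f} emits the
// repeating sequence 0, 1, 2, 3, 0, 1, 2, 3, ... until the range is filled.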
template <typename T>
struct FillConstant
{
T value_{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, value_);
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillConstant&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
//----------------------------------------------------------------------------------------------
/// @brief Transforms the given input to fit the 2:4 structured sparsity pattern, so that
///        every subgroup of 4 elements contains at most 2 non-zero elements
template <typename T>
struct AdjustToStructuredSparsity
{
size_t start{0};
// masks represent all valid 2:4 structured sparsity permutations
// clang-format off
static constexpr int32_t masks[] = {0, 0, 1, 1,
0, 1, 0, 1,
0, 1, 1, 0,
1, 0, 0, 1,
1, 0, 1, 0,
1, 1, 0, 0,
0, 0, 0, 1,
0, 0, 1, 0,
0, 1, 0, 0,
1, 0, 0, 0};
// clang-format on
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::transform(first, last, first, [=, *this, index = start](T val) mutable {
auto tmp = val * masks[index % (sizeof(masks) / sizeof(int32_t))];
index += 1;
return type_convert<T>(tmp);
});
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const AdjustToStructuredSparsity&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
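// Illustrative example: applied to the sequence {1, 2, 3, 4, 5, 6, 7, 8} starting at index 0,
// the first two mask rows {0,0,1,1} and {0,1,0,1} produce {0, 0, 3, 4, 0, 6, 0, 8};
// each group of four keeps at most two non-zero elements.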
template <typename T, bool UseCos = true, bool UseAbs = false>
struct FillTrigValue
{
template <typename T_, bool UseCos_ = true, bool UseAbs_ = false>
struct LinearTrigGen
{
int i{0};
auto operator()()
{
float v = 0;
if constexpr(UseCos_)
{
v = cos(i);
}
else
{
v = sin(i);
}
if constexpr(UseAbs_)
v = abs(v);
i++;
return ck_tile::type_convert<T_>(v);
}
};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
LinearTrigGen<T, UseCos, UseAbs> gen;
std::generate(first, last, gen);
}
template <typename ForwardRange>
auto operator()(ForwardRange&& range) const
-> std::void_t<decltype(std::declval<const FillTrigValue&>()(
std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range))))>
{
(*this)(std::begin(std::forward<ForwardRange>(range)),
std::end(std::forward<ForwardRange>(range)));
}
};
} // namespace ck_tile

View File

@@ -0,0 +1,36 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime.h>
namespace ck_tile {
// GPU kernel to invalidate instruction cache for accurate benchmarking.
// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
// forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
// 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel
// exits. Without these NOPs, the flush may not finish, leading to inconsistent
// timing measurements where some instructions remain cached.
static __global__ void flush_cache()
{
asm __volatile__("s_icache_inv \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t"
"s_nop 0 \n\t" ::
:);
}
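// Illustrative launch sketch (num_cus, stream and the launch geometry below are placeholder
// choices for illustration, not a library default); the kernel takes no arguments:
//   flush_cache<<<dim3(num_cus), dim3(64), 0, stream>>>();
//   (void)hipGetLastError();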
} // namespace ck_tile

View File

@@ -0,0 +1,103 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <stdint.h>
#if defined(_WIN32) || defined(_WIN64)
// Windows
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#if !defined(NOMINMAX)
#define NOMINMAX
#endif
#include <Windows.h>
#endif
namespace ck_tile {
// Time structure to hold nanoseconds since epoch or arbitrary start point
struct timepoint_t
{
int64_t nanoseconds;
};
// Platform-specific includes and implementation
#if defined(_WIN32) || defined(_WIN64)
static inline timepoint_t high_res_now()
{
// Cache the performance counter frequency; it is constant for the system lifetime.
static LARGE_INTEGER frequency = []() {
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
return f;
}();
LARGE_INTEGER counter;
timepoint_t tp;
QueryPerformanceCounter(&counter);
// Convert to nanoseconds using floating-point to avoid 64-bit integer overflow
tp.nanoseconds =
static_cast<int64_t>((static_cast<long double>(counter.QuadPart) * 1000000000.0L) /
static_cast<long double>(frequency.QuadPart));
return tp;
}
#elif defined(__linux__) || defined(__unix__) || defined(_POSIX_VERSION)
// Linux/Unix/POSIX
#include <time.h>
static inline timepoint_t high_res_now()
{
struct timespec ts;
timepoint_t tp;
// Use CLOCK_MONOTONIC for consistent timing unaffected by system time changes
// Use CLOCK_REALTIME if you need wall-clock time
clock_gettime(CLOCK_MONOTONIC, &ts);
tp.nanoseconds = static_cast<int64_t>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
return tp;
}
#else
// Fallback for other platforms
#include <time.h>
static inline timepoint_t high_res_now()
{
timepoint_t tp;
time_t t = time(NULL);
tp.nanoseconds = static_cast<int64_t>(t * 1000000000LL);
return tp;
}
#endif
// Duration calculation functions
static inline int64_t duration_ns(timepoint_t start, timepoint_t end)
{
return end.nanoseconds - start.nanoseconds;
}
static inline int64_t duration_us(timepoint_t start, timepoint_t end)
{
return (end.nanoseconds - start.nanoseconds) / 1000LL;
}
static inline int64_t duration_ms(timepoint_t start, timepoint_t end)
{
return (end.nanoseconds - start.nanoseconds) / 1000000LL;
}
static inline double duration_sec(timepoint_t start, timepoint_t end)
{
return static_cast<double>(end.nanoseconds - start.nanoseconds) / 1000000000.0;
}
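// Illustrative usage sketch:
//   timepoint_t t0 = high_res_now();
//   run_workload();                        // hypothetical CPU-side work
//   timepoint_t t1 = high_res_now();
//   double elapsed = duration_sec(t0, t1); // elapsed time in seconds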
} // namespace ck_tile

View File

@@ -0,0 +1,36 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/config.hpp"
#include <sstream>
#include <stdexcept>
#include <hip/hip_runtime.h>
namespace ck_tile {
// To be removed: this helper cannot report the location of the failing HIP call, because
// __FILE__ and __LINE__ below resolve to this header rather than the call site
CK_TILE_HOST void hip_check_error(hipError_t x)
{
if(x != hipSuccess)
{
std::ostringstream ss;
ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
<< "in function: " << __func__;
throw std::runtime_error(ss.str());
}
}
} // namespace ck_tile
#define HIP_CHECK_ERROR(retval_or_funcall) \
do \
{ \
hipError_t _tmpVal = retval_or_funcall; \
if(_tmpVal != hipSuccess) \
{ \
std::ostringstream ostr; \
ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
<< hipGetErrorString(_tmpVal); \
throw std::runtime_error(ostr.str()); \
} \
} while(0)
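// Illustrative usage sketch: wrap any HIP call that returns hipError_t; on failure a
// std::runtime_error is thrown carrying the file and line of the call site.
//   void* ptr = nullptr;
//   HIP_CHECK_ERROR(hipMalloc(&ptr, 1024));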

View File

@@ -0,0 +1,865 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cassert>
#include <iostream>
#include <iomanip>
#include <numeric>
#include <utility>
#include <vector>
#include <functional>
#include <fstream>
#include "ck_tile/core.hpp"
#include "ck_tile/host/joinable_thread.hpp"
#include "ck_tile/host/ranges.hpp"
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
namespace ck_tile {
template <typename Range>
CK_TILE_HOST std::ostream& LogRange(std::ostream& os,
Range&& range,
std::string delim,
int precision = std::cout.precision(),
int width = 0)
{
bool first = true;
for(auto&& v : range)
{
if(first)
first = false;
else
os << delim;
os << std::setw(width) << std::setprecision(precision) << v;
}
return os;
}
template <typename T, typename Range>
CK_TILE_HOST std::ostream& LogRangeAsType(std::ostream& os,
Range&& range,
std::string delim,
int precision = std::cout.precision(),
int width = 0)
{
bool first = true;
for(auto&& v : range)
{
if(first)
first = false;
else
os << delim;
os << std::setw(width) << std::setprecision(precision) << static_cast<T>(v);
}
return os;
}
template <typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
return f(std::get<Is>(args)...);
}
template <typename F, typename T>
CK_TILE_HOST auto call_f_unpack_args(F f, T args)
{
constexpr std::size_t N = std::tuple_size<T>{};
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
}
template <typename F, typename T, std::size_t... Is>
CK_TILE_HOST auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
return F(std::get<Is>(args)...);
}
template <typename F, typename T>
CK_TILE_HOST auto construct_f_unpack_args(F, T args)
{
constexpr std::size_t N = std::tuple_size<T>{};
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
/**
* @brief Descriptor for tensors in host memory.
*
* HostTensorDescriptor manages the shape (dimensions) and memory layout (strides)
* of a tensor in host memory. It provides functionality to:
* - Store tensor dimensions and strides
* - Calculate default strides for contiguous memory layout
* - Convert multi-dimensional indices to linear memory offsets
* - Query tensor metadata (dimensions, element counts, etc.)
*
* The class supports both automatic stride calculation for contiguous memory layout
* and custom strides for more complex memory patterns.
*/
struct HostTensorDescriptor
{
HostTensorDescriptor() = default;
void CalculateStrides()
{
mStrides.clear();
mStrides.resize(mLens.size(), 0);
if(mStrides.empty())
return;
mStrides.back() = 1;
std::partial_sum(mLens.rbegin(),
mLens.rend() - 1,
mStrides.rbegin() + 1,
std::multiplies<std::size_t>());
}
template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
template <typename Lengths,
typename = std::enable_if_t<
std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t>>>
HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end())
{
this->CalculateStrides();
}
template <typename X,
typename Y,
typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
std::is_convertible_v<Y, std::size_t>>>
HostTensorDescriptor(const std::initializer_list<X>& lens,
const std::initializer_list<Y>& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
template <typename Lengths,
typename Strides,
typename = std::enable_if_t<
std::is_convertible_v<ck_tile::ranges::range_value_t<Lengths>, std::size_t> &&
std::is_convertible_v<ck_tile::ranges::range_value_t<Strides>, std::size_t>>>
HostTensorDescriptor(const Lengths& lens, const Strides& strides)
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
std::size_t get_num_of_dimension() const { return mLens.size(); }
/**
* @brief Calculates the total number of elements in the tensor.
*
* Computes the product of all dimension lengths to determine the
* total element count in the tensor.
*
* @pre The lengths array (mLens) and strides array (mStrides) must have
* the same size.
*
* @return The total number of elements in the tensor.
*/
std::size_t get_element_size() const
{
assert(mLens.size() == mStrides.size());
return std::accumulate(
mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
}
/**
* @brief Calculates the total element space required for the tensor in memory.
*
* This method computes the minimum size of contiguous memory needed to store
* all elements of the tensor, taking into account the tensor's dimensions and
* strides. The calculation is based on the formula: 1 + sum((length_i - 1) * stride_i),
* summed over all dimensions.
*
* Dimensions with length 0 are skipped in this calculation.
*
* @return The size of the tensor's element space (number of elements).
*/
std::size_t get_element_space_size() const
{
std::size_t space = 1;
for(std::size_t i = 0; i < mLens.size(); ++i)
{
if(mLens[i] == 0)
continue;
space += (mLens[i] - 1) * mStrides[i];
}
return space;
}
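// Example: lengths {2, 3} with strides {4, 1} (a row-padded layout) require
// 1 + (2 - 1) * 4 + (3 - 1) * 1 = 7 elements of storage, even though only 6 are used.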
std::size_t get_length(std::size_t dim) const { return mLens[dim]; }
const std::vector<std::size_t>& get_lengths() const { return mLens; }
std::size_t get_stride(std::size_t dim) const { return mStrides[dim]; }
const std::vector<std::size_t>& get_strides() const { return mStrides; }
/**
* @brief Calculates the linear offset from multi-dimensional indices.
*
* Converts a set of N-dimensional indices into a single linear offset by computing
* the inner product of the indices with the tensor's strides.
*
* @tparam Is Parameter pack of index types (should be convertible to std::size_t)
* @param is Variable number of indices, one for each dimension of the tensor
* @return std::size_t Linear offset corresponding to the given multi-dimensional indices
*
* @pre The number of indices must match the number of dimensions in the tensor
*/
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
assert(sizeof...(Is) == this->get_num_of_dimension());
std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
/**
* @brief Calculates the linear memory offset from a multi-dimensional index
*
* Computes the linear offset by performing an inner product between the provided
* multi-dimensional indices and the tensor's strides.
*
* @param iss Vector containing the multi-dimensional indices
* @return The calculated linear offset as a size_t
*/
std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
{
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
{
os << "dim " << desc.get_num_of_dimension() << ", ";
os << "lengths {";
LogRange(os, desc.get_lengths(), ", ");
os << "}, ";
os << "strides {";
LogRange(os, desc.get_strides(), ", ");
os << "}";
return os;
}
private:
std::vector<std::size_t> mLens; ///< Lengths of each dimension
std::vector<std::size_t> mStrides; ///< Strides for each dimension
};
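// Illustrative example (not part of the library): a contiguous 2x3x4 descriptor gets the
// default strides {12, 4, 1}, so the element at index (1, 2, 3) lives at linear offset
// 1*12 + 2*4 + 3*1 = 23.
//   HostTensorDescriptor desc({2, 3, 4});
//   std::size_t off = desc.GetOffsetFromMultiIndex(1, 2, 3); // 23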
template <typename New2Old>
CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(
const HostTensorDescriptor& a, const New2Old& new2old)
{
std::vector<std::size_t> new_lengths(a.get_num_of_dimension());
std::vector<std::size_t> new_strides(a.get_num_of_dimension());
for(std::size_t i = 0; i < a.get_num_of_dimension(); i++)
{
new_lengths[i] = a.get_lengths()[new2old[i]];
new_strides[i] = a.get_strides()[new2old[i]];
}
return HostTensorDescriptor(new_lengths, new_strides);
}
template <typename F, typename... Xs>
struct ParallelTensorFunctor
{
F mF;
static constexpr std::size_t NDIM = sizeof...(Xs);
std::array<std::size_t, NDIM> mLens;
std::array<std::size_t, NDIM> mStrides;
std::size_t mN1d;
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
{
mStrides.back() = 1;
std::partial_sum(mLens.rbegin(),
mLens.rend() - 1,
mStrides.rbegin() + 1,
std::multiplies<std::size_t>());
mN1d = mStrides[0] * mLens[0];
}
std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
{
std::array<std::size_t, NDIM> indices;
for(std::size_t idim = 0; idim < NDIM; ++idim)
{
indices[idim] = i / mStrides[idim];
i -= indices[idim] * mStrides[idim];
}
return indices;
}
void operator()(std::size_t num_thread = 1) const
{
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
auto f = [this, iw_begin, iw_end] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
call_f_unpack_args(this->mF, this->GetNdIndices(iw));
}
};
threads[it] = joinable_thread(f);
}
}
};
template <typename F, typename... Xs>
CK_TILE_HOST auto make_ParallelTensorFunctor(F f, Xs... xs)
{
return ParallelTensorFunctor<F, Xs...>(f, xs...);
}
template <typename T>
struct HostTensor
{
using Descriptor = HostTensorDescriptor;
using Data = std::vector<T>;
template <typename X>
HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(get_element_space_size())
{
}
template <typename X, typename Y>
HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
: mDesc(lens, strides), mData(get_element_space_size())
{
}
template <typename Lengths>
HostTensor(const Lengths& lens) : mDesc(lens), mData(get_element_space_size())
{
}
template <typename Lengths, typename Strides>
HostTensor(const Lengths& lens, const Strides& strides)
: mDesc(lens, strides), mData(get_element_space_size())
{
}
HostTensor(const Descriptor& desc) : mDesc(desc), mData(get_element_space_size()) {}
template <typename OutT>
HostTensor<OutT> CopyAsType() const
{
HostTensor<OutT> ret(mDesc);
std::transform(mData.cbegin(), mData.cend(), ret.mData.begin(), [](auto value) {
return ck_tile::type_convert<OutT>(value);
});
return ret;
}
HostTensor() = delete;
HostTensor(const HostTensor&) = default;
HostTensor(HostTensor&&) = default;
~HostTensor() = default;
HostTensor& operator=(const HostTensor&) = default;
HostTensor& operator=(HostTensor&&) = default;
template <typename FromT>
explicit HostTensor(const HostTensor<FromT>& other) : HostTensor(other.template CopyAsType<T>())
{
}
std::size_t get_length(std::size_t dim) const { return mDesc.get_length(dim); }
decltype(auto) get_lengths() const { return mDesc.get_lengths(); }
std::size_t get_stride(std::size_t dim) const { return mDesc.get_stride(dim); }
decltype(auto) get_strides() const { return mDesc.get_strides(); }
std::size_t get_num_of_dimension() const { return mDesc.get_num_of_dimension(); }
std::size_t get_element_size() const { return mDesc.get_element_size(); }
std::size_t get_element_space_size() const
{
constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
return mDesc.get_element_space_size() / PackedSize;
}
std::size_t get_element_space_size_in_bytes() const
{
return sizeof(T) * get_element_space_size();
}
void SetZero()
{
if constexpr(std::is_same_v<T, e8m0_t>)
std::fill(mData.begin(), mData.end(), e8m0_t{1.f});
else
std::fill(mData.begin(), mData.end(), 0);
}
template <typename F>
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
{
if(rank == mDesc.get_num_of_dimension())
{
f(*this, idx);
return;
}
// else
for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
{
idx[rank] = i;
ForEach_impl(std::forward<F>(f), idx, rank + 1);
}
}
template <typename F>
void ForEach(F&& f)
{
std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
ForEach_impl(std::forward<F>(f), idx, size_t(0));
}
template <typename F>
void ForEach_impl(const F&& f, std::vector<size_t>& idx, size_t rank) const
{
if(rank == mDesc.get_num_of_dimension())
{
f(*this, idx);
return;
}
// else
for(size_t i = 0; i < mDesc.get_lengths()[rank]; i++)
{
idx[rank] = i;
ForEach_impl(std::forward<const F>(f), idx, rank + 1);
}
}
template <typename F>
void ForEach(const F&& f) const
{
std::vector<size_t> idx(mDesc.get_num_of_dimension(), 0);
ForEach_impl(std::forward<const F>(f), idx, size_t(0));
}
template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = 1)
{
switch(mDesc.get_num_of_dimension())
{
case 1: {
auto f = [&](auto i) { (*this)(i) = g(i); };
make_ParallelTensorFunctor(f, mDesc.get_lengths()[0])(num_thread);
break;
}
case 2: {
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
make_ParallelTensorFunctor(f, mDesc.get_lengths()[0], mDesc.get_lengths()[1])(
num_thread);
break;
}
case 3: {
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2])(num_thread);
break;
}
case 4: {
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
};
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2],
mDesc.get_lengths()[3])(num_thread);
break;
}
case 5: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
};
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2],
mDesc.get_lengths()[3],
mDesc.get_lengths()[4])(num_thread);
break;
}
case 6: {
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) {
(*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5);
};
make_ParallelTensorFunctor(f,
mDesc.get_lengths()[0],
mDesc.get_lengths()[1],
mDesc.get_lengths()[2],
mDesc.get_lengths()[3],
mDesc.get_lengths()[4],
mDesc.get_lengths()[5])(num_thread);
break;
}
default: throw std::runtime_error("unsupported dimension");
}
}
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
return mDesc.GetOffsetFromMultiIndex(is...) / PackedSize;
}
template <typename... Is>
T& operator()(Is... is)
{
return mData[GetOffsetFromMultiIndex(is...)];
}
template <typename... Is>
const T& operator()(Is... is) const
{
return mData[GetOffsetFromMultiIndex(is...)];
}
T& operator()(const std::vector<std::size_t>& idx)
{
return mData[GetOffsetFromMultiIndex(idx)];
}
const T& operator()(const std::vector<std::size_t>& idx) const
{
return mData[GetOffsetFromMultiIndex(idx)];
}
HostTensor<T> transpose(std::vector<size_t> axes = {}) const
{
if(axes.empty())
{
axes.resize(this->get_num_of_dimension());
std::iota(axes.rbegin(), axes.rend(), 0);
}
if(axes.size() != mDesc.get_num_of_dimension())
{
throw std::runtime_error(
"HostTensor::transpose(): size of axes must match tensor dimension");
}
std::vector<size_t> tlengths, tstrides;
for(const auto& axis : axes)
{
tlengths.push_back(get_lengths()[axis]);
tstrides.push_back(get_strides()[axis]);
}
HostTensor<T> ret(*this);
ret.mDesc = HostTensorDescriptor(tlengths, tstrides);
return ret;
}
HostTensor<T> transpose(std::vector<size_t> axes = {})
{
return const_cast<HostTensor<T> const*>(this)->transpose(axes);
}
typename Data::iterator begin() { return mData.begin(); }
typename Data::iterator end() { return mData.end(); }
typename Data::pointer data() { return mData.data(); }
typename Data::const_iterator begin() const { return mData.begin(); }
typename Data::const_iterator end() const { return mData.end(); }
typename Data::const_pointer data() const { return mData.data(); }
typename Data::size_type size() const { return mData.size(); }
T max() const { return *std::max_element(mData.begin(), mData.end()); }
// return a slice of this tensor
// for simplicity we just copy the data and return a new tensor
auto slice(std::vector<size_t> s_begin, std::vector<size_t> s_end) const
{
assert(s_begin.size() == s_end.size());
assert(s_begin.size() == get_num_of_dimension());
std::vector<size_t> s_len(s_begin.size());
std::transform(
s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus<size_t>{});
HostTensor<T> sliced_tensor(s_len);
sliced_tensor.ForEach([&](auto& self, auto idx) {
std::vector<size_t> src_idx(idx.size());
std::transform(
idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus<size_t>{});
self(idx) = operator()(src_idx);
});
return sliced_tensor;
}
template <typename U = T>
auto AsSpan() const
{
constexpr std::size_t FromSize = sizeof(T);
constexpr std::size_t ToSize = sizeof(U);
using Element = std::add_const_t<std::remove_reference_t<U>>;
return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
size() * FromSize / ToSize};
}
template <typename U = T>
auto AsSpan()
{
constexpr std::size_t FromSize = sizeof(T);
constexpr std::size_t ToSize = sizeof(U);
using Element = std::remove_reference_t<U>;
return ck_tile::span<Element>{reinterpret_cast<Element*>(data()),
size() * FromSize / ToSize};
}
/**
* @brief Print only the first N elements of the tensor
*
* @param os Output stream to write to
* @param n Number of elements to print (default: 5)
* @return std::ostream& Reference to the output stream
*/
std::ostream& print_first_n(std::ostream& os, std::size_t n = 5) const
{
os << mDesc;
os << "[";
for(typename Data::size_type idx = 0; idx < std::min(n, mData.size()); ++idx)
{
if(0 < idx)
{
os << ", ";
}
if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t> ||
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>)
{
os << type_convert<float>(mData[idx]);
}
else if constexpr(std::is_same_v<T, ck_tile::pk_int4_t>)
{
auto unpacked = pk_int4_t_to_int8x2_t(mData[idx]);
os << "pk(" << static_cast<int>(unpacked[0]) << ", "
<< static_cast<int>(unpacked[1]) << ")";
}
else if constexpr(std::is_same_v<T, int8_t>)
{
os << static_cast<int>(mData[idx]);
}
else
{
os << mData[idx];
}
}
if(mData.size() > n)
{
os << ", ...";
}
os << "]";
return os;
}
friend std::ostream& operator<<(std::ostream& os, const HostTensor<T>& t)
{
os << t.mDesc;
os << "[";
for(typename Data::size_type idx = 0; idx < t.mData.size(); ++idx)
{
if(0 < idx)
{
os << ", ";
}
if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t> ||
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>)
{
os << type_convert<float>(t.mData[idx]) << " #### ";
}
else if constexpr(std::is_same_v<T, ck_tile::pk_int4_t>)
{
auto unpacked = pk_int4_t_to_int8x2_t(t.mData[idx]);
os << "pk(" << static_cast<int>(unpacked[0]) << ", "
<< static_cast<int>(unpacked[1]) << ") #### ";
}
else
{
os << t.mData[idx];
}
}
os << "]";
return os;
}
// read data from a file, as dtype
// the file could be dumped from torch as follows (the target tensor is t here)
// numpy.savetxt("f.txt", t.view(-1).numpy())
// numpy.savetxt("f.txt", t.cpu().view(-1).numpy()) # from cuda to cpu to save
// numpy.savetxt("f.txt", t.cpu().view(-1).numpy(), fmt="%d") # save as int
// will output f.txt, each line is a value
// dtype=float or int, internally will cast to real type
void loadtxt(std::string file_name, std::string dtype = "float")
{
std::ifstream file(file_name);
if(file.is_open())
{
std::string line;
index_t cnt = 0;
while(std::getline(file, line))
{
if(cnt >= static_cast<index_t>(mData.size()))
{
throw std::runtime_error(std::string("data read from file:") + file_name +
" is too big");
}
if(dtype == "float")
{
mData[cnt] = type_convert<T>(std::stof(line));
}
else if(dtype == "int" || dtype == "int32")
{
mData[cnt] = type_convert<T>(std::stoi(line));
}
cnt++;
}
file.close();
if(cnt < static_cast<index_t>(mData.size()))
{
std::cerr << "Warning! reading from file:" << file_name
<< ", does not match the size of this tensor" << std::endl;
}
}
else
{
// Throw if the file cannot be opened.
throw std::runtime_error(std::string("unable to open file:") + file_name);
}
}
// can save to a txt file and read from torch as:
// torch.from_numpy(np.loadtxt('f.txt', dtype=np.int32/np.float32...)).view([...]).contiguous()
void savetxt(std::string file_name, std::string dtype = "float")
{
std::ofstream file(file_name);
if(file.is_open())
{
for(auto& itm : mData)
{
if(dtype == "float")
file << type_convert<float>(itm) << std::endl;
else if(dtype == "int")
file << type_convert<int>(itm) << std::endl;
else if(dtype == "int8_t")
file << static_cast<int>(type_convert<ck_tile::int8_t>(itm)) << std::endl;
else
// TODO: operator<< is not implemented for all custom data
// types; fall back to float here to avoid a compile error
file << type_convert<float>(itm) << std::endl;
}
file.close();
}
else
{
// Throw if the file cannot be opened.
throw std::runtime_error(std::string("unable to open file:") + file_name);
}
}
Descriptor mDesc;
Data mData;
};
/**
* @brief Creates a host tensor descriptor with specified dimensions and layout
*
* Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor
* layout is row-major or column-major. This is determined via the compile-time template
* parameter `is_row_major`.
*
* @tparam is_row_major Compile-time flag indicating if the layout is row-major (true) or
* column-major (false)
*
* @param row Number of rows in the tensor
* @param col Number of columns in the tensor
* @param stride Stride between adjacent rows (for row-major) or columns (for column-major)
*
* @return HostTensorDescriptor with shape {row, col} and strides:
* - For row-major: {stride, 1}
* - For column-major: {1, stride}
*/
template <bool is_row_major>
auto host_tensor_descriptor(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
using namespace ck_tile::literals;
if constexpr(is_row_major)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
}
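// Illustrative example: for a 4x8 matrix,
//   host_tensor_descriptor(4, 8, 8, bool_constant<true>{})  // row-major, strides {8, 1}
//   host_tensor_descriptor(4, 8, 4, bool_constant<false>{}) // column-major, strides {1, 4}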
template <bool is_row_major>
auto get_default_stride(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
if(stride == 0)
{
if constexpr(is_row_major)
{
return col;
}
else
{
return row;
}
}
else
return stride;
}
} // namespace ck_tile
#pragma clang diagnostic pop

View File

@@ -0,0 +1,76 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#ifdef __linux__
#include <sched.h>
#endif
#include <thread>
#include <utility>
namespace ck_tile {
struct joinable_thread : std::thread
{
template <typename... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
{
}
joinable_thread(joinable_thread&&) = default;
joinable_thread& operator=(joinable_thread&&) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
inline unsigned int get_available_cpu_cores()
{
#if defined(__linux__)
cpu_set_t cpu_set;
if(sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0)
{
unsigned int cpu_count = CPU_COUNT(&cpu_set);
if(cpu_count > 0)
return cpu_count;
}
#endif
// Fallback if sched_getaffinity unavailable or fails
return std::thread::hardware_concurrency();
}
class cpu_core_guard
{
#if defined(__linux__)
cpu_set_t original_cpu_set_;
public:
cpu_core_guard(unsigned int num_cores) : original_cpu_set_()
{
// save original cpu set
sched_getaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);
// set new cpu set
cpu_set_t new_cpu_set;
CPU_ZERO(&new_cpu_set);
for(unsigned int i = 0; i < num_cores; ++i)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
CPU_SET(i, &new_cpu_set); // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
sched_setaffinity(0, sizeof(cpu_set_t), &new_cpu_set);
}
~cpu_core_guard()
{
// restore original cpu set
sched_setaffinity(0, sizeof(cpu_set_t), &original_cpu_set_);
}
#endif
};
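// Illustrative usage sketch (Linux only; on other platforms the guard is an empty class):
//   {
//       cpu_core_guard guard(4);   // temporarily pin the process to cores 0..3
//       run_cpu_reference();       // hypothetical CPU-bound work
//   }                              // original affinity restored when the guard is destroyed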
} // namespace ck_tile

View File

@@ -0,0 +1,305 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <numeric>
#include <functional>
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"
#include "ck_tile/host/flush_icache.hpp"
#include "ck_tile/host/rotating_buffers.hpp"
#include <cstddef>
#include <hip/hip_runtime.h>
namespace ck_tile {
template <typename T, typename = void>
inline constexpr bool kattr_no_packed_fp32_ops_v = false;
template <typename T>
inline constexpr bool
kattr_no_packed_fp32_ops_v<T, std::void_t<decltype(T::kattr_no_packed_fp32_ops)>> =
T::kattr_no_packed_fp32_ops;
template <bool no_packed_fp32_ops>
struct kernel_attr
{
// The kernel function attribute "no-packed-fp32-ops": Disable the use of packed FP32
// instructions so that they can be co-executed with matrix operations
static constexpr bool kattr_no_packed_fp32_ops = no_packed_fp32_ops;
};
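// Illustrative sketch (the kernel type and its body below are hypothetical): a kernel that
// should be compiled with packed FP32 ops disabled can expose the attribute by inheriting from
// kernel_attr<true>, or kernel_attr<true> can be passed as the Attr template argument directly.
//   struct my_kernel : kernel_attr<true>
//   {
//       CK_TILE_HOST_DEVICE void operator()(float* /*p*/) const {}
//   };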
#if CK_TILE_USE_LAUNCH_BOUNDS
#define KENTRY_LAUNCH_BOUNDS __launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
#else
#define KENTRY_LAUNCH_BOUNDS
#endif
#if defined(__HIP_DEVICE_COMPILE__)
#define KENTRY_BODY Kernel{}(args...)
#define KENTRY_ATTR_NO_PACKED_FP32_OPS __attribute__((target("no-packed-fp32-ops")))
#else
#define KENTRY_BODY (..., (ignore = args, 0))
#define KENTRY_ATTR_NO_PACKED_FP32_OPS
#endif
template <int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS __global__ void kentry(Args... args)
{
KENTRY_BODY;
}
template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS __global__ //
std::enable_if_t<!kattr_no_packed_fp32_ops_v<Attr>>
kentry(Args... args)
{
KENTRY_BODY;
}
template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
KENTRY_LAUNCH_BOUNDS KENTRY_ATTR_NO_PACKED_FP32_OPS __global__ //
std::enable_if_t<kattr_no_packed_fp32_ops_v<Attr>>
kentry(Args... args)
{
KENTRY_BODY;
}
#undef KENTRY_LAUNCH_BOUNDS
#undef KENTRY_BODY
#undef KENTRY_ATTR_NO_PACKED_FP32_OPS
//
// return an anonymous functor (lambda) to be called later
// the KernelImpl should be a class without non-static data members, or in other words,
// one that can be instantiated with "KernelImpl{}"
//
// the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
//
// Attr can be used to support linking multiple object files that have the same kernel compiled for
// different architectures. In this case each object file has to use a different tag (gfx9_t,
// gfx12_t etc.), so the kernel will have different symbols for each architecture. It can also be
// used to pass some compile-time attributes to the kernel.
template <int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU,
typename Attr = void,
typename KernelImpl,
typename... Args>
CK_TILE_HOST auto
make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
const auto kernel = []() {
if constexpr(std::is_void_v<Attr>)
return kentry<MinBlockPerCu, KernelImpl, Args...>;
else
return kentry<Attr, MinBlockPerCu, KernelImpl, Args...>;
}();
return [=](const stream_config& s) {
kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
};
}
template <typename... Callables>
CK_TILE_HOST void launch_and_check(const stream_config& sc, Callables&&... callables)
{
// abort the sequence in case of intermediate error
if(!((static_cast<void>(callables(sc)), hipPeekAtLastError() == hipSuccess) && ...))
{
HIP_CHECK_ERROR(hipGetLastError());
}
}
// Measure the preprocess time during the cold iterations
template <typename TimerType, typename PreprocessFunc>
CK_TILE_HOST double
preprocess_profiling_impl(TimerType timer, const stream_config& s, PreprocessFunc preprocess)
{
timer.start(s.stream_id_);
for(int i = 0; i < s.nrepeat_; i++)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
}
timer.stop(s.stream_id_);
return timer.duration() / s.nrepeat_;
}
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_flush_cache_impl(TimerType timer,
const stream_config& s,
CallablesFunc&& callables_func,
PreprocessFunc preprocess = nullptr)
{
auto run_flush_cache = [&]() { ck_tile::flush_icache(); };
// Warm up
for(int i = 0; i < s.cold_niters_; i++)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
}
// Main timing loop
int i = 0;
timer.start(s.stream_id_);
while(i < s.nrepeat_)
{
run_flush_cache();
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
i++;
}
timer.stop(s.stream_id_);
// Flush cache timing loop
auto flush_cache_time = preprocess_profiling_impl(gpu_timer{}, s, run_flush_cache);
if(i == 0)
{
return 0.;
}
// Exclude flush cache from result
return (timer.duration() / s.nrepeat_) - flush_cache_time;
}
template <typename TimerType, typename CallablesFunc, typename PreprocessFunc = std::nullptr_t>
CK_TILE_HOST double timing_loop_impl(TimerType timer,
const stream_config& s,
CallablesFunc&& callables_func,
PreprocessFunc preprocess = nullptr)
{
for(int i = 0; i < s.cold_niters_; i++)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
}
int i = 0;
timer.start(s.stream_id_);
while(i < s.nrepeat_)
{
if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
{
preprocess();
}
callables_func();
i++;
}
timer.stop(s.stream_id_);
if(i == 0)
return 0.;
return timer.duration() / s.nrepeat_;
}
// clang-format off
/*
* launch_kernel()
*
* this is the function to launch arbitrary number of kernels with optional timer(selected by stream_config)
* the callables should have signature as "operator()(const stream_config& s){ ... }" to call
*
* the simplest way is to pass in a lambda function, with "[=](const stream_config& s){ call_your_kernel_here() }"
* as signature, for the callable (pay attention to the capture list)
*
* e.g.
* ck_tile::launch_kernel(s,
* [=](const stream_config& s){ hipMemset(ptr, 0, size) },
* [=](const stream_config& s){ some_kernel<<<grids, blocks>>>(arg); }
* );
*
* if you use a ck_tile kernel, or something similar in style (a structure with "static __device__ operator()(...){}"),
* you can pass your kernel to ck_tile::make_kernel(), which will create an anonymous functor for you,
* then pass it to ck_tile::launch_kernel()
*
* e.g.
* ck_tile::launch_kernel(s,
* ck_tile::make_kernel<T0, B0>(kernel_0{}, grids0, blocks0, 0, kargs0),
* ck_tile::make_kernel<T0, B1>(kernel_1{}, grids1, blocks1, 0, kargs1),
* ...);
**/
// clang-format on
template <typename... Callables>
CK_TILE_HOST float launch_kernel(const stream_config& s, Callables&&... callables)
{
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
if(!s.time_kernel_)
{
launch_and_check(s, std::forward<Callables>(callables)...);
return 0;
}
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
if(s.is_gpu_timer_)
{
return timing_loop_impl(gpu_timer{}, s, callables_func);
}
else
{
return timing_loop_impl(cpu_timer{}, s, callables_func);
}
}
template <typename PreprocessFunc, typename... Callables>
CK_TILE_HOST float
launch_kernel_time_mask(const stream_config& s, PreprocessFunc preprocess, Callables&&... callables)
{
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
if(!s.time_kernel_)
{
preprocess();
launch_and_check(s, std::forward<Callables>(callables)...);
return 0;
}
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
if(s.is_gpu_timer_)
{
return timing_loop_impl(gpu_timer{}, s, callables_func, preprocess);
}
else
{
return timing_loop_impl(cpu_timer{}, s, callables_func, preprocess);
}
}
template <typename PreprocessFunc, typename... Callables>
CK_TILE_HOST float launch_kernel_time_mask_flush_cache(const stream_config& s,
PreprocessFunc preprocess,
Callables&&... callables)
{
static_assert(sizeof...(callables) > 0, "At least one callable is required!");
if(!s.time_kernel_)
{
preprocess();
launch_and_check(s, std::forward<Callables>(callables)...);
return 0;
}
auto callables_func = [&]() { launch_and_check(s, std::forward<Callables>(callables)...); };
if(s.is_gpu_timer_)
{
return timing_loop_flush_cache_impl(gpu_timer{}, s, callables_func, preprocess);
}
else
{
return timing_loop_flush_cache_impl(cpu_timer{}, s, callables_func, preprocess);
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,77 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/utility/bit_cast.hpp"
namespace ck_tile {
/**
* @brief Permute packed int4 vectors for device implementation compatibility
*
* This function transforms 4 pk_int4_t values from original layout to hardware-optimized layout:
* - Original layout (4 pk_int4_t): 0x76543210
* - Transformed layout (4 pk_int4_t): 0x75316420
*
* Each pk_int4_t contains two 4-bit values packed in the high and low nibbles of an int8_t
*
* Example:
* - Input: 0x76, 0x54, 0x32, 0x10
* - Output: 0x75, 0x31, 0x64, 0x20
*
* @note Input tensor length must be a multiple of 4
*
* This transformation is required before transferring B matrix data (of type pk_int4_t) to device.
* The device conversion functions (i4_to_half4, i4_to_bhalf4, amd_assembly_i4_to_fp8x8,
* amd_assembly_i4_to_bf8x8) require data in 0x75316420 order to correctly convert pk_int4_t to
* other numeric types.
*/
template <typename Tensor>
void permute_vectors_i4x4_b(Tensor& tensor)
{
auto tensor_row_buf = tensor.data();
for(size_t idx = 0; idx < tensor.size(); idx += 4)
{
int8_t input[8];
for(int k = 0; k < 4; k++)
{
int8_t i4x2 = bit_cast<int8_t>(tensor_row_buf[idx + k]);
input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
}
// permute 0x76543210 => 0x75316420
{
int8_t hi = input[2];
int8_t lo = input[0];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 0] = bit_cast<pk_int4_t>(i4x2);
}
{
int8_t hi = input[6];
int8_t lo = input[4];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 1] = bit_cast<pk_int4_t>(i4x2);
}
{
int8_t hi = input[3];
int8_t lo = input[1];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 2] = bit_cast<pk_int4_t>(i4x2);
}
{
int8_t hi = input[7];
int8_t lo = input[5];
int8_t i4x2 = (hi << 4) | lo;
tensor_row_buf[idx + 3] = bit_cast<pk_int4_t>(i4x2);
}
}
}
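// Illustrative usage sketch (assumes a host tensor type such as HostTensor<pk_int4_t> from
// ck_tile/host/host_tensor.hpp and a fill functor are available; the tensor size must be a
// multiple of 4):
//   HostTensor<pk_int4_t> b({128, 64});
//   FillUniformDistribution<pk_int4_t>{}(b);
//   permute_vectors_i4x4_b(b); // reorder nibbles before copying B to the device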
} // namespace ck_tile

View File

@@ -0,0 +1,69 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <iterator>
#include <type_traits>
#include <utility>
// this ranges implementation is not intended to be used directly by users
// TODO: do we need this?
namespace ck_tile {
template <typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type;
template <typename T>
using iter_reference_t = decltype(*std::declval<T&>());
template <typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type;
namespace ranges {
template <typename R>
using iterator_t = decltype(std::begin(std::declval<R&>()));
template <typename R>
using sentinel_t = decltype(std::end(std::declval<R&>()));
template <typename R>
using range_size_t = decltype(std::size(std::declval<R&>()));
template <typename R>
using range_difference_t = ck_tile::iter_difference_t<ranges::iterator_t<R>>;
template <typename R>
using range_value_t = iter_value_t<ranges::iterator_t<R>>;
template <typename R>
using range_reference_t = iter_reference_t<ranges::iterator_t<R>>;
template <typename T, typename = void>
struct is_range : std::false_type
{
};
template <typename T>
struct is_range<
T,
std::void_t<decltype(std::begin(std::declval<T&>())), decltype(std::end(std::declval<T&>()))>>
: std::true_type
{
};
template <typename T>
inline constexpr bool is_range_v = is_range<T>::value;
template <typename T, typename = void>
struct is_sized_range : std::false_type
{
};
template <typename T>
struct is_sized_range<T, std::void_t<decltype(std::size(std::declval<T&>()))>>
: std::bool_constant<is_range_v<T>>
{
};
} // namespace ranges
} // namespace ck_tile

View File

@@ -0,0 +1,275 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
// Helper to apply elementwise operation with variable number of D tensors
template <typename EDataType, typename AccDataType, typename CDEElementWise>
struct ApplyCDEElementWise
{
template <typename... DValues>
CK_TILE_HOST_DEVICE static void apply(EDataType& result,
AccDataType sum,
const CDEElementWise& cde_elementwise,
DValues... d_vals)
{
if constexpr(sizeof...(DValues) == 0)
{
result = static_cast<EDataType>(sum);
}
else
{
cde_elementwise(
result, ck_tile::type_convert<float>(sum), ck_tile::type_convert<float>(d_vals)...);
}
}
};
// Helper to extract D values at a given offset using index sequence
template <typename DDataType,
ck_tile::index_t NumDTensor,
typename Indices = std::make_index_sequence<NumDTensor>>
struct ExtractDValues;
template <typename DDataType, ck_tile::index_t NumDTensor, std::size_t... Is>
struct ExtractDValues<DDataType, NumDTensor, std::index_sequence<Is...>>
{
template <typename EDataType, typename AccDataType, typename CDEElementWise>
CK_TILE_HOST static void
apply_at_offsets(EDataType& result,
AccDataType sum,
const CDEElementWise& cde_elementwise,
const std::array<ck_tile::HostTensor<DDataType>, NumDTensor>& ds_tensors,
const std::array<std::size_t, NumDTensor>& d_offsets)
{
ApplyCDEElementWise<EDataType, AccDataType, CDEElementWise>::apply(
result, sum, cde_elementwise, ds_tensors[Is].mData[d_offsets[Is]]...);
}
};
template <typename ADataType,
typename BDataType,
typename DDataType,
typename EDataType,
typename AccDataType,
typename CDEElementWise,
ck_tile::index_t NumDTensor>
void compute_reference_batched_contraction(
const ck_tile::HostTensor<ADataType>& a_full_dims,
const ck_tile::HostTensor<BDataType>& b_full_dims,
const std::array<ck_tile::HostTensor<DDataType>, NumDTensor>& ds_full_dims_host,
ck_tile::HostTensor<EDataType>& e_full_dims_host_ref,
ck_tile::index_t G_total,
ck_tile::index_t M_total,
ck_tile::index_t N_total,
ck_tile::index_t K_total,
const CDEElementWise& cde_elementwise,
const std::vector<ck_tile::index_t>& G_dims,
const std::vector<ck_tile::index_t>& M_dims,
const std::vector<ck_tile::index_t>& N_dims,
const std::vector<ck_tile::index_t>& K_dims)
{
std::cout << "Calculating reference using stride-aware indexing with parallel processing..."
<< std::endl;
// Extract stride information from tensor descriptors
const auto a_strides = a_full_dims.get_strides();
const auto b_strides = b_full_dims.get_strides();
const auto e_strides = e_full_dims_host_ref.get_strides();
// Extract D tensor strides
std::array<std::vector<std::size_t>, NumDTensor> ds_strides;
for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
{
ds_strides[d] = ds_full_dims_host[d].get_strides();
}
const ck_tile::index_t num_g_dims = G_dims.size();
const ck_tile::index_t num_m_dims = M_dims.size();
const ck_tile::index_t num_n_dims = N_dims.size();
const ck_tile::index_t num_k_dims = K_dims.size();
// Helper lambda to compute linear index from flat indices using strides
auto compute_a_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t m_flat,
ck_tile::index_t k_flat) -> std::size_t {
std::size_t offset = 0;
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * a_strides[i];
temp /= G_dims[i];
}
// Decode M dimensions
temp = m_flat;
for(int i = num_m_dims - 1; i >= 0; --i)
{
offset += (temp % M_dims[i]) * a_strides[num_g_dims + i];
temp /= M_dims[i];
}
// Decode K dimensions
temp = k_flat;
for(int i = num_k_dims - 1; i >= 0; --i)
{
offset += (temp % K_dims[i]) * a_strides[num_g_dims + num_m_dims + i];
temp /= K_dims[i];
}
return offset;
};
auto compute_b_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t n_flat,
ck_tile::index_t k_flat) -> std::size_t {
std::size_t offset = 0;
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * b_strides[i];
temp /= G_dims[i];
}
// Decode N dimensions
temp = n_flat;
for(int i = num_n_dims - 1; i >= 0; --i)
{
offset += (temp % N_dims[i]) * b_strides[num_g_dims + i];
temp /= N_dims[i];
}
// Decode K dimensions
temp = k_flat;
for(int i = num_k_dims - 1; i >= 0; --i)
{
offset += (temp % K_dims[i]) * b_strides[num_g_dims + num_n_dims + i];
temp /= K_dims[i];
}
return offset;
};
auto compute_e_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t m_flat,
ck_tile::index_t n_flat) -> std::size_t {
std::size_t offset = 0;
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * e_strides[i];
temp /= G_dims[i];
}
// Decode M dimensions
temp = m_flat;
for(int i = num_m_dims - 1; i >= 0; --i)
{
offset += (temp % M_dims[i]) * e_strides[num_g_dims + i];
temp /= M_dims[i];
}
// Decode N dimensions
temp = n_flat;
for(int i = num_n_dims - 1; i >= 0; --i)
{
offset += (temp % N_dims[i]) * e_strides[num_g_dims + num_m_dims + i];
temp /= N_dims[i];
}
return offset;
};
// Helper to compute D tensor offset (D tensors have same shape as E: [G, M, N])
auto compute_d_offset = [&](ck_tile::index_t g_flat,
ck_tile::index_t m_flat,
ck_tile::index_t n_flat,
ck_tile::index_t d_idx) -> std::size_t {
std::size_t offset = 0;
const auto& d_strides = ds_strides[d_idx];
// Decode G dimensions
ck_tile::index_t temp = g_flat;
for(int i = num_g_dims - 1; i >= 0; --i)
{
offset += (temp % G_dims[i]) * d_strides[i];
temp /= G_dims[i];
}
// Decode M dimensions
temp = m_flat;
for(int i = num_m_dims - 1; i >= 0; --i)
{
offset += (temp % M_dims[i]) * d_strides[num_g_dims + i];
temp /= M_dims[i];
}
// Decode N dimensions
temp = n_flat;
for(int i = num_n_dims - 1; i >= 0; --i)
{
offset += (temp % N_dims[i]) * d_strides[num_g_dims + num_m_dims + i];
temp /= N_dims[i];
}
return offset;
};
// Parallel computation over G and M dimensions
auto f_gm = [&](auto g_flat, auto m_flat) {
for(ck_tile::index_t n_flat = 0; n_flat < N_total; ++n_flat)
{
AccDataType sum = 0;
// Compute dot product over K dimension using stride-aware indexing
for(ck_tile::index_t k_flat = 0; k_flat < K_total; ++k_flat)
{
const std::size_t a_offset = compute_a_offset(g_flat, m_flat, k_flat);
const std::size_t b_offset = compute_b_offset(g_flat, n_flat, k_flat);
auto a_val = a_full_dims.mData[a_offset];
auto b_val = b_full_dims.mData[b_offset];
sum += static_cast<AccDataType>(a_val) * static_cast<AccDataType>(b_val);
}
// Compute output offset using strides
const std::size_t e_offset = compute_e_offset(g_flat, m_flat, n_flat);
// Compute individual D tensor offsets using their respective strides
std::array<std::size_t, NumDTensor> d_offsets;
for(ck_tile::index_t d = 0; d < NumDTensor; ++d)
{
d_offsets[d] = compute_d_offset(g_flat, m_flat, n_flat, d);
}
// Apply elementwise operation with D tensors using compile-time dispatch
EDataType result = static_cast<EDataType>(sum);
ExtractDValues<DDataType, NumDTensor>::apply_at_offsets(
result, sum, cde_elementwise, ds_full_dims_host, d_offsets);
// Store result using stride-aware indexing
e_full_dims_host_ref.mData[e_offset] = static_cast<EDataType>(result);
}
};
// Execute parallel computation using hardware concurrency
// Parallelize over G_total and M_total dimensions for optimal CPU utilization
make_ParallelTensorFunctor(f_gm, G_total, M_total)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename DataType, typename RandValOutputDataType>
CK_TILE_HOST void reference_batched_dropout(HostTensor<DataType>& in_out_b_m_n,
const HostTensor<RandValOutputDataType>& randval_b_m_n,
const uint8_t& p_undrop_in_uint8_t,
const float scale)
{
const int N = in_out_b_m_n.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
for(int n = 0; n < N; ++n)
{
float tmp = ck_tile::type_convert<float>(in_out_b_m_n(batch, m, n)) * scale;
in_out_b_m_n(batch, m, n) = randval_b_m_n(batch, m, n) <= p_undrop_in_uint8_t
? ck_tile::type_convert<DataType>(tmp)
: DataType(0);
}
};
make_ParallelTensorFunctor(
f, randval_b_m_n.mDesc.get_lengths()[0], randval_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,74 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename RandValOutputDataType>
CK_TILE_HOST void
reference_batched_dropout_randval(HostTensor<RandValOutputDataType>& randval_b_m_n,
index_t batch,
uint64_t drop_seed,
uint64_t drop_offset)
{
const index_t nhead = randval_b_m_n.mDesc.get_lengths()[0];
const index_t real_seqlen_q = randval_b_m_n.mDesc.get_lengths()[1];
const index_t real_seqlen_k = randval_b_m_n.mDesc.get_lengths()[2];
static_assert(std::is_same_v<RandValOutputDataType, uint8_t>);
// BlockDropout generates random numbers by 32x32 tiles. Even when warp gemm 16x16 is used, the
// order of values in the bigger 32x32 tile must be the same because fwd and bwd may use
// different warp gemms (16x16 or 32x32).
// To compute 32x32 tiles, WarpGemmMfmaF16F16F32M32N32K16SwizzleA is used. It is
// WarpGemmAttributeMfmaImplF16F16F32M32N32K8 with SFactor = 2 (swizzling factor).
// Matrix element to register mapping for WarpGemmAttributeMfmaImplF16F16F32M32N32K8:
// C i: (8 * floor(GPR_num / 4) % 32) + 4 * floor(lane / 32) + (GPR_num % 4)
// C j: (lane % 32)
// With SFactor = 2 it becomes:
// C i: (16 * floor(GPR_num / 8) % 32) + 8 * floor(lane / 32) + (GPR_num % 8)
// C j: (lane % 32)
// See ck_tile/ops/fmha/block/block_dropout.hpp for more details.
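    // As an illustration of the SFactor = 2 mapping above: GPR_num = 9 and lane = 35 land at
    // i = 16 * (9 / 8) % 32 + 8 * (35 / 32) + 9 % 8 = 16 + 8 + 1 = 25 and j = 35 % 32 = 3,
    // i.e. element (25, 3) of the 32x32 tile; the loop below applies exactly this formula.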
    // The number of Philox 4x32 results required to fill a 32x32 tile of 8-bit values
constexpr index_t philox_per_tile = 64;
constexpr index_t warp_gemm_mn = 32;
const index_t rows = integer_divide_ceil(real_seqlen_q, warp_gemm_mn);
const index_t cols = integer_divide_ceil(real_seqlen_k, warp_gemm_mn);
auto f = [&](index_t i_h, index_t row, index_t col) {
uint2 rowcol = make_uint2(row, col);
for(index_t lane = 0; lane < philox_per_tile; lane++)
{
const uint64_t ph_head_offset = drop_offset + (batch * nhead + i_h) * philox_per_tile;
const index_t ph_offset = lane;
philox ph(drop_seed, ph_head_offset + ph_offset);
uint8_t random_uint8_t[16];
ph.get_random_16x8(random_uint8_t, reinterpret_cast<unsigned long long&>(rowcol));
for(auto r = 0; r < 16; r++)
{
index_t i = (16 * (r / 8) % 32) + 8 * (lane / 32) + (r % 8);
index_t j = (lane % 32);
index_t m = row * warp_gemm_mn + i;
index_t n = col * warp_gemm_mn + j;
if(m < real_seqlen_q && n < real_seqlen_k)
{
randval_b_m_n(i_h, m, n) = random_uint8_t[r];
}
}
}
};
make_ParallelTensorFunctor(f, nhead, rows, cols)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,64 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename BinaryElementOp = ck_tile::plus<AccDataType>>
CK_TILE_HOST void reference_batched_elementwise(const HostTensor<ADataType>& a_b_m_n,
const HostTensor<BDataType>& b_b_m_n,
HostTensor<CDataType>& c_b_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const BinaryElementOp& binary_element_op = {})
{
const ck_tile::index_t N = c_b_m_n.mDesc.get_lengths()[2];
const bool broadcast_a_dim_b = (a_b_m_n.get_lengths()[0] == 1);
const bool broadcast_a_dim_m = (a_b_m_n.get_lengths()[1] == 1);
const bool broadcast_a_dim_n = (a_b_m_n.get_lengths()[2] == 1);
const bool broadcast_b_dim_b = (b_b_m_n.get_lengths()[0] == 1);
const bool broadcast_b_dim_m = (b_b_m_n.get_lengths()[1] == 1);
const bool broadcast_b_dim_n = (b_b_m_n.get_lengths()[2] == 1);
auto f = [&](auto batch, auto m) {
for(ck_tile::index_t n = 0; n < N; ++n)
{
AccDataType v_a{};
{
ck_tile::index_t i_b = (broadcast_a_dim_b ? 0 : batch);
ck_tile::index_t i_m = (broadcast_a_dim_m ? 0 : m);
ck_tile::index_t i_n = (broadcast_a_dim_n ? 0 : n);
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_b_m_n(i_b, i_m, i_n)));
}
AccDataType v_b{};
{
ck_tile::index_t i_b = (broadcast_b_dim_b ? 0 : batch);
ck_tile::index_t i_m = (broadcast_b_dim_m ? 0 : m);
ck_tile::index_t i_n = (broadcast_b_dim_n ? 0 : n);
v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_b_m_n(i_b, i_m, i_n)));
}
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(binary_element_op(v_a, v_b));
}
};
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,90 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
const HostTensor<BDataType>& b_b_n_k,
HostTensor<CDataType>& c_b_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
const int N = b_b_n_k.mDesc.get_lengths()[1];
const int K = b_b_n_k.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
for(int n = 0; n < N; ++n)
{
AccDataType v_acc = 0;
for(int k = 0; k < K; ++k)
{
ADataType v_a = a_element_op(a_b_m_k(batch, m, k));
BDataType v_b = b_element_op(b_b_n_k(batch, n, k));
v_acc += ck_tile::type_convert<AccDataType>(v_a) *
ck_tile::type_convert<AccDataType>(v_b);
}
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
}
};
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename AElementOp = ck_tile::idx_identity,
typename BElementOp = ck_tile::idx_identity,
typename ACCElementOp = ck_tile::idx_identity>
CK_TILE_HOST void reference_batched_quant_gemm(const HostTensor<ADataType>& a_b_m_k,
const HostTensor<BDataType>& b_b_n_k,
HostTensor<CDataType>& c_b_m_n,
const AElementOp& a_element_op = {},
const BElementOp& b_element_op = {},
const ACCElementOp& acc_element_op = {})
{
const int N = b_b_n_k.mDesc.get_lengths()[1];
const int K = b_b_n_k.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
for(int n = 0; n < N; ++n)
{
AccDataType v_acc = 0;
for(int k = 0; k < K; ++k)
{
AccDataType v_a = ck_tile::type_convert<AccDataType>(
a_element_op(std::make_tuple(batch, m, k), a_b_m_k(batch, m, k)));
AccDataType v_b = ck_tile::type_convert<AccDataType>(
b_element_op(std::make_tuple(batch, n, k), b_b_n_k(batch, n, k)));
v_acc += v_a * v_b;
}
c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(
acc_element_op(std::make_tuple(batch, m, n), v_acc));
}
};
make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,32 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename CDataType, typename MaskingType>
CK_TILE_HOST void reference_batched_masking(HostTensor<CDataType>& c_b_m_n, const MaskingType& mask)
{
const int M = c_b_m_n.mDesc.get_lengths()[1];
const int N = c_b_m_n.mDesc.get_lengths()[2];
auto f = [&](auto batch) {
for(int n = 0; n < N; ++n)
{
for(int m = 0; m < M; ++m)
{
if(mask.IsOutOfSinkBound(m, n))
c_b_m_n(batch, m, n) = -ck_tile::numeric<CDataType>::infinity();
}
}
};
make_ParallelTensorFunctor(f,
c_b_m_n.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,61 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename InDataType,
typename ScaleDataType,
typename OutDataType,
typename ComputeDataType>
CK_TILE_HOST HostTensor<OutDataType>
reference_batched_mx_descale(const HostTensor<InDataType>& a_b_m_k,
const HostTensor<ScaleDataType>& scales_b_m_ks,
const std::size_t scale_granularity)
{
const std::size_t B = a_b_m_k.get_length(0);
const std::size_t M = a_b_m_k.get_length(1);
const std::size_t K = a_b_m_k.get_length(2);
HostTensor<ComputeDataType> a_b_m_k_scaled(a_b_m_k.get_lengths());
auto f = [&](auto batch) {
constexpr index_t packed_size = ck_tile::numeric_traits<InDataType>::PackedSize;
for(std::size_t m = 0; m < M; ++m)
{
for(std::size_t k = 0; k < K; k += packed_size)
{
const auto scale = ck_tile::type_convert<ComputeDataType>(
scales_b_m_ks(batch, m, k / scale_granularity));
if constexpr(std::is_same_v<InDataType, pk_fp4_t>)
{
auto a_f4x2 = a_b_m_k(batch, m, k);
auto a_f4_lo = ck_tile::type_convert<ComputeDataType>(
a_f4x2.template unpack<>(number<0>{}));
auto a_f4_hi = ck_tile::type_convert<ComputeDataType>(
a_f4x2.template unpack<>(number<1>{}));
a_b_m_k_scaled(batch, m, k) = a_f4_lo * scale;
a_b_m_k_scaled(batch, m, k + 1) = a_f4_hi * scale;
}
else
{
a_b_m_k_scaled(batch, m, k) =
ck_tile::type_convert<ComputeDataType>(a_b_m_k(batch, m, k)) * scale;
}
}
}
};
make_ParallelTensorFunctor(f, B)(std::thread::hardware_concurrency());
return a_b_m_k_scaled;
}
} // namespace ck_tile

View File

@@ -0,0 +1,73 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <cassert>
#include <thread>
namespace ck_tile {
template <typename DataType, typename ComputeDataType = float>
CK_TILE_HOST void reference_batched_rotary_position_embedding(const HostTensor<DataType>& input_bsd,
const HostTensor<DataType>& cos_sd,
const HostTensor<DataType>& sin_sd,
bool interleaved,
HostTensor<DataType>& output_bsd,
bool use_1_row_sin_cos = false)
{
assert(cos_sd.get_num_of_dimension() == 2 && sin_sd.get_num_of_dimension() == 2);
assert(cos_sd.get_length(0) == sin_sd.get_length(0) &&
cos_sd.get_length(1) == sin_sd.get_length(1));
const index_t rotary_dim = cos_sd.get_length(1) * 2;
assert(static_cast<std::size_t>(rotary_dim) <= input_bsd.get_length(2));
output_bsd.ForEach([&](auto& self, auto i) {
const index_t i_d = i[2];
if(rotary_dim <= i_d)
{
self(i) = input_bsd(i);
return;
}
assert(i_d < rotary_dim);
const index_t i_s = i[1];
const index_t i_s_cos_sin = (use_1_row_sin_cos ? 0 : i_s);
const ComputeDataType cos = type_convert<ComputeDataType>(
interleaved ? cos_sd(i_s_cos_sin, i_d / 2)
: cos_sd(i_s_cos_sin, i_d % cos_sd.get_length(1)));
const ComputeDataType sin = type_convert<ComputeDataType>(
interleaved ? sin_sd(i_s_cos_sin, i_d / 2)
: sin_sd(i_s_cos_sin, i_d % sin_sd.get_length(1)));
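        // rotate-half partner selection: in the interleaved layout element d pairs with its
        // neighbour (even d takes -x[d + 1], odd d takes x[d - 1]); in the non-interleaved layout
        // the first half pairs with -x[d + rotary_dim/2], the second half with x[d - rotary_dim/2].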
const ComputeDataType half_rotated_input = [&] {
const index_t i_b = i[0];
if(interleaved)
{
const bool is_even = (i_d % 2 == 0);
const index_t pos = i_d + (is_even ? 1 : -1);
const ComputeDataType sign = (is_even ? -1 : 1);
return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
}
else
{
const index_t half_rdim = (rotary_dim / 2);
const index_t pos = (i_d + half_rdim) % rotary_dim;
const ComputeDataType sign = (pos < half_rdim ? 1 : -1);
return sign * type_convert<ComputeDataType>(input_bsd(i_b, i_s, pos));
}
}();
ComputeDataType result =
type_convert<ComputeDataType>(input_bsd(i)) * cos + half_rotated_input * sin;
self(i) = type_convert<DataType>(result);
});
}
} // namespace ck_tile

View File

@@ -0,0 +1,71 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType,
typename CompDataType,
typename BDataType,
typename CompElementOp = ck_tile::identity>
CK_TILE_HOST void reference_batched_softmax(
const HostTensor<ADataType>& a_b_m_n,
HostTensor<BDataType>& b_b_m_n,
const CompElementOp& comp_element_op = {},
std::optional<std::reference_wrapper<HostTensor<CompDataType>>> lse_b_m = std::nullopt)
{
const int N = a_b_m_n.mDesc.get_lengths()[2];
auto f = [&](auto batch, auto m) {
CompDataType v_max = -ck_tile::numeric<CompDataType>::infinity();
// max
for(int n = 0; n < N; ++n)
{
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
v_max = v_max < v_a ? v_a : v_max;
}
CompDataType v_exp_sum = 0;
        // if every element in the row is -INF (fully masked), reset v_max to 0 so exp() below stays finite
if(std::isinf(v_max) && v_max < 0)
{
v_max = ck_tile::type_convert<CompDataType>(0.f);
}
// sum
for(int n = 0; n < N; ++n)
{
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
v_exp_sum += ck_tile::exp(v_a - v_max);
}
        // if the sum is zero (fully masked row) or nan/inf (some other computation error), skip the division
CompDataType inv_sum = (v_exp_sum == 0.f ? 1.f : 1.f / v_exp_sum);
// elementwise
for(int n = 0; n < N; ++n)
{
const CompDataType v_a = ck_tile::type_convert<CompDataType>(a_b_m_n(batch, m, n));
const CompDataType v_b = ck_tile::exp(v_a - v_max) * inv_sum;
b_b_m_n(batch, m, n) = ck_tile::type_convert<BDataType>(comp_element_op(v_b));
}
// lse
if(lse_b_m)
{
lse_b_m->get()(batch, m) = v_max + ck_tile::log(v_exp_sum);
}
};
make_ParallelTensorFunctor(f, b_b_m_n.mDesc.get_lengths()[0], b_b_m_n.mDesc.get_lengths()[1])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,59 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename Type>
CK_TILE_HOST void reference_batched_transpose(const HostTensor<Type>& x,
HostTensor<Type>& y,
std::string layout_in = "NCHW",
std::string layout_out = "NHWC")
{
const int N = x.mDesc.get_lengths()[0];
auto f = [&](auto batch) {
if(layout_in == "NCHW" && layout_out == "NHWC")
{
const int C = x.mDesc.get_lengths()[1];
const int H = x.mDesc.get_lengths()[2];
const int W = x.mDesc.get_lengths()[3];
for(int c = 0; c < C; ++c)
{
for(int h = 0; h < H; ++h)
{
for(int w = 0; w < W; ++w)
{
Type v_x = x(batch, c, h, w);
y(batch, h, w, c) = v_x;
}
}
}
}
else if(layout_in == "NHWC" && layout_out == "NCHW")
{
const int H = x.mDesc.get_lengths()[1];
const int W = x.mDesc.get_lengths()[2];
const int C = x.mDesc.get_lengths()[3];
for(int h = 0; h < H; ++h)
{
for(int w = 0; w < W; ++w)
{
for(int c = 0; c < C; ++c)
{
Type v_x = x(batch, h, w, c);
y(batch, c, h, w) = v_x;
}
}
}
}
};
make_ParallelTensorFunctor(f, N)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,156 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>
#include "ck_tile/core.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <typename AccT, typename T>
CK_TILE_HOST_DEVICE constexpr AccT to_acc(T value)
{
if constexpr(std::is_same_v<T, ck_tile::bf16_t>)
{
#if CK_TILE_USE_CUSTOM_DATA_TYPE
return static_cast<AccT>(value);
#else
return static_cast<AccT>(
ck_tile::bf16_to_float_raw(ck_tile::bit_cast<ck_tile::bf16_raw_t>(value)));
#endif
}
else
{
return static_cast<AccT>(value);
}
}
// Reference implementation: blocked attention (for sparse attention tests).
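// For each query row it gathers scores only from the K-blocks flagged in block_relation,
// runs a max-subtracted softmax over that subset, and accumulates the matching V rows;
// query rows whose blocks are all inactive are left untouched in the output.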
template <typename T, typename MaskT, typename AccT = float>
void reference_blocked_attention(
const HostTensor<T>& q, // [B, H, S_q, D]
const HostTensor<T>& k, // [B, H, S_k, D]
const HostTensor<T>& v, // [B, H, S_k, D_v]
const HostTensor<MaskT>& block_relation, // [B, H, Q_blocks, K_blocks]
HostTensor<T>& output, // [B, H, S_q, D_v]
index_t BLKQ,
index_t BLKK,
AccT scale)
{
auto q_lengths = q.get_lengths();
index_t batch = q_lengths[0];
index_t nhead = q_lengths[1];
index_t seqlen_q = q_lengths[2];
index_t hdim = q_lengths[3];
auto v_lengths = v.get_lengths();
index_t seqlen_k = v_lengths[2];
index_t hdim_v = v_lengths[3];
index_t num_q_blocks = (seqlen_q + BLKQ - 1) / BLKQ;
index_t num_k_blocks = (seqlen_k + BLKK - 1) / BLKK;
for(index_t b = 0; b < batch; ++b)
{
for(index_t h = 0; h < nhead; ++h)
{
for(index_t qb = 0; qb < num_q_blocks; ++qb)
{
index_t q_start = qb * BLKQ;
if(q_start >= seqlen_q)
{
continue;
}
index_t q_end = std::min<index_t>(q_start + BLKQ, seqlen_q);
std::vector<index_t> relevant_k_indices;
for(index_t kb = 0; kb < num_k_blocks; ++kb)
{
// Treat block_relation as boolean; >0.5 marks an active block.
if(static_cast<float>(block_relation(b, h, qb, kb)) > 0.5f)
{
relevant_k_indices.push_back(kb);
}
}
if(relevant_k_indices.empty())
{
continue;
}
for(index_t sq = q_start; sq < q_end; ++sq)
{
std::vector<AccT> scores;
AccT max_score = -std::numeric_limits<AccT>::infinity();
for(auto kb : relevant_k_indices)
{
index_t k_start = kb * BLKK;
if(k_start >= seqlen_k)
{
continue;
}
index_t k_end = std::min<index_t>(k_start + BLKK, seqlen_k);
for(index_t sk = k_start; sk < k_end; ++sk)
{
AccT score = 0.0f;
for(index_t d = 0; d < hdim; ++d)
{
score +=
to_acc<AccT>(q(b, h, sq, d)) * to_acc<AccT>(k(b, h, sk, d));
}
score = score * scale;
scores.push_back(score);
max_score = std::max(max_score, score);
}
}
AccT sum_exp = 0.0f;
for(auto& s : scores)
{
s = std::exp(s - max_score);
sum_exp += s;
}
for(auto& s : scores)
{
s /= sum_exp;
}
for(index_t dv = 0; dv < hdim_v; ++dv)
{
AccT out_val = 0.0f;
size_t score_idx = 0;
for(auto kb : relevant_k_indices)
{
index_t k_start = kb * BLKK;
if(k_start >= seqlen_k)
{
continue;
}
index_t k_end = std::min<index_t>(k_start + BLKK, seqlen_k);
for(index_t sk = k_start; sk < k_end; ++sk)
{
out_val += scores[score_idx] * to_acc<AccT>(v(b, h, sk, dv));
score_idx++;
}
}
output(b, h, sq, dv) = static_cast<T>(out_val);
}
}
}
}
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,47 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
CK_TILE_HOST void reference_unary_elementwise(const HostTensor<ADataType>& a,
HostTensor<BDataType>& b,
ElementOp element_op)
{
    // TODO: implement a GPU version of this reference function
auto f = [&](auto i) {
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
auto v_b = element_op(v_a);
b.mData[i] = ck_tile::type_convert<BDataType>(v_b);
};
make_ParallelTensorFunctor(f, b.get_element_space_size())(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename CDataType,
typename ComputeDataType,
typename ElementOp>
CK_TILE_HOST void reference_binary_elementwise(const HostTensor<ADataType>& a,
const HostTensor<BDataType>& b,
HostTensor<CDataType>& c,
ElementOp element_op)
{
    // TODO: implement a GPU version of this reference function
auto f = [&](auto i) {
auto v_a = type_convert<ComputeDataType>(a.mData[i]);
auto v_b = type_convert<ComputeDataType>(b.mData[i]);
auto v_c = element_op(v_a, v_b);
c.mData[i] = ck_tile::type_convert<CDataType>(v_c);
};
make_ParallelTensorFunctor(f, c.get_element_space_size())(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,205 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
// [indexing implementation-1]
// using M_a as constexpr block_size to partition all tokens into different slices
// each slice map to one expert, and one expert can have multiple slices
// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
// tok-0 tok-1 tok-2 tok-3 tok-4
// topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float
// number)
//
// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
// (only for reference) exp-0 exp-1 exp-2 exp-3 exp-4 exp-5
// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
//
// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated)
// * this could be larger than actual, since actual tokens are on GPU
//
// sorted_token_ids_ptr : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6,
// 0, 1, 2, 5]
// |- exp-0 -|- exp-1 -|- exp-2 -|- exp-3 -|- exp-4
// -|- exp-5 -|
// sorted_weight_ptr : [a, *, *, *, g, j, m, *, d, k, *, *, b, e, h, l, n, *, *, *, *, *, *, *,
// c, f, i, o]
//
// * length is max_num_tokens_padded, actual size is num_tokens_post_padded_ptr
//
// sorted_expert_ids_ptr : [0, 1, 2, 3, 3, 4, 5]
// * length is (max_num_tokens_padded + block_size - 1) / block_size
//
// num_tokens_post_padded_ptr : [28]
// num_sorted_tiles_ptr : [7]
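// A quick check against the example above: per-expert token counts {1, 3, 2, 5, 0, 4} pad up to
// {4, 4, 4, 8, 4, 4} entries (the empty expert still occupies one M_a block), i.e. 28 sorted
// entries and 7 tiles, matching num_tokens_post_padded = 28 and num_sorted_tiles = 7; the bound
// topk * input_tokens + num_experts * M_a - topk = 15 + 24 - 3 = 36 safely covers it.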
template <typename AccDataType, // you only need to explicitly set this one
typename Activation, // ck_tile::element_wise::Gelu
typename ADataType,
typename GDataType,
typename DDataType,
typename ODataType,
typename AScaleDataType,
typename GScaleDataType,
typename DScaleDataType,
typename YSmoothScaleDataType,
typename TopkWeightDataType,
typename IndexDataType>
void reference_fused_moe(
const ck_tile::HostTensor<ADataType>& a_host, // [tokens, hidden_size]
const ck_tile::HostTensor<GDataType>& g_host, // [experts, interme_size_0, hidden_size]
const ck_tile::HostTensor<DDataType>& d_host, // [experts, hidden_size, interme_size_1]
const ck_tile::HostTensor<AScaleDataType>& sa_host, // [tokens, 1],
const ck_tile::HostTensor<GScaleDataType>& sg_host, // [experts, 1, interme_size_0]
const ck_tile::HostTensor<DScaleDataType>& sd_host, // [experts, 1, hidden_size],
const ck_tile::HostTensor<YSmoothScaleDataType>& sy_host, // [experts, 1, interme_size_0]
ck_tile::HostTensor<ODataType>& o_host, // [tokens, hidden_size]
const ck_tile::HostTensor<IndexDataType>& sorted_token_ids_host, // [max_num_tokens_padded]
const ck_tile::HostTensor<TopkWeightDataType>& sorted_weight_host, // [max_num_tokens_padded]
const ck_tile::HostTensor<IndexDataType>&
sorted_expert_ids_host, // [(max_num_tokens_padded + block_size - 1) / block_size]
const ck_tile::HostTensor<IndexDataType>& num_sorted_tiles_host, // [1]
const ck_tile::HostTensor<IndexDataType>&
token_ids_host, // [tokens, topk] --> ugly!!! remove in the future
ck_tile::index_t block_m,
ck_tile::index_t tokens,
ck_tile::index_t experts,
ck_tile::index_t hidden_size,
ck_tile::index_t intermediate_size, // this size is for gate/up/down
ck_tile::index_t topk,
ck_tile::index_t gate_only)
{
assert(sorted_token_ids_host.get_num_of_dimension() == 1);
assert(sorted_weight_host.get_num_of_dimension() == 1);
assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
assert(num_sorted_tiles_host.get_element_size() == 1);
ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m;
ck_tile::index_t intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2);
ck_tile::index_t intermediate_size_1 = intermediate_size;
ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});
int max_num_tokens_padded = topk * tokens + experts * block_m - topk;
// assert();
auto f = [&](auto i_flatten) {
ck_tile::index_t i_tile = i_flatten / block_m;
if(i_tile >= num_sorted_tiles)
return;
ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
ck_tile::index_t i_topk = i_token >> 24;
i_token &= 0xffffff;
if(i_token >= tokens)
return;
(void)token_ids_host;
#else
        // TODO: better to remove this in the future, or modify the token_id value
auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
{
if(token_ids_host(token_id_, i_) == expert_id_)
return i_;
}
throw std::runtime_error("not correct token/expert pair\n");
return -1; // TODO: not correct!!
};
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
if(i_token >= tokens)
return;
ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
#endif
auto weight = sorted_weight_host.mData[i_flatten];
ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
// first gemm
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++)
{
AccDataType acc = static_cast<AccDataType>(0);
for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++)
{
acc += type_convert<AccDataType>(a_host(i_token, i_k)) *
type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
}
acc_0(0, i_n) = acc;
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
}
ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
if(gate_only)
{
if(intermediate_size_1 != intermediate_size_0)
throw std::runtime_error(
"intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
", 1:" + std::to_string(intermediate_size_1));
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
{
Activation{}(y(0, i_n), acc_0(0, i_n));
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
}
}
else
{
if(intermediate_size_1 * 2 != intermediate_size_0)
throw std::runtime_error(
"intermediate_size not correct, 0:" + std::to_string(intermediate_size_0) +
", 1:" + std::to_string(intermediate_size_1));
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
{
AccDataType tmp;
Activation{}(tmp, acc_0(0, i_n));
y(0, i_n) = tmp * acc_0(0, i_n + intermediate_size_1); // TODO: elementwise mul
}
}
// second gemm, loop along gemm-n
ck_tile::HostTensor<AccDataType> acc_1({1, hidden_size});
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
{
AccDataType acc = static_cast<AccDataType>(0);
for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++)
{
acc += y(0, i_k) * type_convert<AccDataType>(d_host(i_expert, i_n, i_k));
}
            acc_1(0, i_n) = acc * weight; // multiply by the topk weight here
}
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
{
out_topk_tokens(i_token, i_topk, i_n) = acc_1(0, i_n);
}
};
// make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);
// reduce
auto r = [&](auto i_token) {
for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
{
AccDataType acc = type_convert<AccDataType>(0);
for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++)
{
acc += out_topk_tokens(i_token, i_topk, i_n);
}
o_host(i_token, i_n) = type_convert<ODataType>(acc);
}
};
make_ParallelTensorFunctor(r, tokens)(std::thread::hardware_concurrency());
(void)num_sorted_tiles_host;
(void)sa_host;
(void)sg_host;
(void)sd_host;
(void)sy_host;
}
} // namespace ck_tile

File diff suppressed because it is too large

View File

@@ -0,0 +1,228 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cinttypes>
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType>
CK_TILE_HOST void reference_grouped_conv_bwd_data(HostTensor<InDataType>& input,
const HostTensor<WeiDataType>& weight,
const HostTensor<OutDataType>& output,
std::vector<ck_tile::long_index_t> conv_strides,
std::vector<ck_tile::long_index_t> conv_dilations,
std::vector<ck_tile::long_index_t> in_left_pads,
std::vector<ck_tile::long_index_t>)
{
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
weight.get_num_of_dimension() == NDimSpatial + 3 &&
output.get_num_of_dimension() == NDimSpatial + 3))
{
printf("%" PRIu64 " %" PRIu64 " %" PRIu64,
input.get_num_of_dimension(),
weight.get_num_of_dimension(),
output.get_num_of_dimension());
throw std::runtime_error("wrong! inconsistent dimension");
}
if constexpr(NDimSpatial == 1)
{
auto func = [&](auto g, auto n, auto c, auto wi) {
std::size_t K = weight.get_lengths()[1];
std::size_t X = weight.get_lengths()[3];
std::size_t Wo = output.get_lengths()[3];
float v_acc = 0;
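            // bwd-data inverts the fwd index map wi = wo * stride + x * dilation - pad: a filter
            // tap x contributes to this wi only when (wi + pad - x * dilation) is a non-negative
            // multiple of the stride and the resulting wo lies inside the output.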
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]);
if(w_tmp % conv_strides[0] == 0)
{
auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[0]);
if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
{
for(std::size_t k = 0; k < K; ++k)
{
OutDataType v_out = output(g, n, k, wo);
WeiDataType v_wei = weight(g, k, c, x);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
input(g, n, c, wi) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
input.get_lengths()[0],
input.get_lengths()[1],
input.get_lengths()[2],
input.get_lengths()[3])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
auto func = [&](auto g, auto n, auto c, auto hi, auto wi) {
std::size_t K = weight.get_lengths()[1];
std::size_t Y = weight.get_lengths()[3];
std::size_t X = weight.get_lengths()[4];
std::size_t Ho = output.get_lengths()[3];
std::size_t Wo = output.get_lengths()[4];
float v_acc = 0;
for(std::size_t y = 0; y < Y; ++y)
{
auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]);
if(h_tmp % conv_strides[0] == 0)
{
auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[0]);
if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
{
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp = static_cast<ck_tile::long_index_t>(wi) +
static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]);
if(w_tmp % conv_strides[1] == 0)
{
auto wo = static_cast<ck_tile::long_index_t>(w_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[1]);
if(wo >= 0 && ck_tile::type_convert<std::size_t>(wo) < Wo)
{
for(std::size_t k = 0; k < K; ++k)
{
OutDataType v_out = output(g, n, k, ho, wo);
WeiDataType v_wei = weight(g, k, c, y, x);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
}
}
}
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
input(g, n, c, hi, wi) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
input.get_lengths()[0],
input.get_lengths()[1],
input.get_lengths()[2],
input.get_lengths()[3],
input.get_lengths()[4])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
auto func = [&](auto g, auto n, auto c, auto di, auto hi, auto wi) {
std::size_t K = weight.get_lengths()[1];
std::size_t Z = weight.get_lengths()[3];
std::size_t Y = weight.get_lengths()[4];
std::size_t X = weight.get_lengths()[5];
std::size_t Do = output.get_lengths()[3];
std::size_t Ho = output.get_lengths()[4];
std::size_t Wo = output.get_lengths()[5];
float v_acc = 0;
for(std::size_t z = 0; z < Z; ++z)
{
auto d_tmp = static_cast<ck_tile::long_index_t>(di) +
static_cast<ck_tile::long_index_t>(in_left_pads[0]) -
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]);
if(d_tmp % conv_strides[0] == 0)
{
auto do_ = static_cast<ck_tile::long_index_t>(d_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[0]);
if(do_ >= 0 && ck_tile::type_convert<std::size_t>(do_) < Do)
{
for(std::size_t y = 0; y < Y; ++y)
{
auto h_tmp = static_cast<ck_tile::long_index_t>(hi) +
static_cast<ck_tile::long_index_t>(in_left_pads[1]) -
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]);
if(h_tmp % conv_strides[1] == 0)
{
auto ho = static_cast<ck_tile::long_index_t>(h_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[1]);
if(ho >= 0 && ck_tile::type_convert<std::size_t>(ho) < Ho)
{
for(std::size_t x = 0; x < X; ++x)
{
auto w_tmp =
static_cast<ck_tile::long_index_t>(wi) +
static_cast<ck_tile::long_index_t>(in_left_pads[2]) -
static_cast<ck_tile::long_index_t>(x *
conv_dilations[2]);
if(w_tmp % conv_strides[2] == 0)
{
auto wo =
static_cast<ck_tile::long_index_t>(w_tmp) /
static_cast<ck_tile::long_index_t>(conv_strides[2]);
if(wo >= 0 &&
ck_tile::type_convert<std::size_t>(wo) < Wo)
{
for(std::size_t k = 0; k < K; ++k)
{
OutDataType v_out =
output(g, n, k, do_, ho, wo);
WeiDataType v_wei = weight(g, k, c, z, y, x);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
}
}
}
}
}
}
InDataType v_acc_converted = ck_tile::type_convert<InDataType>(v_acc);
input(g, n, c, di, hi, wi) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
input.get_lengths()[0],
input.get_lengths()[1],
input.get_lengths()[2],
input.get_lengths()[3],
input.get_lengths()[4],
input.get_lengths()[5])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error(
"Ref_conv_bwd_data: number of dimensions must be between 1 and 3.");
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,167 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType>
CK_TILE_HOST void
reference_grouped_conv_bwd_weight(const HostTensor<InDataType>& input,
HostTensor<WeiDataType>& weight,
const HostTensor<OutDataType>& output,
std::vector<ck_tile::long_index_t> conv_strides,
std::vector<ck_tile::long_index_t> conv_dilations,
std::vector<ck_tile::long_index_t> in_left_pads,
std::vector<ck_tile::long_index_t>)
{
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
weight.get_num_of_dimension() == NDimSpatial + 3 &&
output.get_num_of_dimension() == NDimSpatial + 3))
{
throw std::runtime_error("wrong! inconsistent dimension");
}
if constexpr(NDimSpatial == 1)
{
auto func = [&](auto g, auto k, auto c, auto x) {
float v_acc = 0;
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
{
for(std::size_t wo = 0; wo < output.get_lengths()[3]; ++wo)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
{
InDataType v_in = input(g, n, c, wi);
OutDataType v_out = output(g, n, k, wo);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_in);
}
}
}
            WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
weight(g, k, c, x) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
weight.get_lengths()[0],
weight.get_lengths()[1],
weight.get_lengths()[2],
weight.get_lengths()[3])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
auto func = [&](auto g, auto k, auto c, auto y, auto x) {
float v_acc = 0;
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
{
for(std::size_t ho = 0; ho < output.get_lengths()[3]; ++ho)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t wo = 0; wo < output.get_lengths()[4]; ++wo)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
if(hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
{
InDataType v_in = input(g, n, c, hi, wi);
OutDataType v_out = output(g, n, k, ho, wo);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_in);
}
}
}
}
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
weight(g, k, c, y, x) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
weight.get_lengths()[0],
weight.get_lengths()[1],
weight.get_lengths()[2],
weight.get_lengths()[3],
weight.get_lengths()[4])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
auto func = [&](auto g, auto k, auto c, auto z, auto y, auto x) {
float v_acc = 0;
for(std::size_t n = 0; n < output.get_lengths()[1]; ++n)
{
for(std::size_t do_ = 0; do_ < output.get_lengths()[3]; ++do_)
{
auto di = static_cast<ck_tile::long_index_t>(do_ * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t ho = 0; ho < output.get_lengths()[4]; ++ho)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
for(std::size_t wo = 0; wo < output.get_lengths()[5]; ++wo)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
if(di >= 0 &&
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
{
InDataType v_in = input(g, n, c, di, hi, wi);
OutDataType v_out = output(g, n, k, do_, ho, wo);
v_acc += ck_tile::type_convert<float>(v_out) *
ck_tile::type_convert<float>(v_in);
}
}
}
}
}
WeiDataType v_acc_converted = ck_tile::type_convert<WeiDataType>(v_acc);
weight(g, k, c, z, y, x) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
weight.get_lengths()[0],
weight.get_lengths()[1],
weight.get_lengths()[2],
weight.get_lengths()[3],
weight.get_lengths()[4],
weight.get_lengths()[5])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error(
"Ref_conv_bwd_weight: number of dimensions must be between 1 and 3.");
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,182 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename Elfunc = ck_tile::element_wise::PassThrough,
typename Tuple = ck_tile::tuple<>>
CK_TILE_HOST void reference_grouped_conv_fwd(const HostTensor<InDataType>& input,
const HostTensor<WeiDataType>& weight,
HostTensor<OutDataType>& output,
std::vector<ck_tile::long_index_t> conv_strides,
std::vector<ck_tile::long_index_t> conv_dilations,
std::vector<ck_tile::long_index_t> in_left_pads,
std::vector<ck_tile::long_index_t>,
Elfunc elfunc = Elfunc{},
Tuple ds = {})
{
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
weight.get_num_of_dimension() == NDimSpatial + 3 &&
output.get_num_of_dimension() == NDimSpatial + 3))
{
throw std::runtime_error("wrong! inconsistent dimension");
}
if constexpr(NDimSpatial == 1)
{
auto func = [&](auto g, auto n, auto k, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t x = 0; x < weight.get_lengths()[3]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
{
InDataType v_in = input(g, n, c, wi);
WeiDataType v_wei = weight(g, k, c, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
if constexpr(Tuple::size() > 0)
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, wo));
else
elfunc(v_acc, v_acc);
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, wo) = v_acc_out;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
auto func = [&](auto g, auto n, auto k, auto ho, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t y = 0; y < weight.get_lengths()[3]; ++y)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t x = 0; x < weight.get_lengths()[4]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
if(hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
{
InDataType v_in = input(g, n, c, hi, wi);
WeiDataType v_wei = weight(g, k, c, y, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
if constexpr(Tuple::size() > 0)
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, ho, wo));
else
elfunc(v_acc, v_acc);
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, ho, wo) = v_acc_out;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3],
output.get_lengths()[4])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
auto func = [&](auto g, auto n, auto k, auto d_o, auto ho, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t z = 0; z < weight.get_lengths()[3]; ++z)
{
auto di = static_cast<ck_tile::long_index_t>(d_o * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t y = 0; y < weight.get_lengths()[4]; ++y)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
for(std::size_t x = 0; x < weight.get_lengths()[5]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
if(di >= 0 &&
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
{
InDataType v_in = input(g, n, c, di, hi, wi);
WeiDataType v_wei = weight(g, k, c, z, y, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
}
if constexpr(Tuple::size() > 0)
elfunc(v_acc, v_acc, ds.at(ck_tile::number<0>{})(g, n, k, d_o, ho, wo));
else
elfunc(v_acc, v_acc);
OutDataType v_acc_out = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, d_o, ho, wo) = v_acc_out;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3],
output.get_lengths()[4],
output.get_lengths()[5])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("Ref_Conv_fwd: number of dimensions must be between 1 and 3.");
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,133 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename InDataType, typename OutDataType, index_t NDimSpatial>
CK_TILE_HOST void reference_im2col(const HostTensor<InDataType>& in_host,
HostTensor<OutDataType>& out_host,
const ck_tile::conv::ConvParam& conv_params)
{
const long_index_t G = in_host.get_lengths()[0];
const long_index_t N = in_host.get_lengths()[1];
const long_index_t C = in_host.get_lengths()[2];
if constexpr(NDimSpatial == 1)
{
const long_index_t Wo = conv_params.output_spatial_lengths_[0];
auto func = [&](auto g, auto n, auto wo) {
long_index_t row = n * Wo + wo;
long_index_t column = 0;
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[0]; ++x)
{
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[0]) +
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[0]) -
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
for(long_index_t c = 0; c < C; ++c)
{
if(wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[3])
{
InDataType v_in = in_host(g, n, c, wi);
out_host(g, row, column) = type_convert<OutDataType>(v_in);
}
column++;
}
}
};
make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
const long_index_t Ho = conv_params.output_spatial_lengths_[0];
const long_index_t Wo = conv_params.output_spatial_lengths_[1];
auto func = [&](auto g, auto n, auto ho, auto wo) {
long_index_t row = n * Ho * Wo + ho * Wo + wo;
long_index_t column = 0;
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[0]; ++y)
{
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[0]) +
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[0]) -
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[1]; ++x)
{
auto wi = static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[1]) +
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[1]) -
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
for(long_index_t c = 0; c < C; ++c)
{
if(hi >= 0 && type_convert<std::size_t>(hi) < in_host.get_lengths()[3] &&
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[4])
{
InDataType v_in = in_host(g, n, c, hi, wi);
out_host(g, row, column) = type_convert<OutDataType>(v_in);
}
column++;
}
}
}
};
make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
const long_index_t Do = conv_params.output_spatial_lengths_[0];
const long_index_t Ho = conv_params.output_spatial_lengths_[1];
const long_index_t Wo = conv_params.output_spatial_lengths_[2];
auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) {
long_index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
long_index_t column = 0;
for(long_index_t z = 0; z < conv_params.filter_spatial_lengths_[0]; ++z)
{
auto di = static_cast<long_index_t>(d_o * conv_params.conv_filter_strides_[0]) +
static_cast<long_index_t>(z * conv_params.conv_filter_dilations_[0]) -
static_cast<long_index_t>(conv_params.input_left_pads_[0]);
for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[1]; ++y)
{
auto hi = static_cast<long_index_t>(ho * conv_params.conv_filter_strides_[1]) +
static_cast<long_index_t>(y * conv_params.conv_filter_dilations_[1]) -
static_cast<long_index_t>(conv_params.input_left_pads_[1]);
for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[2]; ++x)
{
auto wi =
static_cast<long_index_t>(wo * conv_params.conv_filter_strides_[2]) +
static_cast<long_index_t>(x * conv_params.conv_filter_dilations_[2]) -
static_cast<long_index_t>(conv_params.input_left_pads_[2]);
for(long_index_t c = 0; c < C; ++c)
{
if(di >= 0 &&
type_convert<std::size_t>(di) < in_host.get_lengths()[3] &&
hi >= 0 &&
type_convert<std::size_t>(hi) < in_host.get_lengths()[4] &&
wi >= 0 && type_convert<std::size_t>(wi) < in_host.get_lengths()[5])
{
InDataType v_in = in_host(g, n, c, di, hi, wi);
out_host(g, row, column) = type_convert<OutDataType>(v_in);
}
column++;
}
}
}
}
};
make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(std::thread::hardware_concurrency());
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,96 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
// Note: for simplicity, each functor only cares about a single M
struct reference_layernorm2d_default_epilogue
{
template <typename OutDataType, typename AccDataType>
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
{
const int N = acc.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
}
}
template <typename OutDataType, typename AccDataType>
auto operator()(int m, const HostTensor<AccDataType>& acc)
{
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
operator()(m, o, acc);
return o;
}
};
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
typename ComputeDataType,
typename YDataType,
typename MeanDataType,
typename InvStdDataType,
typename Epilogue = reference_layernorm2d_default_epilogue>
void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
const HostTensor<GammaDataType>& gamma_n,
const HostTensor<BetaDataType>& beta_n,
HostTensor<YDataType>& y_m_n,
HostTensor<MeanDataType>& mean_m,
HostTensor<InvStdDataType>& invStd_m,
ComputeDataType epsilon,
Epilogue epilogue_functor = {})
{
auto layernorm2d_fwd_func = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
int count = 0;
ComputeDataType mean = 0;
ComputeDataType variance = 0;
ComputeDataType divisor = 0;
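        // One-pass Welford update: mean is refined incrementally and `variance` accumulates the
        // sum of squared deviations, which is divided by count below to give the (biased)
        // variance used for normalization.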
for(int n = 0; n < N; ++n)
{
++count;
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType delta = x - mean;
mean += delta / count;
ComputeDataType delta2 = x - mean;
variance += delta * delta2;
}
// actual variance
variance = variance / count;
divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(variance + epsilon);
if constexpr(!std::is_same_v<MeanDataType, ck_tile::null_type>)
mean_m(m) = ck_tile::type_convert<MeanDataType>(mean);
if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
ComputeDataType beta = ck_tile::type_convert<ComputeDataType>(beta_n(n));
auto a_ = (x - mean) * divisor;
a_ = a_ * gamma + beta;
acc(m, n) = a_;
}
epilogue_functor(m, y_m_n, acc);
};
make_ParallelTensorFunctor(layernorm2d_fwd_func,
mean_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,318 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC,
int MoeGemmKind = 0, // 0: gemm1_gate_only, 1: gemm1_gate_up, 2: gemm2, 3:gemm1_split_k
typename ActivationOp = identity>
__global__ void moe_gemm_kernel(const ck_tile::index_t* p_sorted_token_ids_,
const ck_tile::index_t* p_sorted_expert_ids_,
const ck_tile::index_t* p_max_token_id_,
const ADataType* A,
const BDataType* B,
CDataType* C,
const AccDataType* expert_weight_ptr,
ck_tile::index_t Num_tokens,
ck_tile::index_t TokensPerBlock,
ck_tile::index_t TopK,
ck_tile::index_t M,
ck_tile::index_t N,
ck_tile::index_t K,
ck_tile::index_t strideA,
ck_tile::index_t strideB,
ck_tile::index_t strideC,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float* scale_A_ptr,
float* scale_B_ptr,
float* expert_bias_ptr)
{
constexpr auto is_split_k = MoeGemmKind == 3;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int problem_N = MoeGemmKind == 1 ? N / 2 : N;
int row = idx / problem_N; // Compute row index
int col = idx % problem_N; // Compute column index
index_t gather_token_id = 0;
index_t scatter_token_id = 0;
index_t expert_id = 0;
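    // The sorted token ids pack the source token in the low 24 bits and the top-k slot in the
    // high 8 bits, e.g. 0x02000039 -> token 57, slot 2. For gemm2 the gather index is expanded
    // to (token * TopK + slot); for the other kinds the scatter index is expanded instead.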
if(row < p_max_token_id_[0])
{
expert_id = p_sorted_expert_ids_[row / TokensPerBlock];
gather_token_id = p_sorted_token_ids_[row] & 0xff'ffff;
scatter_token_id = p_sorted_token_ids_[row] & 0xff'ffff;
if(gather_token_id >= Num_tokens)
{
return;
}
if(MoeGemmKind == 2)
{
gather_token_id = gather_token_id * TopK + (p_sorted_token_ids_[row] >> 24);
}
else
{
scatter_token_id = scatter_token_id * TopK + (p_sorted_token_ids_[row] >> 24);
}
}
else
{
return;
}
if(row < M)
{
AccDataType acc = 0.0;
AccDataType acc_up = 0.0;
AccDataType acc_temp = 0.0;
AccDataType acc_up_temp = 0.0;
float scale_A = 0;
float scale_B = 0;
float scale_B_up = 0;
index_t scale_A_stride = (M + scale_granularity_m - 1) / scale_granularity_m;
index_t scale_B_stride = (N + scale_granularity_n - 1) / scale_granularity_n;
index_t scale_B_expert_stride = scale_B_stride * K / scale_granularity_k;
for(int k = 0; k < K; ++k)
{
if(k % scale_granularity_k == 0)
{
// update acc
acc += acc_temp * scale_A * scale_B;
acc_up += acc_up_temp * scale_A * scale_B_up;
// reset acc temp
acc_temp = 0.0;
acc_up_temp = 0.0;
// update scale factors
scale_A = scale_A_ptr[(gather_token_id / scale_granularity_m) +
(k / scale_granularity_k) * scale_A_stride];
scale_B =
scale_B_ptr[expert_id * scale_B_expert_stride + col / scale_granularity_n +
(k / scale_granularity_k) * scale_B_stride];
if constexpr(MoeGemmKind == 1)
scale_B_up = scale_B_ptr[expert_id * scale_B_expert_stride +
(col + problem_N) / scale_granularity_n +
(k / scale_granularity_k) * scale_B_stride];
}
constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
// Adjust indexing based on matrix layout
int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
? gather_token_id * strideA + k
: k * strideA + gather_token_id;
long b_index =
long(expert_id) * N * K +
((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>) ? col * strideB + k
: k * strideB + col);
long b_index_up;
if constexpr(MoeGemmKind == 1)
b_index_up = long(expert_id) * N * K +
((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
? (col + problem_N) * strideB + k
: k * strideB + col + problem_N);
AccDataType v_a;
AccDataType v_b;
AccDataType v_b_up;
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
if(k % 2 == 1)
v_a = fp32_val.hi;
else
v_a = fp32_val.lo;
}
else
{
v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
}
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
{
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
if constexpr(MoeGemmKind == 1)
{
const fp32x2_t fp32_val_up =
pk_int4_t_to_fp32x2_t(B[b_index_up / packed_size_b]);
if(k % 2 == 1)
v_b_up = fp32_val_up.hi;
else
v_b_up = fp32_val_up.lo;
}
}
else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
{
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b], 1.0f);
if(k % 2 == 1)
v_b = fp32_val.hi;
else
v_b = fp32_val.lo;
if constexpr(MoeGemmKind == 1)
{
const fp32x2_t fp32_val_up =
pk_fp4_to_fp32x2(B[b_index_up / packed_size_b], 1.0f);
if(k % 2 == 1)
v_b_up = fp32_val_up.hi;
else
v_b_up = fp32_val_up.lo;
}
}
else
{
v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
if constexpr(MoeGemmKind == 1)
v_b_up = ck_tile::type_convert<AccDataType>(B[b_index_up]);
}
acc_temp += v_a * v_b;
if constexpr(MoeGemmKind == 1)
acc_up_temp += v_a * v_b_up;
}
acc += acc_temp * scale_A * scale_B;
acc_up += acc_up_temp * scale_A * scale_B_up;
float bias = 0.f, bias_up = 0.f;
if(expert_bias_ptr != nullptr && !is_split_k)
{
bias = expert_bias_ptr[expert_id * N + col];
if constexpr(MoeGemmKind == 1)
bias_up = expert_bias_ptr[expert_id * N + col + problem_N];
}
int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
? scatter_token_id * strideC + col
: col * strideC + scatter_token_id;
if constexpr(MoeGemmKind < 2)
{
C[c_index] = ck_tile::type_convert<CDataType>(
ActivationOp{}(acc + bias, MoeGemmKind == 1 ? acc_up + bias_up : 1));
}
else
{
            // moe gemm2 doesn't use an activation.
auto weight =
is_split_k ? ck_tile::type_convert<AccDataType>(1.0f) : expert_weight_ptr[row];
CDataType res = ck_tile::type_convert<CDataType>((acc + bias) * weight);
thread_buffer<CDataType, 2> add_v = 0;
if(c_index % 2)
{
                // result is the second value of the fp16 pair.
add_v.template get_as<CDataType>()[1] = res;
}
else
{
                // result is the first value of the fp16 pair.
add_v.template get_as<CDataType>()[0] = res;
}
            // mask the last bit to make sure the atomicAdd pointer is DWORD-aligned.
atomic_add_g<CDataType, 2>(reinterpret_cast<CDataType*>(C + (c_index & 0xffff'fffe)),
add_v);
}
}
}
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
typename LayoutA,
typename LayoutB,
typename LayoutC,
int MoeGemmKind = 0, // 0: gemm1_gate_only, 1: gemm1_gate_up, 2: gemm2, 3:gemm1_split_k
typename ActivationOp = identity>
void reference_moe_gemm_gpu(const index_t* p_sorted_token_ids_,
const index_t* p_sorted_expert_ids_,
const index_t* p_max_token_id_,
const ADataType* a_ptr,
const BDataType* b_ptr,
CDataType* c_ptr,
const AccDataType* expert_weight_ptr,
index_t Num_tokens,
index_t TokensPerBlock,
index_t TopK,
index_t M,
index_t N,
index_t K,
index_t stride_a,
index_t stride_b,
index_t stride_c,
index_t scale_granularity_m,
index_t scale_granularity_n,
index_t scale_granularity_k,
float* scale_A_ptr,
float* scale_B_ptr,
float* exp_bias = nullptr)
{
int problem_N = MoeGemmKind == 1 ? N / 2 : N;
int totalElements = M * problem_N;
int numThreadsPerBlock = 256; // Common choice for threads per block
int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
moe_gemm_kernel<ADataType,
BDataType,
AccDataType,
CDataType,
LayoutA,
LayoutB,
LayoutC,
MoeGemmKind,
ActivationOp><<<numBlocks, numThreadsPerBlock>>>(p_sorted_token_ids_,
p_sorted_expert_ids_,
p_max_token_id_,
a_ptr,
b_ptr,
c_ptr,
expert_weight_ptr,
Num_tokens,
TokensPerBlock,
TopK,
M,
N,
K,
stride_a,
stride_b,
stride_c,
scale_granularity_m,
scale_granularity_n,
scale_granularity_k,
scale_A_ptr,
scale_B_ptr,
exp_bias);
return;
}
} // namespace ck_tile

View File

@@ -0,0 +1,121 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
#define MOE_SORTING_MOCK_ID(token_id_, topk_id_) \
static_cast<uint32_t>(((token_id_) & 0x00ffffff) | (((topk_id_) & 0xff) << 24))
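// e.g. MOE_SORTING_MOCK_ID(57, 2) == 0x02000039: the low 24 bits carry the source token id
// (57) and the high 8 bits carry the top-k slot (2) it was routed through.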
template <typename WeightType, typename IndexType = index_t>
CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
const HostTensor<WeightType>& weights,
const HostTensor<IndexType>& local_expert_mask,
HostTensor<IndexType>& p_sorted_token_ids,
HostTensor<WeightType>& sorted_weight,
HostTensor<IndexType>& sorted_expert_ids,
index_t& unit_cnt,
const index_t experts,
const index_t unit_size,
const index_t tokens,
bool local_expert_masking,
bool skip_experts_with_zero_token = true)
{
    // note: if tokens is smaller than topk_ids.mDesc.get_lengths()[0], this indicates the local_token case
const index_t num_token = tokens; // topk_ids.mDesc.get_lengths()[0];
const index_t topk = topk_ids.mDesc.get_lengths()[1];
    // allocate a temp buffer and fill it with the sentinel value [num_token|topk]
std::vector<std::vector<IndexType>> expert_tokens(
experts,
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
std::vector<IndexType>(unit_size, MOE_SORTING_MOCK_ID(num_token, topk)));
#else
std::vector<IndexType>(unit_size, num_token));
#endif
std::vector<std::vector<WeightType>> expert_token_weights(
experts, std::vector<WeightType>(unit_size, 0));
// count number of unit-size slices in this expert
std::vector<IndexType> expert_slices(experts, 1);
// count the tokens used in this expert
std::vector<IndexType> expert_slice_idxs(experts, 0);
    // TODO: the above 2 buffers seem duplicated
for(index_t t = 0; t < num_token; t++)
{
for(index_t k = 0; k < topk; k++)
{
IndexType e = topk_ids(t, k);
WeightType w = weights(t, k);
index_t idx = expert_slice_idxs[e];
if(idx > expert_slices[e] * unit_size - 1)
{
expert_slices[e]++;
index_t new_size = expert_slices[e] * unit_size;
expert_tokens[e].resize(new_size);
expert_token_weights[e].resize(new_size);
for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
expert_tokens[e][i] = MOE_SORTING_MOCK_ID(num_token, topk);
#else
expert_tokens[e][i] = num_token;
#endif
expert_token_weights[e][i] = 0;
}
}
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
expert_tokens[e][idx] = MOE_SORTING_MOCK_ID(t, k);
#else
expert_tokens[e][idx] = t;
#endif
expert_token_weights[e][idx] = w;
expert_slice_idxs[e]++;
}
}
IndexType* out_tokens = p_sorted_token_ids.data();
WeightType* out_weights = sorted_weight.data();
IndexType* out_expert_id = sorted_expert_ids.data();
int curr_expert_id = 0;
for(index_t e = 0; e < experts; e++)
{
if(local_expert_masking)
{
if(local_expert_mask(e) == 0)
continue;
}
if(skip_experts_with_zero_token)
{
if(expert_slice_idxs[e] == 0)
{
curr_expert_id++;
continue;
}
}
        memcpy(out_tokens,
               expert_tokens[e].data(),
               sizeof(IndexType) * expert_slices[e] * unit_size);
out_tokens += expert_slices[e] * unit_size;
memcpy(out_weights,
expert_token_weights[e].data(),
sizeof(WeightType) * expert_slices[e] * unit_size);
out_weights += expert_slices[e] * unit_size;
for(index_t s = 0; s < expert_slices[e]; s++)
{
out_expert_id[s] = curr_expert_id;
unit_cnt++;
}
out_expert_id += expert_slices[e];
curr_expert_id++;
}
unit_cnt *= unit_size;
return;
}
#undef MOE_SORTING_MOCK_ID
} // namespace ck_tile

View File

@@ -0,0 +1,76 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
#include <numeric>
#include <functional>
namespace ck_tile {
/*
 this provides permute + contiguous-like functionality, as in PyTorch's permute().contiguous()
*/
template <typename DataType>
CK_TILE_HOST void
reference_permute(const HostTensor<DataType>& x, HostTensor<DataType>& y, std::vector<index_t> perm)
{
const auto x_len = x.mDesc.get_lengths();
const auto y_len = y.mDesc.get_lengths();
assert(x_len.size() == y_len.size());
index_t rank = x_len.size();
const auto x_elm = std::accumulate(x_len.begin(), x_len.end(), 1, std::multiplies<index_t>());
const auto y_elm = std::accumulate(y_len.begin(), y_len.end(), 1, std::multiplies<index_t>());
assert(x_elm == y_elm);
(void)y_elm;
auto f = [&](auto i_element) {
std::vector<size_t> y_coord = [&]() {
std::vector<size_t> tmp(rank, 0);
size_t r = i_element;
for(index_t i = rank - 1; i >= 0; i--)
{
tmp[i] = r % y_len[i];
r = r / y_len[i];
}
return tmp;
}();
std::vector<size_t> x_coord = [&]() {
std::vector<size_t> tmp(rank, 0);
for(index_t i = 0; i < rank; i++)
{
tmp[perm[i]] = y_coord[i];
}
return tmp;
}();
// do permute
y(y_coord) = x(x_coord);
};
make_ParallelTensorFunctor(f, x_elm)(std::thread::hardware_concurrency());
}
template <typename DataType>
CK_TILE_HOST auto reference_permute(const HostTensor<DataType>& x, std::vector<index_t> perm)
{
auto x_shape = x.get_lengths();
ck_tile::index_t rank = perm.size();
std::vector<ck_tile::index_t> y_shape = [&]() {
std::vector<ck_tile::index_t> tmp(rank, 0);
for(int i = 0; i < static_cast<int>(rank); i++)
{
tmp[i] = x_shape[perm[i]];
}
return tmp;
}();
HostTensor<DataType> y(y_shape);
reference_permute(x, y, perm);
return y;
}
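// Illustrative usage sketch (not part of the original API): permutes a {2, 3, 4} tensor into a
// dense {4, 2, 3} tensor, matching torch.permute(2, 0, 1) followed by .contiguous(); the
// function name and extents are assumptions for demonstration only.
inline void reference_permute_example()
{
    HostTensor<float> x({2, 3, 4});
    for(std::size_t i = 0; i < x.get_element_size(); i++)
        x.mData[i] = static_cast<float>(i);
    auto y = reference_permute(x, {2, 0, 1}); // y(a, b, c) == x(b, c, a)
    (void)y;
}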
} // namespace ck_tile

View File

@@ -0,0 +1,198 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/pooling/kernel/pool_kernel.hpp"
#include <thread>
#include <cmath>
namespace ck_tile {
template <typename InDataType,
typename ComputeDataType,
typename OutDataType,
typename IndexDataType,
typename ReduceOp,
typename TensorShape,
typename WindowShape,
bool OutputIndex = false>
CK_TILE_HOST void reference_pool2d(const HostTensor<InDataType>& input,
HostTensor<OutDataType>& output,
HostTensor<IndexDataType>& output_index,
PoolKernelArgs<TensorShape, WindowShape> kargs,
ReduceOp reduce_op)
{
const ck_tile::index_t N = kargs.input_shape.at(ck_tile::number<0>{});
const ck_tile::index_t H = kargs.input_shape.at(ck_tile::number<1>{});
const ck_tile::index_t W = kargs.input_shape.at(ck_tile::number<2>{});
const ck_tile::index_t C = kargs.input_shape.at(ck_tile::number<3>{});
const ck_tile::index_t Ho = kargs.output_shape.at(ck_tile::number<1>{});
const ck_tile::index_t Wo = kargs.output_shape.at(ck_tile::number<2>{});
const ck_tile::index_t Y = kargs.window_lengths.at(ck_tile::number<0>{});
const ck_tile::index_t X = kargs.window_lengths.at(ck_tile::number<1>{});
const ck_tile::index_t Sy = kargs.window_strides.at(ck_tile::number<0>{});
const ck_tile::index_t Sx = kargs.window_strides.at(ck_tile::number<1>{});
const ck_tile::index_t Dy = kargs.window_dilations.at(ck_tile::number<0>{});
const ck_tile::index_t Dx = kargs.window_dilations.at(ck_tile::number<1>{});
const ck_tile::index_t LeftPy = kargs.input_left_pads.at(ck_tile::number<0>{});
const ck_tile::index_t LeftPx = kargs.input_left_pads.at(ck_tile::number<1>{});
// Right padding is handled implicitly by bounds checking
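    // Worked example of the index arithmetic below (illustrative values only): with Sy = 2,
    // Dy = 2, LeftPy = 1 and window taps y = 0, 1, 2, output row ho = 0 reads
    // hi = 0 * 2 + {0, 1, 2} * 2 - 1 = {-1, 1, 3}; hi = -1 fails the bounds check and simply
    // contributes the identity value.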
auto f = [&](auto n, auto ho, auto wo, auto c) {
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
IndexDataType current_index = 0; // Declare outside if constexpr for efficiency
for(ck_tile::index_t y = 0; y < Y; ++y)
{
// Calculate input height index with stride, dilation, and padding
ck_tile::index_t hi = ho * Sy + y * Dy - LeftPy;
for(ck_tile::index_t x = 0; x < X; ++x)
{
// Calculate input width index with stride, dilation, and padding
ck_tile::index_t wi = wo * Sx + x * Dx - LeftPx;
if(hi >= 0 && hi < H && wi >= 0 && wi < W)
{
const ComputeDataType v_in = type_convert<ComputeDataType>(input(n, hi, wi, c));
if constexpr(OutputIndex)
{
IndexDataType flat_index = input.GetOffsetFromMultiIndex(n, hi, wi, c);
bool changed = false;
v_acc = reduce_op(v_acc, v_in, changed);
if(changed)
{
current_index = flat_index;
}
}
else
{
v_acc = reduce_op(v_acc, v_in);
}
}
// For positions outside bounds, we implicitly use identity value
}
}
output(n, ho, wo, c) = ck_tile::type_convert<OutDataType>(v_acc);
if constexpr(OutputIndex)
{
output_index(n, ho, wo, c) = current_index;
}
};
// Parallelize over all output dimensions
make_ParallelTensorFunctor(f, N, Ho, Wo, C)(std::thread::hardware_concurrency());
}
template <typename InDataType,
typename ComputeDataType,
typename OutDataType,
typename IndexDataType,
typename ReduceOp,
typename TensorShape,
typename WindowShape,
bool OutputIndex = false>
CK_TILE_HOST void reference_pool3d(const HostTensor<InDataType>& input,
HostTensor<OutDataType>& output,
HostTensor<IndexDataType>& output_index,
PoolKernelArgs<TensorShape, WindowShape> kargs,
ReduceOp reduce_op)
{
const ck_tile::index_t N = kargs.input_shape.at(ck_tile::number<0>{});
const ck_tile::index_t D = kargs.input_shape.at(ck_tile::number<1>{});
const ck_tile::index_t H = kargs.input_shape.at(ck_tile::number<2>{});
const ck_tile::index_t W = kargs.input_shape.at(ck_tile::number<3>{});
const ck_tile::index_t C = kargs.input_shape.at(ck_tile::number<4>{});
const ck_tile::index_t Do = kargs.output_shape.at(ck_tile::number<1>{});
const ck_tile::index_t Ho = kargs.output_shape.at(ck_tile::number<2>{});
const ck_tile::index_t Wo = kargs.output_shape.at(ck_tile::number<3>{});
const ck_tile::index_t Z = kargs.window_lengths.at(ck_tile::number<0>{});
const ck_tile::index_t Y = kargs.window_lengths.at(ck_tile::number<1>{});
const ck_tile::index_t X = kargs.window_lengths.at(ck_tile::number<2>{});
const ck_tile::index_t Sz = kargs.window_strides.at(ck_tile::number<0>{});
const ck_tile::index_t Sy = kargs.window_strides.at(ck_tile::number<1>{});
const ck_tile::index_t Sx = kargs.window_strides.at(ck_tile::number<2>{});
const ck_tile::index_t Dz = kargs.window_dilations.at(ck_tile::number<0>{});
const ck_tile::index_t Dy = kargs.window_dilations.at(ck_tile::number<1>{});
const ck_tile::index_t Dx = kargs.window_dilations.at(ck_tile::number<2>{});
const ck_tile::index_t LeftPz = kargs.input_left_pads.at(ck_tile::number<0>{});
const ck_tile::index_t LeftPy = kargs.input_left_pads.at(ck_tile::number<1>{});
const ck_tile::index_t LeftPx = kargs.input_left_pads.at(ck_tile::number<2>{});
// Right padding is handled implicitly by bounds checking
auto f = [&](auto n, auto do_, auto ho, auto wo, auto c) {
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
IndexDataType current_index = 0; // Declare outside if constexpr for efficiency
for(ck_tile::index_t z = 0; z < Z; ++z)
{
// Calculate input depth index with stride, dilation, and padding
ck_tile::index_t di = do_ * Sz + z * Dz - LeftPz;
for(ck_tile::index_t y = 0; y < Y; ++y)
{
// Calculate input height index with stride, dilation, and padding
ck_tile::index_t hi = ho * Sy + y * Dy - LeftPy;
for(ck_tile::index_t x = 0; x < X; ++x)
{
// Calculate input width index with stride, dilation, and padding
ck_tile::index_t wi = wo * Sx + x * Dx - LeftPx;
if(di >= 0 && di < D && hi >= 0 && hi < H && wi >= 0 && wi < W)
{
const ComputeDataType v_in =
type_convert<ComputeDataType>(input(n, di, hi, wi, c));
if constexpr(OutputIndex)
{
IndexDataType flat_index =
input.GetOffsetFromMultiIndex(n, di, hi, wi, c);
bool changed = false;
v_acc = reduce_op(v_acc, v_in, changed);
if(changed)
{
current_index = flat_index;
}
}
else
{
v_acc = reduce_op(v_acc, v_in);
}
}
// For positions outside bounds, we implicitly use identity value
}
}
}
output(n, do_, ho, wo, c) = ck_tile::type_convert<OutDataType>(v_acc);
if constexpr(OutputIndex)
{
output_index(n, do_, ho, wo, c) = current_index;
}
};
// Parallelize over all output dimensions
make_ParallelTensorFunctor(f, N, Do, Ho, Wo, C)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,341 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include <thread>
namespace ck_tile {
template <typename XDataType, typename ComputeDataType, typename YDataType, typename ReduceOp>
CK_TILE_HOST void
reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m, ReduceOp reduce_op)
{
auto f = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
for(int n = 0; n < N; ++n)
{
const ComputeDataType v_a = type_convert<ComputeDataType>(x_m_n(m, n));
v_acc = reduce_op(v_acc, v_a);
}
y_m(m) = ck_tile::type_convert<YDataType>(v_acc);
};
make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
// Generic reference reduce for arbitrary dimensions
template <
typename XDataType,
typename ComputeDataType,
typename YDataType,
typename ReduceOp,
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to keep
typename ReduceDims> // Expected type: ck_tile::sequence<...> containing dimension indices to
// reduce
CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
HostTensor<YDataType>& y_tensor,
ReduceOp reduce_op,
KeptDim kept_dim,
ReduceDims reduce_dims)
{
const auto& x_lengths = x_tensor.mDesc.get_lengths();
// Calculate total kept elements (product of all kept dimension lengths)
index_t total_kept_elements = 1;
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
// Calculate total reduce elements (product of all reduce dimension lengths)
index_t total_reduce_elements = 1;
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
auto f = [&](auto linear_kept_idx) {
ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
// Convert linear kept index to multi-dimensional kept indices
std::vector<index_t> kept_indices(kept_dim.size());
index_t temp_kept = linear_kept_idx;
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = kept_dim.size() - 1 - i;
constexpr auto dim = kept_dim.at(dim_idx);
const auto len = x_lengths[dim];
kept_indices[dim_idx] = temp_kept % len;
temp_kept /= len;
});
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
{
// Convert linear reduce index to multi-dimensional reduce indices
std::vector<index_t> reduce_indices(reduce_dims.size());
index_t temp_reduce = reduce_idx;
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
constexpr auto dim = reduce_dims.at(dim_idx);
const auto len = x_lengths[dim];
reduce_indices[dim_idx] = temp_reduce % len;
temp_reduce /= len;
});
// Build full input tensor indices by combining kept and reduce indices
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
// Access input tensor element
const auto v_a = type_convert<ComputeDataType>(x_tensor(full_indices));
v_acc = reduce_op(v_acc, v_a);
}
// Calculate output tensor index using kept indices
// The output tensor has the same structure as the kept dimensions
std::vector<std::size_t> y_indices(kept_dim.size());
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
y_tensor(y_indices) = type_convert<YDataType>(v_acc);
};
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
}
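// Illustrative usage sketch (not part of the original API): a minimal sum functor with the
// ReduceOp interface this reference expects (GetIdentityValue + binary operator()), used to
// reduce a {M, N} tensor over its last dimension; the functor name and extents are assumptions
// for demonstration only.
struct reference_reduce_example_sum
{
    template <typename T>
    T GetIdentityValue() const
    {
        return static_cast<T>(0);
    }
    template <typename T>
    T operator()(T acc, T v) const
    {
        return acc + v;
    }
};
inline void reference_reduce_example()
{
    const int m = 2, n = 4;
    HostTensor<float> x({m, n});
    HostTensor<float> y({m});
    for(int im = 0; im < m; ++im)
        for(int in = 0; in < n; ++in)
            x(im, in) = 1.f;
    // keep dim 0, reduce over dim 1: every y(im) ends up equal to n
    reference_reduce<float, float, float>(
        x, y, reference_reduce_example_sum{}, sequence<0>{}, sequence<1>{});
}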
template <typename XDataType,
typename ComputeDataType,
typename YDataType,
typename YRefTuple,
typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to
// keep
typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
// to reduce
typename ElementWiseOps,
typename AccElementWiseOps>
CK_TILE_HOST void reference_multiple_reduce(const HostTensor<XDataType>& x_tensor,
YRefTuple& y_tensor_tuple,
ReduceOps reduce_ops,
KeptDim kept_dim,
ReduceDims reduce_dims,
ElementWiseOps elementwise_ops,
AccElementWiseOps accumulator_ops)
{
const auto& x_lengths = x_tensor.mDesc.get_lengths();
// Calculate total kept elements (product of all kept dimension lengths)
index_t total_kept_elements = 1;
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
// Calculate total reduce elements (product of all reduce dimension lengths)
index_t total_reduce_elements = 1;
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
auto f = [&](auto linear_kept_idx) {
// Initialize accumulators for each reduction operation
auto v_acc_tuple = ck_tile::generate_tuple(
[&](auto i) {
return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
},
number<reduce_ops.size()>{});
// Convert linear kept index to multi-dimensional kept indices
std::vector<index_t> kept_indices(kept_dim.size());
index_t temp_kept = linear_kept_idx;
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = kept_dim.size() - 1 - i;
constexpr auto dim = kept_dim.at(dim_idx);
const auto len = x_lengths[dim];
kept_indices[dim_idx] = temp_kept % len;
temp_kept /= len;
});
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
{
// Convert linear reduce index to multi-dimensional reduce indices
std::vector<index_t> reduce_indices(reduce_dims.size());
index_t temp_reduce = reduce_idx;
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
constexpr auto dim = reduce_dims.at(dim_idx);
const auto len = x_lengths[dim];
reduce_indices[dim_idx] = temp_reduce % len;
temp_reduce /= len;
});
// Build full input tensor indices by combining kept and reduce indices
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
// Access input tensor element
            const auto v_a_in = type_convert<ComputeDataType>(x_tensor(full_indices));
            // Apply each reduction operation on its own copy of the input value, so one op's
            // element-wise transform does not leak into the next op (matches the multiblock
            // variant below)
            static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
                auto v_a = v_a_in;
                // Apply element-wise operation before reduction
                elementwise_ops.at(i)(v_a, v_a);
                v_acc_tuple.template at<i>() =
                    reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
            });
}
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
// Apply accumulator element-wise operation after reduction
accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());
});
// Calculate output tensor index using kept indices
// The output tensor has the same structure as the kept dimensions
std::vector<std::size_t> y_indices(kept_dim.size());
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
// Store results for each reduction operation in the output tensor
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
y_tensor_tuple.template at<i>()(y_indices) =
type_convert<YDataType>(v_acc_tuple.template at<i>());
});
};
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
}
template <typename XDataType,
typename ComputeDataType,
typename YDataType,
typename YRefTuple,
typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to
// keep
typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
// to reduce
typename ElementWiseOps,
typename AccElementWiseOps,
typename InterBlockReduceOps>
CK_TILE_HOST void reference_multiple_reduce_multiblock(const HostTensor<XDataType>& x_tensor,
YRefTuple& y_tensor_tuple,
ReduceOps reduce_ops,
KeptDim kept_dim,
ReduceDims reduce_dims,
ElementWiseOps elementwise_ops,
AccElementWiseOps accumulator_ops,
InterBlockReduceOps inter_block_reduce_ops,
ck_tile::index_t num_blocks)
{
const auto& x_lengths = x_tensor.mDesc.get_lengths();
// Calculate total kept elements (product of all kept dimension lengths)
index_t total_kept_elements = 1;
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
// Calculate total reduce elements (product of all reduce dimension lengths)
index_t total_reduce_elements = 1;
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
// Initialize output tensors
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
auto& y_tensor = y_tensor_tuple.template at<i>();
for(auto& val : y_tensor.mData)
{
val = inter_block_reduce_ops.template at<i>().template GetIdentityValue<YDataType>();
}
});
auto f = [&](auto linear_kept_idx) {
// Convert linear kept index to multi-dimensional kept indices
std::vector<index_t> kept_indices(kept_dim.size());
index_t temp_kept = linear_kept_idx;
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = kept_dim.size() - 1 - i;
constexpr auto dim = kept_dim.at(dim_idx);
const auto len = x_lengths[dim];
kept_indices[dim_idx] = temp_kept % len;
temp_kept /= len;
});
// Calculate output tensor index using kept indices
std::vector<std::size_t> y_indices(kept_dim.size());
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
const auto max_element_per_block = (total_reduce_elements + num_blocks - 1) / num_blocks;
for(index_t block_id = 0; block_id < num_blocks; ++block_id)
{
// Initialize accumulators for each reduction operation for the current block
auto v_acc_tuple = ck_tile::generate_tuple(
[&](auto i) {
return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
},
number<reduce_ops.size()>{});
const index_t element_offset = block_id * max_element_per_block;
const index_t element_end =
std::min(element_offset + max_element_per_block, total_reduce_elements);
for(index_t linear_reduce_idx = element_offset; linear_reduce_idx < element_end;
++linear_reduce_idx)
{
// Convert linear reduce index to multi-dimensional reduce indices
std::vector<index_t> reduce_indices(reduce_dims.size());
index_t temp_reduce = linear_reduce_idx;
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
constexpr auto dim = reduce_dims.at(dim_idx);
const auto len = x_lengths[dim];
reduce_indices[dim_idx] = temp_reduce % len;
temp_reduce /= len;
});
// Build full input tensor indices by combining kept and reduce indices
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
static_for<0, kept_dim.size(), 1>{}(
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
static_for<0, reduce_dims.size(), 1>{}(
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
// Access input tensor element
const auto v_a_in = type_convert<ComputeDataType>(x_tensor(full_indices));
// Apply each reduction operation
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
auto v_a = v_a_in;
// Apply element-wise operation before reduction
elementwise_ops.at(i)(v_a, v_a);
v_acc_tuple.template at<i>() =
reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
});
}
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
// Apply accumulator element-wise operation after reduction
accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());
// Update the output tensor with the partial result from this block
auto& y_tensor = y_tensor_tuple.template at<i>();
auto& y_val = y_tensor(y_indices);
y_val = inter_block_reduce_ops.template at<i>()(
y_val, type_convert<YDataType>(v_acc_tuple.template at<i>()));
});
}
};
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,114 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_traits.hpp"
namespace ck_tile {
// Note: for simplicity, each functor only cares about a single M
struct reference_rmsnorm2d_default_epilogue
{
template <typename OutDataType, typename AccDataType>
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
{
const int N = acc.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
}
}
template <typename OutDataType, typename AccDataType>
auto operator()(int m, const HostTensor<AccDataType>& acc)
{
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
operator()(m, o, acc);
return o;
}
};
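// Reference math (for clarity): per row, inv_rms = 1 / sqrt(mean(x^2) + epsilon) and
// y(m, n) = x(m, n) * inv_rms(m) * gamma(n), before the epilogue/quantization step.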
template <typename XDataType,
typename GammaDataType,
typename ComputeDataType,
typename YDataType,
typename InvRmsDataType,
typename UnquantYDataType,
typename Epilogue = reference_rmsnorm2d_default_epilogue>
void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
const HostTensor<GammaDataType>& gamma_n,
HostTensor<YDataType>& y_m_n,
HostTensor<InvRmsDataType>& invRms_m,
HostTensor<UnquantYDataType>& unquant_y_m_n,
ComputeDataType epsilon,
Epilogue epilogue_functor = {},
const int use_model_sensitive_rmsnorm =
static_cast<int>(Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL))
{
auto rmsnorm2d_fwd_func = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
ComputeDataType mean_square = 0;
ComputeDataType divisor = 0;
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
mean_square += x * x;
}
mean_square = mean_square / N;
divisor = ck_tile::type_convert<ComputeDataType>(1) / ck_tile::sqrt(mean_square + epsilon);
if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
if(use_model_sensitive_rmsnorm ==
static_cast<int>(
Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL)) // 0: for no specific model
{
acc(m, n) = x * divisor * gamma;
}
else if(use_model_sensitive_rmsnorm ==
static_cast<int>(Rmsnorm2dSensitiveEnum::T5_MODEL_LIKE)) // 1: for T5-like model
{
if constexpr(std::is_same_v<XDataType, ck_tile::bf16_t>)
{
const auto tmp0 = float_to_bf16<bf16_rounding_mode::standard>(x * divisor);
const auto tmp1 = float_to_bf16<bf16_rounding_mode::standard>(
type_convert<ComputeDataType>(tmp0) * gamma);
const auto rmsn_ = type_convert<ComputeDataType>(tmp1);
acc(m, n) = rmsn_;
}
else
{
const auto tmp = type_convert<XDataType>(x * divisor);
const auto rmsn_ = type_convert<ComputeDataType>(tmp) * gamma;
acc(m, n) = rmsn_;
}
}
}
if constexpr(!std::is_same_v<UnquantYDataType, ck_tile::null_type>)
{
epilogue_functor(m, unquant_y_m_n, y_m_n, acc);
}
else
{
epilogue_functor(m, y_m_n, acc);
}
};
make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename XDataType, typename ScaleDataType, typename QXDataType>
CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>& x_m_n,
const HostTensor<ScaleDataType>& scale_m,
HostTensor<QXDataType>& qx_m_n)
{
auto f = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
auto v_x = x_m_n(m, n);
// scale = amax / 127 for int8
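            // e.g. if a row's amax is 12.7 and the target is int8, scale_m(m) = 0.1 and an
            // input of 6.35 maps to 63.5 before saturation/conversion to the quantized type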
auto v_scale = type_convert<XDataType>(scale_m(m));
auto v_qx = v_x / v_scale;
qx_m_n(m, n) = type_convert<QXDataType>(saturates<QXDataType>{}(v_qx));
}
};
make_ParallelTensorFunctor(f,
scale_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,89 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST void
reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
{
index_t rank = x.get_num_of_dimension();
assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
assert(dim == -1 || dim < rank);
index_t target_dim = dim == -1 ? (rank - 1) : dim;
index_t softmax_len = x.get_length(target_dim);
index_t n_parallel = x.get_element_size() / softmax_len;
auto x_len = x.get_lengths();
auto f = [&](auto i_element) {
std::vector<size_t> coord = [&]() {
std::vector<size_t> t_(rank, 0);
size_t r = i_element;
for(index_t i = rank - 1; i >= 0; i--)
{
if(i == target_dim)
continue;
t_[i] = r % x_len[i];
r = r / x_len[i];
}
return t_;
}();
ComputeType v_max = -ck_tile::numeric<ComputeType>::infinity();
// compute max
for(auto idx = 0; idx < softmax_len; idx++)
{
auto c_ = coord;
c_[target_dim] = idx;
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
v_max = v_max < v_x ? v_x : v_max;
}
ComputeType v_exp_sum = static_cast<ComputeType>(0);
// sum
for(auto idx = 0; idx < softmax_len; idx++)
{
auto c_ = coord;
c_[target_dim] = idx;
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
v_exp_sum += ck_tile::exp(v_x - v_max);
}
// elementwise
for(auto idx = 0; idx < softmax_len; idx++)
{
auto c_ = coord;
c_[target_dim] = idx;
const ComputeType v_x = ck_tile::type_convert<ComputeType>(x(c_));
auto out = ck_tile::exp(v_x - v_max) / v_exp_sum;
y(c_) = ck_tile::type_convert<OutputType>(out);
}
};
make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
}
template <typename InputType, typename ComputeType, typename OutputType = ComputeType>
CK_TILE_HOST auto reference_softmax(const HostTensor<InputType>& x, index_t dim = -1)
{
HostTensor<OutputType> y(x.get_lengths(), x.get_strides());
reference_softmax<InputType, ComputeType, OutputType>(x, y, dim);
return y;
}
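// Illustrative usage sketch (not part of the original API): softmax over the last dimension of
// a small fp32 tensor; the function name, extents and fill values are assumptions for
// demonstration only.
inline void reference_softmax_example()
{
    HostTensor<float> x({2, 4});
    for(std::size_t i = 0; i < x.get_element_size(); i++)
        x.mData[i] = static_cast<float>(i % 4);
    // each row of y sums to 1
    auto y = reference_softmax<float, float>(x);
    (void)y;
}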
} // namespace ck_tile

View File

@@ -0,0 +1,125 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
#include <numeric>
#include <functional>
#include <utility>
#include <algorithm>
namespace ck_tile {
/*
 similar to torch.topk()
x (Tensor) the input tensor.
k (int) the k in “top-k”
dim (int, optional) the dimension to sort along
 largest (bool, optional) controls whether the largest or smallest elements are returned
 sorted (bool, optional) controls whether to return the elements in sorted order
output:
y_values
y_indices
https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/TopKImpl.h
*/
template <typename DataType, typename IndexType = index_t>
CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
HostTensor<DataType>& y_values,
HostTensor<IndexType>& y_indices,
index_t k,
index_t dim = -1,
bool largest = true,
bool sorted = true)
{
// rank must be the same
index_t rank = x.get_num_of_dimension();
assert(static_cast<std::size_t>(rank) == y_values.get_num_of_dimension());
assert(static_cast<size_t>(rank) == y_indices.get_num_of_dimension());
assert(dim == -1 || dim < rank);
index_t topk_dim = dim == -1 ? (rank - 1) : dim;
index_t topk_src_len = x.get_length(topk_dim);
auto x_len = x.get_lengths();
assert(k <= topk_src_len);
assert(static_cast<size_t>(k) == y_values.get_length(topk_dim) &&
static_cast<size_t>(k) == y_indices.get_length(topk_dim));
index_t n_parallel = x.get_element_size() / topk_src_len;
// clang-format off
auto f = [&](auto i_element) {
std::vector<size_t> topk_coord = [&](){
std::vector<size_t> t_(rank, 0);
size_t r = i_element;
for(index_t i = rank - 1; i >= 0; i--) {
if(i == topk_dim) continue; // topk dim should be zero
t_[i] = r % x_len[i]; r = r / x_len[i];
}
return t_;
}();
using elem_t = std::pair<DataType, IndexType>;
std::vector<elem_t> q = [&](){
std::vector<elem_t> t_(topk_src_len);
for(index_t i = 0; i < topk_src_len; i++) {
auto c_ = topk_coord; c_[topk_dim] = i;
t_[i].first = x(c_); t_[i].second = i;
}
return t_;
}();
// run topk
if(largest) {
std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
if(sorted) {
std::sort(q.begin(), q.begin() + k - 1,
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first > rhs.first; });
}
} else {
std::nth_element(q.begin(), q.begin() + k - 1, q.end(),
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
if(sorted) {
std::sort(q.begin(), q.begin() + k - 1,
[](const elem_t& lhs, const elem_t& rhs) -> bool { return lhs.first < rhs.first; });
}
}
// write out
for(index_t i = 0; i < k; i++) {
auto c_ = topk_coord; c_[topk_dim] = i;
y_values(c_) = q[i].first; y_indices(c_) = q[i].second;
}
};
// clang-format on
make_ParallelTensorFunctor(f, n_parallel)(std::thread::hardware_concurrency());
}
// TODO: if using this method, the returned tensors will be dense (no custom stride)
template <typename DataType, typename IndexType = index_t>
CK_TILE_HOST auto reference_topk(const HostTensor<DataType>& x,
index_t k,
index_t dim = -1,
bool largest = true,
bool sorted = true)
{
auto lens = x.get_lengths();
index_t target_dim = (dim == -1) ? (lens.size() - 1) : dim;
assert(target_dim < lens.size());
assert(k <= lens[target_dim]);
lens[target_dim] = k;
HostTensor<DataType> y_values(lens);
HostTensor<IndexType> y_indices(lens);
reference_topk<DataType, IndexType>(x, y_values, y_indices, k, dim, largest, sorted);
return ck_tile::make_tuple(y_values, y_indices);
}
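// Illustrative usage sketch (not part of the original API): top-2 along the last dimension of
// a small fp32 tensor; the function name, extents and fill values are assumptions for
// demonstration only.
inline void reference_topk_example()
{
    const index_t m = 2, n = 5, k = 2;
    HostTensor<float> x({m, n});
    for(std::size_t i = 0; i < x.get_element_size(); i++)
        x.mData[i] = static_cast<float>((7 * i) % 5);
    HostTensor<float> y_values({m, k});
    HostTensor<index_t> y_indices({m, k});
    // defaults: dim = -1 (last dimension), largest = true, sorted = true
    reference_topk(x, y_values, y_indices, k);
}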
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename ADataType, typename BDataType>
void reference_transpose_elementwise(const HostTensor<ADataType>& a, HostTensor<BDataType>& b)
{
ck_tile::index_t M = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[0]);
ck_tile::index_t N = static_cast<ck_tile::index_t>(a.mDesc.get_lengths()[1]);
// Ensure the b tensor is sized correctly for N x M
if(static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[0]) != N ||
static_cast<ck_tile::index_t>(b.mDesc.get_lengths()[1]) != M)
{
throw std::runtime_error("Output tensor b has incorrect dimensions for transpose.");
}
auto f = [&](auto i, auto j) {
auto v_a = a(i, j);
b(j, i) = ck_tile::type_convert<BDataType>(v_a);
};
make_ParallelTensorFunctor(f, M, N)(std::thread::hardware_concurrency());
}
} // namespace ck_tile

View File

@@ -0,0 +1,132 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/config.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <hip/hip_runtime.h>
namespace ck_tile {
// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
//
// Purpose:
// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
// timing measurements. This wrapper rotates through multiple copies of buffers at different
// memory addresses to force cache misses.
//
// How it works:
// Constructor: Creates rotating_count copies of matrices A and B in GPU memory
// Next(): Switches pointers to the next buffer copy (cycles through all copies)
// Destructor: Frees extra buffer copies and restores original pointers
//
// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
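//
// Illustrative benchmarking-loop sketch (assumptions: dev_a/dev_b are existing device buffers
// of size_a/size_b bytes and run_once() stands in for the actual kernel launch that consumes
// the rotated buffers):
//
//   RotatingMemWrapper<ADataType, BDataType> rot(dev_a, dev_b, /*rotating_count*/ 4, size_a, size_b);
//   for(int i = 0; i < nrepeat; ++i)
//   {
//       rot.Next();     // switch to a different buffer copy -> cold data cache
//       flush_icache(); // also start from a cold instruction cache
//       run_once();
//   }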
template <typename ADataType, typename BDataType>
struct RotatingMemWrapper
{
RotatingMemWrapper() = delete;
RotatingMemWrapper(const void* a_ptr_,
const void* b_ptr_,
std::size_t rotating_count_hint,
std::size_t size_a_,
std::size_t size_b_)
: a_ptr(a_ptr_),
b_ptr(b_ptr_),
rotating_count(rotating_count_hint),
size_a(size_a_),
size_b(size_b_)
{
// Store original buffer pointers as first entry
p_a_grids.push_back(a_ptr);
p_b_grids.push_back(b_ptr);
        // limit the rotating count to prevent OOM
const uint64_t footprint = (size_a + size_b);
const uint64_t max_rotating_count = (1ULL << 31) / footprint;
rotating_count = std::min(rotating_count, max_rotating_count);
// Create (rotating_count - 1) additional copies at different memory addresses
for(size_t i = 1; i < rotating_count; i++)
{
{
void* pADeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
const_cast<void*>(p_a_grids[0]), // source buffer
size_a_,
hipMemcpyDeviceToDevice));
p_a_grids.push_back(pADeviceBuf);
}
{
void* pBDeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
const_cast<void*>(p_b_grids[0]), // source buffer
size_b_,
hipMemcpyDeviceToDevice));
p_b_grids.push_back(pBDeviceBuf);
}
}
}
// Rotate to the next buffer copy. Call this before each kernel run to use different
// memory addresses, forcing the GPU to fetch data from HBM instead of cache.
void Next()
{
if(rotating_count > 1)
{
std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
a_ptr = p_a_grids[idx];
b_ptr = p_b_grids[idx];
}
}
void Print()
{
std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
<< ", rotating_count: " << rotating_count << "}" << std::endl;
}
// Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
~RotatingMemWrapper() noexcept
{
if(rotating_count > 1)
{
// Restore original buffer pointers
a_ptr = p_a_grids[0];
b_ptr = p_b_grids[0];
// Free extra buffer copies (index 0 is the original, don't free it)
for(size_t i = 1; i < rotating_count; i++)
{
ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
ck_tile::hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
}
}
}
private:
const void* a_ptr;
const void* b_ptr;
std::size_t iter = 0;
std::size_t rotating_count = 1;
std::size_t size_a = 0;
std::size_t size_b = 0;
std::vector<const void*> p_a_grids;
std::vector<const void*> p_b_grids;
};
inline void flush_icache()
{
hipDeviceProp_t deviceProps;
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
// Over-provision blocks to ensure all CUs execute the flush instruction.
// With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
// 60x over-provisioning provides statistical certainty that every CU gets at least one block.
constexpr int32_t blocks_per_cu = 60;
int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
HIP_CHECK_ERROR(hipGetLastError());
}
} // namespace ck_tile

View File

@@ -0,0 +1,40 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime.h>
namespace ck_tile {
/*
* construct this structure with behavior as:
*
* // create stream config with default stream(NULL), and not timing the kernel
* stream_config s = stream_config{};
*
* // create stream config with _some_stream_id_, and not timing the kernel
* stream_config s = stream_config{_some_stream_id_};
*
* // create stream config with _some_stream_id_, and benchmark with warmup/repeat as default
* stream_config s = stream_config{_some_stream_id_, true};
*
* // create stream config with _some_stream_id_, and benchmark using cpu timer
* stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
*
 * // create stream config with _some_stream_id_, enable the gpu timer and cache flushing, and
 * // use a rotating buffer with the given rotating count
 * stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, true, true, 1};
**/
struct stream_config
{
hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false;
int log_level_ = 0;
int cold_niters_ = 3;
int nrepeat_ = 10;
    bool is_gpu_timer_ = true; // kept for backward compatibility
bool flush_cache_ = false;
int rotating_count_ = 1;
};
} // namespace ck_tile

View File

@@ -0,0 +1,45 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <hip/hip_runtime_api.h>
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/hip_check_error.hpp"
namespace ck_tile {
static inline index_t get_available_compute_units(const stream_config& s)
{
constexpr static uint32_t MAX_MASK_DWORDS = 64;
// assume at most 64*32 = 2048 CUs
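    // e.g. two populated mask dwords 0xffffffff and 0x0000ffff yield 32 + 16 = 48 usable CUs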
uint32_t cu_mask[MAX_MASK_DWORDS]{};
auto count_set_bits = [](uint32_t dword) {
index_t count = 0;
while(dword != 0)
{
if(dword & 0x1)
{
count++;
}
dword = dword >> 1;
}
return count;
};
HIP_CHECK_ERROR(hipExtStreamGetCUMask(s.stream_id_, MAX_MASK_DWORDS, &cu_mask[0]));
index_t num_cu = 0;
for(uint32_t i = 0; i < MAX_MASK_DWORDS; i++)
{
num_cu += count_set_bits(cu_mask[i]);
}
return num_cu;
};
} // namespace ck_tile

View File

@@ -0,0 +1,186 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "device_prop.hpp"
#include <stdexcept>
namespace ck_tile {
template <typename T>
auto shuffle_aq(const ck_tile::HostTensor<T>* t, int block_aq_k)
{
if(t->get_lengths().size() != 2)
{
throw std::runtime_error("Host tensor is not rank 2 tensor.");
}
int m_ = t->get_lengths()[0];
int aqk_ = t->get_lengths()[1];
if(aqk_ % block_aq_k != 0)
{
throw std::runtime_error("shuffle_aq needs a aqk of multiple times of block_aq_k.");
}
ck_tile::HostTensor<T> t_view({m_, aqk_ / block_aq_k, block_aq_k});
std::copy(t->begin(), t->end(), t_view.begin());
return ck_tile::reference_permute(t_view, {1, 0, 2});
}
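// e.g. shuffle_aq on a [m, aqk] = [3, 8] scale tensor with block_aq_k = 2 views it as
// [3, 4, 2] and permutes it to [aqk / block_aq_k, m, block_aq_k] = [4, 3, 2], so each
// k-block's scales for all m rows become contiguous.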
template <typename T>
auto shuffle_bq(const ck_tile::HostTensor<T>* t, int block_bq_k)
{
const auto& lengths = t->get_lengths();
const size_t rank = lengths.size();
// Validate block_bq_k divisibility based on rank
int bqk_dim = (rank == 5) ? lengths[4] : (rank == 2) ? lengths[0] : -1;
if(bqk_dim < 0)
{
throw std::runtime_error("shuffle_bq expects either rank-2 or rank-5 tensor, got rank " +
std::to_string(rank));
}
if(bqk_dim % block_bq_k != 0)
{
throw std::runtime_error("shuffle_bq needs bqk dimension to be a multiple of block_bq_k.");
}
// For TilePermuteN
if(rank == 5)
{
// Handle 5D tensor: [n, nrepeat, nwarp, n_warp_tile, bqk]
ck_tile::HostTensor<T> t_view({static_cast<int>(lengths[0]),
static_cast<int>(lengths[1]),
static_cast<int>(lengths[2]),
static_cast<int>(lengths[3]),
bqk_dim / block_bq_k,
block_bq_k});
std::copy(t->begin(), t->end(), t_view.begin());
return ck_tile::reference_permute(t_view, {4, 0, 1, 2, 3, 5});
}
else // rank == 2
{
// Handle 2D tensor: [bqk, n]
int n_ = lengths[1];
ck_tile::HostTensor<T> t_view({n_, bqk_dim / block_bq_k, block_bq_k});
std::copy(t->begin(), t->end(), t_view.begin());
return ck_tile::reference_permute(t_view, {1, 0, 2});
}
}
template <typename GemmConfig, typename T>
auto shuffle_b(const ck_tile::HostTensor<T>& t, GemmConfig)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int k_ = t.get_lengths()[0];
if(ck_tile::is_gfx12_supported())
{
constexpr int divisor = 2;
constexpr int kABK1PerLane = 8;
int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
GemmConfig::N_Warp_Tile,
k_ / GemmConfig::K_Warp_Tile,
kABK0PerLane,
divisor,
kABK1PerLane});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
}
else if(ck_tile::is_gfx11_supported())
{
int divisor = 1;
ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
GemmConfig::N_Warp_Tile,
k_ / GemmConfig::K_Warp_Tile,
divisor,
GemmConfig::K_Warp_Tile / divisor});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
}
else
{
constexpr int KLane = ck_tile::get_warp_size() / GemmConfig::N_Warp_Tile;
constexpr int ItemsPerAccess =
std::min(16 / static_cast<int>(sizeof(T)), GemmConfig::K_Warp_Tile / KLane);
ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
GemmConfig::N_Warp_Tile,
k_ / ItemsPerAccess,
ItemsPerAccess});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 2, 1, 3});
}
}
template <typename GemmConfig, typename T>
auto shuffle_b(const ck_tile::HostTensor<T>& t)
{
return shuffle_b(t, GemmConfig{});
}
template <typename GemmConfig, typename T>
auto bq_permuteN(const ck_tile::HostTensor<T>& t, index_t group_n)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int bqk_ = t.get_lengths()[0];
constexpr int NRepeat = GemmConfig::N_Tile / GemmConfig::N_Warp_Tile / GemmConfig::N_Warp;
ck_tile::HostTensor<T> t_view({n_ / (GemmConfig::N_Tile / group_n),
GemmConfig::N_Warp,
GemmConfig::N_Warp_Tile / group_n,
NRepeat,
bqk_});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 3, 1, 2, 4});
}
template <typename GemmConfig, typename T>
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t, const GemmConfig& gemmConfig)
{
assert(t.get_lengths().size() == 2);
int n_ = t.get_lengths()[1];
int k_ = t.get_lengths()[0];
int NRepeat = gemmConfig.N_Tile / gemmConfig.N_Warp_Tile / gemmConfig.N_Warp;
if(ck_tile::is_gfx12_supported())
{
constexpr int divisor = 2;
constexpr int kABK1PerLane = 8;
int kABK0PerLane = gemmConfig.K_Warp_Tile / divisor / kABK1PerLane;
ck_tile::HostTensor<T> t_view({n_ / gemmConfig.N_Tile,
gemmConfig.N_Warp,
gemmConfig.N_Warp_Tile,
NRepeat,
k_ / gemmConfig.K_Warp_Tile,
kABK0PerLane,
divisor,
kABK1PerLane});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 6, 5, 2, 7});
}
else
{
constexpr int KLane = ck_tile::get_warp_size() / GemmConfig::N_Warp_Tile;
constexpr int ItemsPerAccess =
std::min(16 / static_cast<int>(sizeof(T)), GemmConfig::K_Warp_Tile / KLane);
ck_tile::HostTensor<T> t_view({n_ / gemmConfig.N_Tile,
gemmConfig.N_Warp,
gemmConfig.N_Warp_Tile,
NRepeat,
k_ / ItemsPerAccess,
ItemsPerAccess});
std::copy(t.begin(), t.end(), t_view.begin());
return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 2, 5});
}
}
template <typename GemmConfig, typename T>
auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t)
{
return shuffle_b_permuteN(t, GemmConfig{});
}
} // namespace ck_tile

View File

@@ -0,0 +1,77 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core/config.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include "ck_tile/host/high_res_cpu_clock.hpp"
#include <hip/hip_runtime.h>
#include <cstddef>
namespace ck_tile {
struct gpu_timer
{
CK_TILE_HOST gpu_timer()
{
HIP_CHECK_ERROR(hipEventCreate(&start_evt));
HIP_CHECK_ERROR(hipEventCreate(&stop_evt));
}
CK_TILE_HOST ~gpu_timer() noexcept(false)
{
HIP_CHECK_ERROR(hipEventDestroy(start_evt));
HIP_CHECK_ERROR(hipEventDestroy(stop_evt));
}
CK_TILE_HOST void start(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
HIP_CHECK_ERROR(hipEventRecord(start_evt, s));
}
CK_TILE_HOST void stop(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipEventRecord(stop_evt, s));
HIP_CHECK_ERROR(hipEventSynchronize(stop_evt));
}
// return in ms
CK_TILE_HOST float duration() const
{
float ms = 0;
HIP_CHECK_ERROR(hipEventElapsedTime(&ms, start_evt, stop_evt));
return ms;
}
private:
hipEvent_t start_evt, stop_evt;
};
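// Illustrative timing sketch (assumption: launch_kernel(stream) stands in for an actual kernel
// launch; gpu_timer and cpu_timer share the same start/stop/duration interface):
//
//   gpu_timer timer;
//   timer.start(stream);
//   launch_kernel(stream);
//   timer.stop(stream);
//   float ms = timer.duration();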
struct cpu_timer
{
    // like torch.utils.benchmark.Timer(), there is a sync inside each timer callback
CK_TILE_HOST void start(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
start_tick = high_res_now();
}
    // like torch.utils.benchmark.Timer(), there is a sync inside each timer callback
CK_TILE_HOST void stop(const hipStream_t& s)
{
HIP_CHECK_ERROR(hipStreamSynchronize(s));
stop_tick = high_res_now();
}
// return in ms
CK_TILE_HOST float duration() const
{
auto us = duration_us(start_tick, stop_tick);
return static_cast<float>(us) / 1e3;
}
private:
timepoint_t start_tick;
timepoint_t stop_tick;
};
} // namespace ck_tile