diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
index 03c531c1ad..10dd4eaa1f 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -190,11 +191,11 @@ int main(int argc, char* argv[])
            using namespace ck::literals;
 
            if(std::is_same<decltype(layout), Row>::value)
            {
-               return HostTensorDescriptor({row, col}, {stride, 1_uz});
+               return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-               return HostTensorDescriptor({row, col}, {1_uz, stride});
+               return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };
diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
index 5167097b6d..556aa90f3d 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
 using I8  = std::int8_t;
 using I32 = std::int32_t;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -190,11 +191,11 @@ int main(int argc, char* argv[])
            using namespace ck::literals;
 
            if(std::is_same<decltype(layout), Row>::value)
            {
-               return HostTensorDescriptor({row, col}, {stride, 1_uz});
+               return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-               return HostTensorDescriptor({row, col}, {1_uz, stride});
+               return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };
diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
index 1049b5d07c..8f8b2e80fe 100644
--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -42,8 +42,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -173,7 +174,7 @@ int main(int argc, char* argv[])
         printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
                "beta\n");
-        exit(0);
+        exit(1);
     }
 
     auto f_host_tensor_descriptor =
@@ -182,11 +183,11 @@ int main(int argc, char* argv[])
            using namespace ck::literals;
 
            if(std::is_same<decltype(layout), Row>::value)
            {
-               return HostTensorDescriptor({row, col}, {stride, 1_uz});
+               return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-               return HostTensorDescriptor({row, col}, {1_uz, stride});
+               return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };
diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
index 992e7c19c8..17e9ceccec 100644
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -160,23 +161,22 @@ int main(int argc, char* argv[])
            using namespace ck::literals;
 
            if(std::is_same<decltype(layout), Row>::value)
            {
-               return HostTensorDescriptor({row, col}, {stride, 1_uz});
+               return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
            }
            else
            {
-               return HostTensorDescriptor({row, col}, {1_uz, stride});
+               return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
            }
        };
 
+    ck::index_t StrideD = 0;
+
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, ELayout{}));
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
-    const auto StrideD = std::is_same<ELayout, Row>::value
-                             ? d_m_n.mDesc.GetStrides()[0]
-                             : d_m_n.mDesc.GetStrides()[1];
 
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
     std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
index 796a5d3e9b..c05e0d19aa 100644
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -6,6 +6,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
     static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
 #endif
     using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
     ProblemSize ps =
         problem_size; // make mutable copy because default stride values of 0 need to be updated
@@ -15,11 +16,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
             if constexpr(std::is_same_v<decltype(layout), Row>)
             {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
             }
             else
             {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
             }
         };
 
@@ -43,7 +44,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
     std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
     std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
 
-    // If any user-provided leading stride <= 0, replace it with the one determined by the
+    // If any user-provided leading stride < 0, replace it with the one determined by the
     // created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
     auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
         if constexpr(std::is_same_v<decltype(layout_tag), Row>)
@@ -56,15 +57,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
         }
     };
 
-    if(StrideA <= 0)
+    if(StrideA < 0)
         StrideA = fetch_leading_stride(a_m_k, ALayout{});
-    if(StrideB <= 0)
+    if(StrideB < 0)
         StrideB = fetch_leading_stride(b_k_n, BLayout{});
-    if(StrideD0 <= 0)
+    if(StrideD0 < 0)
         StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
-    if(StrideD1 <= 0)
+    if(StrideD1 < 0)
         StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
-    if(StrideE <= 0)
+    if(StrideE < 0)
         StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
 
     switch(config.init_method)
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index 4a701e7792..f4e6b4d6e3 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using ADataType = F16;
 using BDataType = F16;
@@ -138,12 +139,12 @@ int main(int argc, char* argv[])
 
         if(std::is_same<decltype(layout), Row>::value)
         {
             return HostTensorDescriptor(
-                {batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
+                {batch_count, row, col}, {row * stride, stride, 1_uz}, Bypass{});
         }
         else
         {
             return HostTensorDescriptor(
-                {batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
+                {batch_count, row, col}, {col * stride, 1_uz, stride}, Bypass{});
         }
     };
diff --git a/example/24_batched_gemm/run_batched_gemm_example.inc b/example/24_batched_gemm/run_batched_gemm_example.inc
index 182ab8d967..666f17ca08 100644
--- a/example/24_batched_gemm/run_batched_gemm_example.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example.inc
@@ -31,6 +31,7 @@ struct ExecutionConfig final
 bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
     using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 #if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
     static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
 #endif
 
@@ -62,12 +63,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 
         if(std::is_same<decltype(layout), Row>::value)
         {
             return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
         }
         else
         {
             return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
         }
     };
diff --git a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
index 5e56670fcf..34164b27d1 100644
--- a/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_fp16int4_b_scale.inc
@@ -116,6 +116,7 @@ inline __host__ __device__ constexpr double get_atol()
 bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
     using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
     auto& [M,
            N,
@@ -138,12 +139,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
 
         if constexpr(std::is_same_v<decltype(layout), Row>)
         {
             return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
         }
         else
         {
             return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
         }
     };
diff --git a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
index 6ed0b23407..1efbfbd540 100644
--- a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
@@ -37,6 +37,7 @@ struct ExecutionConfig final
 bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
     using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
     auto& [M,
            N,
@@ -65,12 +66,12 @@ bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionCo
 
         if(std::is_same<decltype(layout), Row>::value)
         {
             return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
         }
         else
         {
             return HostTensorDescriptor(
-                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
         }
     };
diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
index 7a03e9cacf..40cec7ef11 100644
--- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
+++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
+
 bool run_batched_gemm_gemm_example(int argc, char* argv[])
 {
     bool do_verification = true;
@@ -111,12 +113,12 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
 
         if(std::is_same<decltype(layout), Row>::value)
         {
             return HostTensorDescriptor(
-                {batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
+                {batch_count, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
         }
         else
         {
             return HostTensorDescriptor(
-                {batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
+                {batch_count, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
         }
     };
diff --git a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc b/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
index bba6ae14a4..a3e1f325bd 100644
--- a/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+++ b/example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
@@ -4,27 +4,21 @@
 bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
     using namespace ck::literals;
+    using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
-    ProblemSize ps =
-        problem_size; // make mutable copy because default stride values of 0 need to be updated
-    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
+    auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
 
-    auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
-        if(std::is_same<decltype(layout), Row>::value)
-        {
-            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
-            if(stride <= 0)
-                stride = desc.GetStrides()[0];
-            return desc;
-        }
-        else
-        {
-            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
-            if(stride <= 0)
-                stride = desc.GetStrides()[1];
-            return desc;
-        }
-    };
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), Row>)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
+            }
+        };
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
index 055d253042..63343df3a8 100644
--- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
+++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
@@ -27,8 +27,9 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F32  = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Add         = ck::tensor_operation::element_wise::Add;
 
@@ -110,11 +111,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
        using namespace ck::literals;
 
        if(std::is_same<decltype(layout), Row>::value)
        {
-           return HostTensorDescriptor({row, col}, {stride, 1_uz});
+           return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
-           return HostTensorDescriptor({row, col}, {1_uz, stride});
+           return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };
diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
index 1ba8133ea7..78f7d954f0 100644
--- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp
@@ -26,8 +26,9 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Add         = ck::tensor_operation::element_wise::Add;
 
@@ -109,11 +110,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
        using namespace ck::literals;
 
        if(std::is_same<decltype(layout), Row>::value)
        {
-           return HostTensorDescriptor({row, col}, {stride, 1_uz});
+           return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
-           return HostTensorDescriptor({row, col}, {1_uz, stride});
+           return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp
index a30314f58c..d40d09540f 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp
@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F32  = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = BF16;
 using AsDataType = ck::Tuple<A0DataType>;
 
@@ -161,11 +162,11 @@ int main(int argc, char* argv[])
        using namespace ck::literals;
 
        if(std::is_same<decltype(layout), Row>::value)
        {
-           return HostTensorDescriptor({row, col}, {stride, 1_uz});
+           return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
-           return HostTensorDescriptor({row, col}, {1_uz, stride});
+           return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp
index 086a0f4834..102b7f50de 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp
@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F32  = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = BF16;
 using AsDataType = ck::Tuple<A0DataType>;
 
@@ -157,11 +158,11 @@ int main(int argc, char* argv[])
        using namespace ck::literals;
 
        if(std::is_same<decltype(layout), Row>::value)
        {
-           return HostTensorDescriptor({row, col}, {stride, 1_uz});
+           return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
-           return HostTensorDescriptor({row, col}, {1_uz, stride});
+           return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp
index 32345d1263..aeaa5fe776 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp
@@ -24,7 +24,8 @@ using S = ck::Sequence<Is...>;
 using F16 = ck::half_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
@@ -220,11 +221,11 @@ int main(int argc, char* argv[])
        using namespace ck::literals;
 
        if(std::is_same<decltype(layout), Row>::value)
        {
-           return HostTensorDescriptor({row, col}, {stride, 1_uz});
+           return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
-           return HostTensorDescriptor({row, col}, {1_uz, stride});
+           return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp
index 00e2d7e33c..9363953a6e 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp
@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F32  = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = BF16;
 using AsDataType = ck::Tuple<A0DataType>;
 
@@ -160,11 +161,11 @@ int main(int argc, char* argv[])
        using namespace ck::literals;
 
        if(std::is_same<decltype(layout), Row>::value)
        {
-           return HostTensorDescriptor({row, col}, {stride, 1_uz});
+           return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
-           return HostTensorDescriptor({row, col}, {1_uz, stride});
+           return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp
index 405eac7df1..a599f9d032 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp
@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F32  = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = BF16;
 using AsDataType = ck::Tuple<A0DataType>;
 
@@ -121,27 +122,19 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    auto f_host_tensor_descriptor = [](std::size_t row,
-                                       std::size_t col,
-                                       ck::index_t& stride,
-                                       auto layout) {
-        using namespace ck::literals;
-
-        if(std::is_same<decltype(layout), Row>::value)
-        {
-            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
-            if(stride <= 0)
-                stride = desc.GetStrides()[0];
-            return desc;
-        }
-        else
-        {
-            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
-            if(stride <= 0)
-                stride = desc.GetStrides()[1];
-            return desc;
-        }
-    };
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), Row>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
+            }
+        };
 
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
     Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp
index 50e670bdf3..d7e316e1e0 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp
@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F32  = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = BF16;
 using AsDataType = ck::Tuple<A0DataType>;
 
@@ -121,27 +122,19 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    auto f_host_tensor_descriptor = [](std::size_t row,
-                                       std::size_t col,
-                                       ck::index_t& stride,
-                                       auto layout) {
-        using namespace ck::literals;
-
-        if(std::is_same<decltype(layout), Row>::value)
-        {
-            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
-            if(stride <= 0)
-                stride = desc.GetStrides()[0];
-            return desc;
-        }
-        else
-        {
-            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
-            if(stride <= 0)
-                stride = desc.GetStrides()[1];
-            return desc;
-        }
-    };
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), Row>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
+            }
+        };
 
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
     Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
index 50e1c21c8f..83cc61284e 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
 using I8   = int8_t;
 using F32  = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = BF16;
 using AsDataType = ck::Tuple<A0DataType>;
 
@@ -120,27 +121,19 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
-    auto f_host_tensor_descriptor = [](std::size_t row,
-                                       std::size_t col,
-                                       ck::index_t& stride,
-                                       auto layout) {
-        using namespace ck::literals;
-
-        if(std::is_same<decltype(layout), Row>::value)
-        {
-            auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
-            if(stride <= 0)
-                stride = desc.GetStrides()[0];
-            return desc;
-        }
-        else
-        {
-            auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
-            if(stride <= 0)
-                stride = desc.GetStrides()[1];
-            return desc;
-        }
-    };
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), Row>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
+            }
+        };
 
     Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
     Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
index 8da49ef85d..43637e4a1f 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp
@@ -28,8 +28,9 @@ using F16 = ck::half_t;
 using FP8 = ck::f8_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = FP8;
 using B0DataType = FP8;
 
@@ -147,11 +148,11 @@ int main(int argc, char* argv[])
        using namespace ck::literals;
 
        if(std::is_same<decltype(layout), Row>::value)
        {
-           return HostTensorDescriptor({row, col}, {stride, 1_uz});
+           return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
-           return HostTensorDescriptor({row, col}, {1_uz, stride});
+           return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
index 72ea7f1cb6..2cb2dc17f4 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
@@ -28,8 +28,9 @@ using F16 = ck::half_t;
 using F8  = ck::f8_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F8;
 using B0DataType = F8;
 
@@ -242,7 +243,7 @@ int main(int argc, char* argv[])
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 5: N, K, tokens\n");
-        exit(0);
+        exit(1);
     }
 
     ck::index_t sorted_size = sorted_tile_num * MPerBlock;
 
@@ -294,7 +295,7 @@ int main(int argc, char* argv[])
     Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
     Tensor<D1DataType> d1_e_n(
         HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(
         HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_n_device_result(
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
index 66627a6de6..bca5ffec78 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp
@@ -30,8 +30,9 @@ using F8  = ck::f8_t;
 using F32 = float;
 using I64 = int64_t;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F8;
 using A1DataType = F32;
 
@@ -312,7 +313,7 @@ int main(int argc, char* argv[])
                                                  Col{}));
     Tensor<B0DataType> b0_preshuffled(
         HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(
         HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_n_device_result(
diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
index 1cfe896b1b..d14885e7f2 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
@@ -29,8 +29,9 @@ using F16 = ck::half_t;
 using F8  = ck::f8_t;
 using F32 = float;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F8;
 using B0DataType = I4;
 
@@ -222,7 +223,7 @@ int main(int argc, char* argv[])
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 5: N, K, tokens\n");
-        exit(0);
+        exit(1);
     }
 
     if(tokens * topk > valid_size)
 
@@ -268,10 +269,10 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
     Tensor<B0DataType> b0_preshuffled(
         HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
-    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
+    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
     Tensor<D1DataType> d1_e_n(
         HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(
         HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_n_device_result(
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
index a6c5a8914f..d80c75abe8 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -286,7 +286,7 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}, Bypass{}));
     Tensor<D1DataType> d1_e_n(
         HostTensorDescriptor({experts, N}, {PerTokenQuant ? StrideDs[1] * N : 1, StrideDs[1]}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     e_t_n_device_result.SetZero();
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
index cc42c4b815..02369f344e 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
@@ -30,8 +30,9 @@ using F8  = ck::f8_t;
 using F32 = float;
 using I64 = int64_t;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F8;
 using A1DataType = F32;
 
@@ -305,7 +306,7 @@ int main(int argc, char* argv[])
                                                  Col{}));
     Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
 
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     e_t_n_device_result.SetZero();
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
index d44ca19d2f..cafea72559 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
@@ -178,21 +178,17 @@ int main(int argc, char* argv[])
     {
         // use default case
     }
-    else if(argc == 3)
-    {
-        // use default case
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 7)
+    else if(argc == 3 || argc == 7)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
         time_kernel     = std::stoi(argv[3]);
-        N               = std::stoi(argv[4]);
-        K               = std::stoi(argv[5]);
-        tokens          = std::stoi(argv[6]);
+        if(argc == 7)
+        {
+            N      = std::stoi(argv[4]);
+            K      = std::stoi(argv[5]);
+            tokens = std::stoi(argv[6]);
+        }
     }
     else
     {
@@ -200,7 +196,7 @@ int main(int argc, char* argv[])
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 6: N, K, tokens\n");
-        exit(0);
+        exit(1);
     }
 
     ck::index_t StrideA = K;
 
@@ -244,8 +240,8 @@ int main(int argc, char* argv[])
     Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
     Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
-    Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}, Bypass{}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     e_t_n_device_result.SetZero();
diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp
index 69c0d6558f..0c51a24679 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp
@@ -31,8 +31,9 @@ using F32 = float;
 using XDataType       = ck::e8m0_bexp_t;
 using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F4;
 using A1DataType = XPackedDataType;
 
@@ -285,7 +286,7 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
                              {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
                              Col{}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_k_n_host_result(
         HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_k_n_device_result(
diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
index 2f7762386d..b6d5d8f211 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp
@@ -31,8 +31,9 @@ using F32 = float;
 using XDataType       = ck::e8m0_bexp_t;
 using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F4;
 using A1DataType = XPackedDataType;
 
@@ -282,7 +283,7 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
                              {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
                              Col{}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_k_n_host_result(
         HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_k_n_device_result(
diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
index 4ef068c41f..1adf039b70 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
@@ -32,8 +32,9 @@ using XDataType       = ck::e8m0_bexp_t;
 using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
 using I64             = int64_t;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F4;
 using A1DataType = XPackedDataType;
 
@@ -315,7 +316,7 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
                              {N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
                              Col{}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_k_n_host_result(
         HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
     Tensor<EDataType> e_t_k_n_device_result(
diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp
index 317b0f9f15..61a63b47ac 100644
--- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp
+++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp
@@ -31,8 +31,9 @@ using F32 = float;
 using XDataType       = ck::e8m0_bexp_t;
 using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F4;
 using A1DataType = XPackedDataType;
 
@@ -290,7 +291,7 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
                              {N * Scale_Stride_BN, 1, Scale_Stride_BN},
                              Col{}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp
index 5bb6454d2a..2670468c4b 100644
--- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp
+++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp
@@ -31,8 +31,9 @@ using F32 = float;
 using XDataType       = ck::e8m0_bexp_t;
 using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F4;
 using A1DataType = XPackedDataType;
 
@@ -290,7 +291,7 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
                              {N * Scale_Stride_BN, 1, Scale_Stride_BN},
                              Col{}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp
index 333f8a3d52..c3454be84a 100644
--- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp
+++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp
@@ -32,8 +32,9 @@ using XDataType       = ck::e8m0_bexp_t;
 using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
 using I64             = int64_t;
 
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row    = ck::tensor_layout::gemm::RowMajor;
+using Col    = ck::tensor_layout::gemm::ColumnMajor;
+using Bypass = ck::tensor_layout::BypassLayoutVerification;
 
 using A0DataType = F4;
 using A1DataType = XPackedDataType;
 
@@ -325,7 +326,7 @@ int main(int argc, char* argv[])
         HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
                              {N * Scale_Stride_BN, 1, Scale_Stride_BN},
                              Col{}));
-    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
     Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
     Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
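
Note: every hunk above follows the same pattern, so a single standalone sketch may help when reviewing. It is illustrative only, not code from the patch: it assumes CK's host utilities are on the include path and that the HostTensorDescriptor overload taking the ck::tensor_layout::BypassLayoutVerification tag (introduced elsewhere in this change set) is available; M/K/StrideA and the half_t element type are made-up values.

// Minimal sketch of the descriptor-construction pattern this patch converges on.
#include <cstddef>
#include <iostream>
#include <type_traits>

#include "ck/ck.hpp"
#include "ck/library/utility/host_tensor.hpp"

using Row    = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification; // tag assumed from this PR

int main()
{
    using namespace ck::literals;

    const std::size_t M = 256, K = 64, StrideA = 64; // illustrative sizes

    // Same shape as the f_host_tensor_descriptor lambdas above: pick the stride
    // order from the layout tag and pass Bypass{} to skip host-side layout checks.
    auto make_desc = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
        if constexpr(std::is_same_v<decltype(layout), Row>)
        {
            return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
        }
        else
        {
            return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
        }
    };

    Tensor<ck::half_t> a_m_k(make_desc(M, K, StrideA, Row{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    return 0;
}

The practical effect, as the hunks suggest, is that the examples stop passing a layout tag (layout) as the descriptor's third argument and pass Bypass{} instead, so deliberately irregular strides such as {1, 0} broadcast descriptors no longer trip layout/stride verification on the host side.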