mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
[CK][Examples] Fixing stride issues in ck examples by workaround - Bypassing hostTensor validation.
Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>
[ROCm/composable_kernel commit: b9789a0742]
This commit is contained in:
committed by
Michał Kulikowski
parent
2a01918313
commit
31939e7b2b
@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
@@ -190,11 +191,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
|
||||
using I8 = std::int8_t;
|
||||
using I32 = std::int32_t;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
@@ -190,11 +191,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -42,8 +42,9 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
@@ -173,7 +174,7 @@ int main(int argc, char* argv[])
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
|
||||
"beta\n");
|
||||
exit(0);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
@@ -182,11 +183,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
@@ -160,23 +161,22 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
ck::index_t StrideD = 0;
|
||||
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
|
||||
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
const auto StrideD = std::is_same<decltype(ELayout{}), ck::tensor_layout::gemm::RowMajor>::value
|
||||
? d_m_n.mDesc.GetStrides()[0]
|
||||
: d_m_n.mDesc.GetStrides()[1];
|
||||
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
|
||||
|
||||
@@ -6,6 +6,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
|
||||
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
|
||||
#endif
|
||||
using namespace ck::literals;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
ProblemSize ps =
|
||||
problem_size; // make mutable copy because default stride values of 0 need to be updated
|
||||
@@ -15,11 +16,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -43,7 +44,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
|
||||
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
|
||||
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
// If any user-provided leading stride <= 0, replace it with the one determined by the
|
||||
// If any user-provided leading stride < 0, replace it with the one determined by the
|
||||
// created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
|
||||
auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
|
||||
if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
|
||||
@@ -56,15 +57,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
|
||||
}
|
||||
};
|
||||
|
||||
if(StrideA <= 0)
|
||||
if(StrideA < 0)
|
||||
StrideA = fetch_leading_stride(a_m_k, ALayout{});
|
||||
if(StrideB <= 0)
|
||||
if(StrideB < 0)
|
||||
StrideB = fetch_leading_stride(b_k_n, BLayout{});
|
||||
if(StrideD0 <= 0)
|
||||
if(StrideD0 < 0)
|
||||
StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
|
||||
if(StrideD1 <= 0)
|
||||
if(StrideD1 < 0)
|
||||
StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
|
||||
if(StrideE <= 0)
|
||||
if(StrideE < 0)
|
||||
StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
|
||||
|
||||
switch(config.init_method)
|
||||
|
||||
@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
@@ -138,12 +139,12 @@ int main(int argc, char* argv[])
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
|
||||
{batch_count, row, col}, {row * stride, stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
|
||||
{batch_count, row, col}, {col * stride, 1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ struct ExecutionConfig final
|
||||
bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
|
||||
{
|
||||
using namespace ck::literals;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
|
||||
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
|
||||
@@ -62,12 +63,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -116,6 +116,7 @@ inline __host__ __device__ constexpr double get_atol()
|
||||
bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
|
||||
{
|
||||
using namespace ck::literals;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
auto& [M,
|
||||
N,
|
||||
@@ -138,12 +139,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
|
||||
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ struct ExecutionConfig final
|
||||
bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionConfig& config)
|
||||
{
|
||||
using namespace ck::literals;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
auto& [M,
|
||||
N,
|
||||
@@ -65,12 +66,12 @@ bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionCo
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
bool run_batched_gemm_gemm_example(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -111,12 +113,12 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
|
||||
if(std::is_same<decltype(layout), Row>::value)
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
{batch_count, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
{batch_count, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -4,27 +4,21 @@
|
||||
bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
|
||||
{
|
||||
using namespace ck::literals;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
ProblemSize ps =
|
||||
problem_size; // make mutable copy because default stride values of 0 need to be updated
|
||||
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
|
||||
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
|
||||
|
||||
auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
|
||||
@@ -27,8 +27,9 @@ using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
@@ -110,11 +111,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -26,8 +26,9 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
@@ -109,11 +110,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
@@ -161,11 +162,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
@@ -157,11 +158,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -24,7 +24,8 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
@@ -220,11 +221,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
@@ -160,11 +161,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
@@ -121,27 +122,19 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor = [](std::size_t row,
|
||||
std::size_t col,
|
||||
ck::index_t& stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
|
||||
@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
@@ -121,27 +122,19 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor = [](std::size_t row,
|
||||
std::size_t col,
|
||||
ck::index_t& stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
|
||||
@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
@@ -120,27 +121,19 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor = [](std::size_t row,
|
||||
std::size_t col,
|
||||
ck::index_t& stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
|
||||
@@ -28,8 +28,9 @@ using F16 = ck::half_t;
|
||||
using FP8 = ck::f8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = FP8;
|
||||
using B0DataType = FP8;
|
||||
@@ -147,11 +148,11 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -28,8 +28,9 @@ using F16 = ck::half_t;
|
||||
using F8 = ck::f8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F8;
|
||||
using B0DataType = F8;
|
||||
@@ -242,7 +243,7 @@ int main(int argc, char* argv[])
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 5: N, K, tokens\n");
|
||||
exit(0);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ck::index_t sorted_size = sorted_tile_num * MPerBlock;
|
||||
@@ -294,7 +295,7 @@ int main(int argc, char* argv[])
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
|
||||
Tensor<D1DataType> d1_e_n(
|
||||
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_n_device_result(
|
||||
|
||||
@@ -30,8 +30,9 @@ using F8 = ck::f8_t;
|
||||
using F32 = float;
|
||||
using I64 = int64_t;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F8;
|
||||
using A1DataType = F32;
|
||||
@@ -312,7 +313,7 @@ int main(int argc, char* argv[])
|
||||
Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_n_device_result(
|
||||
|
||||
@@ -29,8 +29,9 @@ using F16 = ck::half_t;
|
||||
using F8 = ck::f8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F8;
|
||||
using B0DataType = I4;
|
||||
@@ -222,7 +223,7 @@ int main(int argc, char* argv[])
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 5: N, K, tokens\n");
|
||||
exit(0);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if(tokens * topk > valid_size)
|
||||
@@ -268,10 +269,10 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
|
||||
Tensor<D1DataType> d1_e_n(
|
||||
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_n_device_result(
|
||||
|
||||
@@ -286,7 +286,7 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}, Bypass{}));
|
||||
Tensor<D1DataType> d1_e_n(
|
||||
HostTensorDescriptor({experts, N}, {PerTokenQuant ? StrideDs[1] * N : 1, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
e_t_n_device_result.SetZero();
|
||||
|
||||
@@ -30,8 +30,9 @@ using F8 = ck::f8_t;
|
||||
using F32 = float;
|
||||
using I64 = int64_t;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F8;
|
||||
using A1DataType = F32;
|
||||
@@ -305,7 +306,7 @@ int main(int argc, char* argv[])
|
||||
Col{}));
|
||||
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
e_t_n_device_result.SetZero();
|
||||
|
||||
@@ -178,21 +178,17 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
// use default case
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 7)
|
||||
else if(argc == 3 || argc == 7)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
N = std::stoi(argv[4]);
|
||||
K = std::stoi(argv[5]);
|
||||
tokens = std::stoi(argv[6]);
|
||||
if(argc == 7)
|
||||
{
|
||||
N = std::stoi(argv[4]);
|
||||
K = std::stoi(argv[5]);
|
||||
tokens = std::stoi(argv[6]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -200,7 +196,7 @@ int main(int argc, char* argv[])
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 6: N, K, tokens\n");
|
||||
exit(0);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
@@ -244,8 +240,8 @@ int main(int argc, char* argv[])
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
|
||||
Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}, Bypass{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
e_t_n_device_result.SetZero();
|
||||
|
||||
@@ -31,8 +31,9 @@ using F32 = float;
|
||||
using XDataType = ck::e8m0_bexp_t;
|
||||
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F4;
|
||||
using A1DataType = XPackedDataType;
|
||||
@@ -285,7 +286,7 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_k_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_k_n_device_result(
|
||||
|
||||
@@ -31,8 +31,9 @@ using F32 = float;
|
||||
using XDataType = ck::e8m0_bexp_t;
|
||||
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F4;
|
||||
using A1DataType = XPackedDataType;
|
||||
@@ -282,7 +283,7 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_k_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_k_n_device_result(
|
||||
|
||||
@@ -32,8 +32,9 @@ using XDataType = ck::e8m0_bexp_t;
|
||||
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
|
||||
using I64 = int64_t;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F4;
|
||||
using A1DataType = XPackedDataType;
|
||||
@@ -315,7 +316,7 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_k_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_k_n_device_result(
|
||||
|
||||
@@ -31,8 +31,9 @@ using F32 = float;
|
||||
using XDataType = ck::e8m0_bexp_t;
|
||||
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F4;
|
||||
using A1DataType = XPackedDataType;
|
||||
@@ -290,7 +291,7 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
|
||||
@@ -31,8 +31,9 @@ using F32 = float;
|
||||
using XDataType = ck::e8m0_bexp_t;
|
||||
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F4;
|
||||
using A1DataType = XPackedDataType;
|
||||
@@ -290,7 +291,7 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
|
||||
@@ -32,8 +32,9 @@ using XDataType = ck::e8m0_bexp_t;
|
||||
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
|
||||
using I64 = int64_t;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F4;
|
||||
using A1DataType = XPackedDataType;
|
||||
@@ -325,7 +326,7 @@ int main(int argc, char* argv[])
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user