revert contraction deviceop changes

2026-04-20 06:49:15 +00:00 · 2024-04-01 00:26:01 +00:00
parent 38f109c460
commit 943199a991
5 changed files with 23 additions and 227 deletions
--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp
@@ -656,26 +656,16 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
        {
            bool valid_as_access = true;
            static_for<0, NumATensor, 1>{}([&](auto i) {
-                // vector memory access of A: could be on M or AK1 dimension
-                if constexpr(ABlockTransferSrcVectorDim == 1)
+                const bool valid_a_vector_size =
+                    arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0;
+                const bool valid_a_access_dim_m =
+                    ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i];
+                const bool valid_a_access_dim_k =
+                    ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i];
+                const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
+                if(!(valid_a_vector_size && valid_a_access_dim))
                {
-                    if(!(arg.a_mz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) %
-                                                             ABlockTransferSrcScalarPerVector ==
-                                                         0) &&
-                       ABlockTransferSrcScalarPerVector != 1)
-                    {
-                        all_valid = false;
-                    }
-                }
-                else
-                {
-                    if(!(arg.a_kz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) %
-                                                             ABlockTransferSrcScalarPerVector ==
-                                                         0) &&
-                       ABlockTransferSrcScalarPerVector != 1)
-                    {
-                        all_valid = false;
-                    }
+                    valid_as_access = false;
                }
            });
            if(!valid_as_access)
@@ -694,23 +684,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
                const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
                if(!(valid_b_vector_size && valid_b_access_dim))
                {
-                    if(!(arg.b_nz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) %
-                                                             BBlockTransferSrcScalarPerVector ==
-                                                         0) &&
-                       BBlockTransferSrcScalarPerVector != 1)
-                    {
-                        all_valid = false;
-                    }
-                }
-                else
-                {
-                    if(!(arg.b_kz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) %
-                                                             BBlockTransferSrcScalarPerVector ==
-                                                         0) &&
-                       BBlockTransferSrcScalarPerVector != 1)
-                    {
-                        all_valid = false;
-                    }
+                    valid_bs_access = false;
                }
            });
            if(!valid_bs_access)
@@ -720,22 +694,16 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle

            bool valid_ds_access = true;
            static_for<0, NumDTensor, 1>{}([&](auto i) {
-                if(!(arg.ds_nz_stride_[i] == 1 &&
-                     arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
-                             CDEBlockTransferScalarPerVector_NPerBlock ==
-                         0) &&
-                   CDEBlockTransferScalarPerVector_NPerBlock != 1)
+                const bool valid_d_vector_size =
+                    arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+                // Vector read of Ds is always on N dimension.
+                const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
+                if(!(valid_d_vector_size && valid_d_access_dim))
                {
                    valid_ds_access = false;
                }
            });
-
-            // vector memory access of E: always on NPerBlock dimension
-            if(!(arg.e_nz_stride_ == 1 &&
-                 arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
-                         CDEBlockTransferScalarPerVector_NPerBlock ==
-                     0) &&
-               CDEBlockTransferScalarPerVector_NPerBlock != 1)
+            if(!valid_ds_access)
            {
                return false;
            }
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -169,78 +169,6 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

-#if 0
-    static constexpr auto matrix_padder =
-        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
-
-    static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideAs)
-    {
-        const auto a_grid_desc_mraw_kraw = [&]() {
-            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, AsLayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(StrideAs, I1));
-            }
-            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, AsLayout>)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
-                                                    make_tuple(I1, StrideAs));
-            }
-        }();
-
-        return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
-    }
-
-    static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideBs)
-    {
-        const auto b_grid_desc_nraw_kraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, BsLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(I1, StrideBs));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BsLayout>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
-                                                    make_tuple(StrideBs, I1));
-            }
-        }();
-
-        return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
-    }
-
-    template <typename ELay>
-    static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
-    {
-        const auto e_grid_desc_mraw_nraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, ELay>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(StrideE, I1));
-            }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELay>::value)
-            {
-                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
-                                                    make_tuple(I1, StrideE));
-            }
-        }();
-
-        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
-    }
-
-    static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
-                                         const std::array<index_t, NumDTensor>& NRaws,
-                                         const std::array<index_t, NumDTensor>& DsStride)
-    {
-        return generate_tuple(
-            [&](auto i) {
-                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
-            },
-            Number<NumDTensor>{});
-    }
-#endif
    using ComputeDataType = EDataType;

    // GridwiseGemm
@@ -424,15 +352,6 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
            }
        }

-        void Print() const
-        {
-            // std::cout << "A[M, K]: " << as_grid_desc_m_k_ << std::endl;
-            // std::cout << "B[N, K]: " << bs_grid_desc_n_k_ << std::endl;
-            // static_for<0, NumDTensor, 1>{}(
-            //[&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; });
-            // std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
-        }
-
        //  private:
        // pointers
        typename GridwiseGemm::AsGridPointer p_as_grid_;
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -10,38 +10,9 @@

 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
+
 namespace ck {
-
-// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
-// and sometimes useless instructions:
-//   1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
-//   instead
-//   2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
-//   tensor coordinate instead
-//   3. Don't use a pointer to VGPR buffer, use vector instead
-
-namespace detail {
-// TODO: How to fix this? It uses an struct instead of lambda because lambda
-// doesn't have constructor
-template <index_t VectorDim, index_t ScalarPerVector>
-struct lambda_scalar_per_access
-{
-    __host__ __device__ constexpr auto operator()(index_t i) const
-    {
-        return (i == VectorDim) ? ScalarPerVector : 1;
-    }
-};
-
-template <index_t VectorDim>
-struct lambda_scalar_step_in_vector
-{
-    __host__ __device__ constexpr auto operator()(index_t i) const
-    {
-        return (i == VectorDim) ? 1 : 0;
-    }
-};
-} // namespace detail
-
 // Assume:
 //   1. src:
 //     1. SrcDesc is known at compile-time
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -7,44 +7,13 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
-#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor/static_tensor.hpp"
 #include "ck/utility/is_detected.hpp"

+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
+
 namespace ck {

-namespace detail {
-// TODO: How to fix this? It uses an struct instead of lambda because lambda
-// doesn't have constructor
-template <index_t SrcVectorDim,
-          index_t SrcScalarPerVector,
-          index_t DstVectorDim,
-          index_t DstScalarPerVector>
-struct lambda_scalar_per_access_for_src_and_dst
-{
-    __host__ __device__ constexpr auto operator()(index_t i) const
-    {
-        if(i == SrcVectorDim && i == DstVectorDim)
-        {
-            return math::lcm(SrcScalarPerVector, DstScalarPerVector);
-        }
-        else if(i == SrcVectorDim)
-        {
-            return SrcScalarPerVector;
-        }
-        else if(i == DstVectorDim)
-        {
-            return DstScalarPerVector;
-        }
-        else
-        {
-            return 1;
-        }
-    }
-};
-
-} // namespace detail
-
 // Assume:
 //   1. src_desc and dst_desc are not known at compile-time
 //   2. SrcBuffer and DstBuffer are DynamicBuffer
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp
@@ -10,40 +10,9 @@
 #include "ck/utility/is_detected.hpp"
 #include "ck/tensor/static_tensor.hpp"

+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
+
 namespace ck {
-
-namespace detail {
-// TODO: How to fix this? It uses an struct instead of lambda because lambda
-// doesn't have constructor
-template <index_t SrcVectorDim,
-          index_t SrcScalarPerVector,
-          index_t DstVectorDim,
-          index_t DstScalarPerVector>
-struct lambda_scalar_per_access_for_src_and_dst
-{
-    __host__ __device__ constexpr auto operator()(index_t i) const
-    {
-        if(i == SrcVectorDim && i == DstVectorDim)
-        {
-            return math::lcm(SrcScalarPerVector, DstScalarPerVector);
-        }
-        else if(i == SrcVectorDim)
-        {
-            return SrcScalarPerVector;
-        }
-        else if(i == DstVectorDim)
-        {
-            return DstScalarPerVector;
-        }
-        else
-        {
-            return 1;
-        }
-    }
-};
-
-} // namespace detail
-
 // Thread-level multi-source, multi-destination tensor slice data movement
 // Assume:
 //   1. All sources and destinations are DynamicBuffer