mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Absolute include path (#281)
* add gelu and fast_gelu * added GeLU and fast GeLU * clean up * add gemm+fastgelu example * add gemm+gelu instances * update profiler * clean up * clean up * adding gemm+bias+activation * clean * adding bias * clean * adding gemm multiple d * debugging * add gemm bias add fastgelu * rename, clean * refactoring; add readme * refactor * refactor * refactor * refactor * refactor * refactor * fix * fix * update example * update example * rename * update example * add ckProfiler * clean * clean * clean * clean * add client app example * update readme * delete obsolete files * remove old client app * delete old file * cleaning * clean * remove half * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path for all examples * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * fix header path * revert client app example * clean build * fix build * temporarily disable client test on Jenkins * clean * clean * clean
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
#ifndef CK_CLUSTER_DESCRIPTOR_HPP
|
||||
#define CK_CLUSTER_DESCRIPTOR_HPP
|
||||
#pragma once
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_adaptor.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_adaptor.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -30,4 +29,3 @@ __host__ __device__ constexpr auto make_cluster_descriptor(
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
|
||||
#define CK_MULTI_INDEX_TRANSFORM_HPP
|
||||
#pragma once
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "multi_index.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/utility/multi_index.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -1950,4 +1949,3 @@ struct Modulo
|
||||
}
|
||||
};
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
|
||||
#define CK_MULTI_INDEX_TRANSFORM_HELPER_HPP
|
||||
#pragma once
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "multi_index_transform.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -126,4 +125,3 @@ __host__ __device__ constexpr auto make_modulo_transform(const Modulus& modulus,
|
||||
return Modulo<Modulus, UpLength>{modulus, up_length};
|
||||
}
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
#ifndef CK_TENSOR_ADAPTOR_HPP
|
||||
#define CK_TENSOR_ADAPTOR_HPP
|
||||
#pragma once
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -478,4 +477,3 @@ __host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&..
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#ifndef CK_TENSOR_DESCRIPTOR_HPP
|
||||
#define CK_TENSOR_DESCRIPTOR_HPP
|
||||
#pragma once
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "multi_index_transform.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
@@ -604,4 +603,3 @@ using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step(
|
||||
TensorDesc{}, MultiIndex<remove_cvref_t<TensorDesc>::GetNumOfDimension()>{}));
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#pragma once
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "multi_index_transform_helper.hpp"
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
|
||||
157
include/ck/tensor_description/tensor_space_filling_curve.hpp
Normal file
157
include/ck/tensor_description/tensor_space_filling_curve.hpp
Normal file
@@ -0,0 +1,157 @@
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/math.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/sequence_helper.hpp"
|
||||
#include "ck/utility/statically_indexed_array_multi_index.hpp"
|
||||
#include "ck/utility/tuple_helper.hpp"
|
||||
#include "ck/tensor_description/tensor_adaptor.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename TensorLengths,
|
||||
typename DimAccessOrder,
|
||||
typename ScalarsPerAccess> // # of scalars per access in each dimension
|
||||
struct SpaceFillingCurve
|
||||
{
|
||||
static constexpr index_t nDim = TensorLengths::Size();
|
||||
|
||||
using Index = MultiIndex<nDim>;
|
||||
|
||||
static constexpr index_t ScalarPerVector =
|
||||
reduce_on_sequence(ScalarsPerAccess{}, math::multiplies{}, Number<1>{});
|
||||
|
||||
static constexpr auto access_lengths = TensorLengths{} / ScalarsPerAccess{};
|
||||
static constexpr auto dim_access_order = DimAccessOrder{};
|
||||
static constexpr auto ordered_access_lengths =
|
||||
container_reorder_given_new2old(access_lengths, dim_access_order);
|
||||
|
||||
static constexpr auto to_index_adaptor = make_single_stage_tensor_adaptor(
|
||||
make_tuple(make_merge_transform(ordered_access_lengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, nDim, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
|
||||
__host__ __device__ static constexpr index_t GetNumOfAccess()
|
||||
{
|
||||
static_assert(TensorLengths::Size() == ScalarsPerAccess::Size());
|
||||
static_assert(TensorLengths{} % ScalarsPerAccess{} ==
|
||||
typename uniform_sequence_gen<TensorLengths::Size(), 0>::type{});
|
||||
|
||||
return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
|
||||
ScalarPerVector;
|
||||
}
|
||||
|
||||
template <index_t AccessIdx1dBegin, index_t AccessIdx1dEnd>
|
||||
static __device__ __host__ constexpr auto GetStepBetween(Number<AccessIdx1dBegin>,
|
||||
Number<AccessIdx1dEnd>)
|
||||
{
|
||||
static_assert(AccessIdx1dBegin >= 0, "1D index should be non-negative");
|
||||
static_assert(AccessIdx1dBegin < GetNumOfAccess(), "1D index should be larger than 0");
|
||||
static_assert(AccessIdx1dEnd >= 0, "1D index should be non-negative");
|
||||
static_assert(AccessIdx1dEnd < GetNumOfAccess(), "1D index should be larger than 0");
|
||||
|
||||
constexpr auto idx_begin = GetIndex(Number<AccessIdx1dBegin>{});
|
||||
constexpr auto idx_end = GetIndex(Number<AccessIdx1dEnd>{});
|
||||
return idx_end - idx_begin;
|
||||
}
|
||||
|
||||
template <index_t AccessIdx1d>
|
||||
static __device__ __host__ constexpr auto GetForwardStep(Number<AccessIdx1d>)
|
||||
{
|
||||
static_assert(AccessIdx1d < GetNumOfAccess(), "1D index should be larger than 0");
|
||||
return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d + 1>{});
|
||||
}
|
||||
|
||||
template <index_t AccessIdx1d>
|
||||
static __device__ __host__ constexpr auto GetBackwardStep(Number<AccessIdx1d>)
|
||||
{
|
||||
static_assert(AccessIdx1d > 0, "1D index should be larger than 0");
|
||||
|
||||
return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d - 1>{});
|
||||
}
|
||||
|
||||
template <index_t AccessIdx1d>
|
||||
static __device__ __host__ constexpr Index GetIndex(Number<AccessIdx1d>)
|
||||
{
|
||||
#if 0
|
||||
/*
|
||||
* \todo: TensorAdaptor::CalculateBottomIndex does NOT return constexpr as expected.
|
||||
*/
|
||||
constexpr auto ordered_access_idx = to_index_adaptor.CalculateBottomIndex(make_multi_index(Number<AccessIdx1d>{}));
|
||||
#else
|
||||
|
||||
constexpr auto access_strides = container_reverse_exclusive_scan(
|
||||
ordered_access_lengths, math::multiplies{}, Number<1>{});
|
||||
|
||||
constexpr auto idx_1d = Number<AccessIdx1d>{};
|
||||
// Given tensor strides \p access_lengths, and 1D index of space-filling-curve, compute the
|
||||
// idim-th element of multidimensional index.
|
||||
// All constexpr variables have to be captured by VALUE.
|
||||
constexpr auto compute_index = [ idx_1d, access_strides ](auto idim) constexpr
|
||||
{
|
||||
constexpr auto compute_index_impl = [ idx_1d, access_strides ](auto jdim) constexpr
|
||||
{
|
||||
auto res = idx_1d.value;
|
||||
auto id = 0;
|
||||
|
||||
static_for<0, jdim.value + 1, 1>{}([&](auto kdim) {
|
||||
id = res / access_strides[kdim].value;
|
||||
res -= id * access_strides[kdim].value;
|
||||
});
|
||||
|
||||
return id;
|
||||
};
|
||||
|
||||
constexpr auto id = compute_index_impl(idim);
|
||||
return Number<id>{};
|
||||
};
|
||||
|
||||
constexpr auto ordered_access_idx = generate_tuple(compute_index, Number<nDim>{});
|
||||
#endif
|
||||
constexpr auto forward_sweep = [&]() {
|
||||
StaticallyIndexedArray<bool, nDim> forward_sweep_;
|
||||
|
||||
forward_sweep_(I0) = true;
|
||||
|
||||
static_for<1, nDim, 1>{}([&](auto idim) {
|
||||
index_t tmp = ordered_access_idx[I0];
|
||||
|
||||
static_for<1, idim, 1>{}(
|
||||
[&](auto j) { tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; });
|
||||
|
||||
forward_sweep_(idim) = tmp % 2 == 0;
|
||||
});
|
||||
|
||||
return forward_sweep_;
|
||||
}();
|
||||
|
||||
// calculate multi-dim tensor index
|
||||
auto idx_md = [&]() {
|
||||
Index ordered_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto idim) {
|
||||
ordered_idx(idim) = forward_sweep[idim] ? ordered_access_idx[idim]
|
||||
: ordered_access_lengths[idim] - 1 -
|
||||
ordered_access_idx[idim];
|
||||
});
|
||||
|
||||
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
|
||||
ScalarsPerAccess{};
|
||||
}();
|
||||
return idx_md;
|
||||
}
|
||||
|
||||
// FIXME: rename this function
|
||||
template <index_t AccessIdx1d>
|
||||
static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number<AccessIdx1d>)
|
||||
{
|
||||
constexpr auto idx = GetIndex(Number<AccessIdx1d>{});
|
||||
|
||||
return generate_tuple([&](auto i) { return Number<idx[i]>{}; }, Number<nDim>{});
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user