experimenting

[ROCm/composable_kernel commit: 766b0a9eaf]
2026-05-19 12:30:16 +00:00 · 2019-03-24 12:09:57 -05:00
parent 6f1651f8a7
commit cd883e7581
33 changed files with 1886 additions and 1822 deletions
--- a/src/include/blockwise_2d_tensor_op.hip.hpp
+++ b/src/include/blockwise_2d_tensor_op.hip.hpp
@@ -1,7 +1,7 @@
 #pragma once
 #include "ConstantTensorDescriptor.hip.hpp"

-template <unsigned BlockSize, class Float, class DstDesc, class F>
+template <index_t BlockSize, class Float, class DstDesc, class F>
 __device__ void
 blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
 {
@@ -20,19 +20,19 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
    }
 #endif

-    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;
+    constexpr index_t NLoop = desc.GetElementSize() / BlockSize;

-    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
+    for(index_t iloop = 0; iloop < NLoop; ++iloop)
    {
-        unsigned is = threadIdx.x + iloop * BlockSize;
+        index_t is = threadIdx.x + iloop * BlockSize;

-        const unsigned did0 = is / desc.GetStride(I0);
+        const index_t did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

-        const unsigned did1 = is / desc.GetStride(I1);
+        const index_t did1 = is / desc.GetStride(I1);

-        const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+        const index_t dindex = dst_desc.Get1dIndex(did0, did1);

        f(p_dst[dindex]);
    }
@@ -41,17 +41,17 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst

    if(has_tail)
    {
-        unsigned is = threadIdx.x + NLoop * BlockSize;
+        index_t is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
-            const unsigned did0 = is / desc.GetStride(I0);
+            const index_t did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

-            const unsigned did1 = is / desc.GetStride(I1);
+            const index_t did1 = is / desc.GetStride(I1);

-            const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+            const index_t dindex = dst_desc.Get1dIndex(did0, did1);

            f(p_dst[dindex]);
        }
@@ -61,7 +61,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
 // Function: p_dst[reorder[i0], reorder[i1], reorder[i2], reorder[i3]] = p_src[i0,i1,i2,i3]
 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
-template <unsigned BlockSize,
+template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
@@ -80,20 +80,20 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

-    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
-    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);
+    constexpr index_t IR0 = DstFromSrcReorder{}.Get(I0);
+    constexpr index_t IR1 = DstFromSrcReorder{}.Get(I1);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

-    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
+    constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

-    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
+    for(index_t iloop = 0; iloop < NLoop; ++iloop)
    {
-        unsigned is = threadIdx.x + iloop * BlockSize;
+        index_t is = threadIdx.x + iloop * BlockSize;

-        unsigned did[2];
+        index_t did[2];

        did[0] = is / ref_desc.GetStride(I0);

@@ -101,9 +101,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds

        did[1] = is / ref_desc.GetStride(I1);

-        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1]);
+        const index_t aindex = src_desc.Get1dIndex(did[0], did[1]);

-        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
+        const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);

        f(p_src[aindex], p_dst[bindex]);
    }
@@ -112,11 +112,11 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds

    if(has_tail)
    {
-        unsigned is = threadIdx.x + NLoop * BlockSize;
+        index_t is = threadIdx.x + NLoop * BlockSize;

        if(is < ref_desc.GetElementSize())
        {
-            unsigned did[2];
+            index_t did[2];

            did[0] = is / ref_desc.GetStride(I0);

@@ -124,16 +124,16 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds

            did[1] = is / ref_desc.GetStride(I1);

-            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1]);
+            const index_t aindex = src_desc.Get1dIndex(did[0], did[1]);

-            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);
+            const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);

            f(p_src[aindex], p_dst[bindex]);
        }
    }
 }

-template <unsigned BlockSize, class Float, class DstDesc>
+template <index_t BlockSize, class Float, class DstDesc>
 __device__ void blockwise_2d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
 {
    auto f_set_zero = [](Float& v) { v = Float(0); };
@@ -141,7 +141,7 @@ __device__ void blockwise_2d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
    blockwise_2d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
 }

-template <unsigned BlockSize,
+template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
@@ -161,7 +161,7 @@ blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
 }

-template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
+template <index_t BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
 struct Blockwise2dTensorCopy1
 {
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
@@ -175,17 +175,17 @@ struct Blockwise2dTensorCopy1

 // need to be aligned to float4 and float2
 // stride1 need to be 1 for both source and destination
-template <unsigned BlockSize,
+template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
-          unsigned ThreadPerDim0,
-          unsigned ThreadPerDim1>
+          index_t ThreadPerDim0,
+          index_t ThreadPerDim1>
 struct Blockwise2dTensorCopy2
 {
-    unsigned mThreadId0;
-    unsigned mThreadId1;
+    index_t mThreadId0;
+    index_t mThreadId1;

    __device__ Blockwise2dTensorCopy2()
    {
@@ -222,61 +222,61 @@ struct Blockwise2dTensorCopy2
        constexpr bool align_v2 =
            src_desc.GetStride(I0) % 2 == 0 && dst_desc.GetStride(I0) % 2 == 0;

-        constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
-        constexpr unsigned L1 = SrcOpLengths{}.Get(I1);
+        constexpr index_t L0 = SrcOpLengths{}.Get(I0);
+        constexpr index_t L1 = SrcOpLengths{}.Get(I1);

-        constexpr unsigned Dim0Loop = L0 / ThreadPerDim0;
-        constexpr bool d0_has_tail  = (L0 > ThreadPerDim0 * Dim0Loop);
+        constexpr index_t Dim0Loop = L0 / ThreadPerDim0;
+        constexpr bool d0_has_tail = (L0 > ThreadPerDim0 * Dim0Loop);

-        constexpr unsigned Dim1V4Loop = align_v4 ? L1 / (ThreadPerDim1 * 4) : 0;
+        constexpr index_t Dim1V4Loop = align_v4 ? L1 / (ThreadPerDim1 * 4) : 0;

-        constexpr unsigned Dim1V2Loop =
+        constexpr index_t Dim1V2Loop =
            align_v2 ? (L1 - Dim1V4Loop * (ThreadPerDim1 * 4)) / (ThreadPerDim1 * 2) : 0;

-        constexpr unsigned Dim1V1Loop =
+        constexpr index_t Dim1V1Loop =
            (L1 - Dim1V4Loop * (ThreadPerDim1 * 4) - Dim1V2Loop * (ThreadPerDim1 * 2)) /
            ThreadPerDim1;

        constexpr bool d1_has_tail =
            (L1 > ThreadPerDim1 * (4 * Dim1V4Loop + 2 * Dim1V2Loop + Dim1V1Loop));

-        for(unsigned d0loop = 0; d0loop < Dim0Loop; ++d0loop)
+        for(index_t d0loop = 0; d0loop < Dim0Loop; ++d0loop)
        {
-            unsigned did0 = d0loop * ThreadPerDim0 + mThreadId0;
+            index_t did0 = d0loop * ThreadPerDim0 + mThreadId0;

            // v4
-            for(unsigned d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop)
+            for(index_t d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop)
            {
-                unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
+                index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;

-                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                *(reinterpret_cast<Float4*>(p_dst + dindex)) =
                    *(reinterpret_cast<const Float4*>(p_src + sindex));
            }

            // v2
-            for(unsigned d1v2loop = 0; d1v2loop < Dim1V2Loop; ++d1v2loop)
+            for(index_t d1v2loop = 0; d1v2loop < Dim1V2Loop; ++d1v2loop)
            {
-                unsigned did1 =
+                index_t did1 =
                    Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1;

-                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                *(reinterpret_cast<Float2*>(p_dst + dindex)) =
                    *(reinterpret_cast<const Float2*>(p_src + sindex));
            }

            // v1
-            for(unsigned d1v1loop = 0; d1v1loop < Dim1V1Loop; ++d1v1loop)
+            for(index_t d1v1loop = 0; d1v1loop < Dim1V1Loop; ++d1v1loop)
            {
-                unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
-                                d1v1loop * ThreadPerDim1 + mThreadId1;
+                index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
+                               d1v1loop * ThreadPerDim1 + mThreadId1;

-                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                p_dst[dindex] = p_src[sindex];
            }
@@ -284,13 +284,13 @@ struct Blockwise2dTensorCopy2
            // dim-1 tail
            if(d1_has_tail)
            {
-                unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
-                                Dim1V1Loop * ThreadPerDim1 + mThreadId1;
+                index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
+                               Dim1V1Loop * ThreadPerDim1 + mThreadId1;

                if(did1 < L1)
                {
-                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                    const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                    const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                    p_dst[dindex] = p_src[sindex];
                }
@@ -300,45 +300,44 @@ struct Blockwise2dTensorCopy2
        // dim-0 tail
        if(d0_has_tail)
        {
-            unsigned did0 = Dim0Loop * ThreadPerDim0 + mThreadId0;
+            index_t did0 = Dim0Loop * ThreadPerDim0 + mThreadId0;

            if(did0 < L0)
            {

                // v4
-                for(unsigned d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop)
+                for(index_t d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop)
                {
-                    unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
+                    index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;

-                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                    const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                    const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                    *(reinterpret_cast<Float4*>(p_dst + dindex)) =
                        *(reinterpret_cast<const Float4*>(p_src + sindex));
                }

                // v2
-                for(unsigned d1v2loop = 0; d1v2loop < Dim1V2Loop; ++d1v2loop)
+                for(index_t d1v2loop = 0; d1v2loop < Dim1V2Loop; ++d1v2loop)
                {
-                    unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 +
-                                    2 * mThreadId1;
+                    index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 +
+                                   2 * mThreadId1;

-                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                    const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                    const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                    *(reinterpret_cast<Float2*>(p_dst + dindex)) =
                        *(reinterpret_cast<const Float2*>(p_src + sindex));
                }

                // v1
-                for(unsigned d1v1loop = 0; d1v1loop < Dim1V1Loop; ++d1v1loop)
+                for(index_t d1v1loop = 0; d1v1loop < Dim1V1Loop; ++d1v1loop)
                {
-                    unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 +
-                                    Dim1V2Loop * 2 * ThreadPerDim1 + d1v1loop * ThreadPerDim1 +
-                                    mThreadId1;
+                    index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
+                                   d1v1loop * ThreadPerDim1 + mThreadId1;

-                    const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                    const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                    const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                    p_dst[dindex] = p_src[sindex];
                }
@@ -346,14 +345,13 @@ struct Blockwise2dTensorCopy2
                // tail
                if(d1_has_tail)
                {
-                    unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 +
-                                    Dim1V2Loop * 2 * ThreadPerDim1 + Dim1V1Loop * ThreadPerDim1 +
-                                    mThreadId1;
+                    index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 +
+                                   Dim1V1Loop * ThreadPerDim1 + mThreadId1;

                    if(did1 < L1)
                    {
-                        const unsigned sindex = src_desc.Get1dIndex(did0, did1);
-                        const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+                        const index_t sindex = src_desc.Get1dIndex(did0, did1);
+                        const index_t dindex = dst_desc.Get1dIndex(did0, did1);

                        p_dst[dindex] = p_src[sindex];
                    }
@@ -365,18 +363,18 @@ struct Blockwise2dTensorCopy2

 // starting point need to be aligned to float4 or float2 or float
 // stride1 need to be 1 for both source and destination
-template <unsigned BlockSize,
+template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class CopyLengths,
-          unsigned DataPerRead>
+          index_t DataPerRead>
 struct Blockwise2dTensorCopy3
 {
    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

-    unsigned mSrcMyThreadOffset;
-    unsigned mDstMyThreadOffset;
+    index_t mSrcMyThreadOffset;
+    index_t mDstMyThreadOffset;

    __device__ Blockwise2dTensorCopy3()
    {
@@ -394,11 +392,11 @@ struct Blockwise2dTensorCopy3
                          DstDesc{}.GetStride(I0) % DataPerRead == 0,
                      "src and dst stride should be multiple of DataPerRead to keep alignment");

-        constexpr unsigned L0 = CopyLengths{}.Get(I0);
-        constexpr unsigned L1 = CopyLengths{}.Get(I1);
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);

-        constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
-        constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;
+        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+        constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;

        // we allow out-of-bound read from src in D1 dimension,
        //   but we need to make sure dst stride is big enough,
@@ -408,7 +406,7 @@ struct Blockwise2dTensorCopy3

        static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover one line\n");

-        constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;
+        constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;

        if(BlockSize > num_active_thread)
        {
@@ -418,8 +416,8 @@ struct Blockwise2dTensorCopy3
            }
        }

-        const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
-        const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
+        const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
+        const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;

        mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
        mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
@@ -430,13 +428,13 @@ struct Blockwise2dTensorCopy3
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

-        constexpr unsigned L0 = CopyLengths{}.Get(I0);
-        constexpr unsigned L1 = CopyLengths{}.Get(I1);
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);

-        constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
-        constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;
+        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+        constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;

-        constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;
+        constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;

        if(BlockSize > num_active_thread)
        {
@@ -446,18 +444,18 @@ struct Blockwise2dTensorCopy3
            }
        }

-        constexpr unsigned nloop_d0 = L0 / thread_per_d0;
+        constexpr index_t nloop_d0 = L0 / thread_per_d0;

-        constexpr unsigned src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
-        constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

-        auto f_copy = [&](unsigned iloop) {
+        auto f_copy = [&](index_t iloop) {
            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
                                                    iloop * src_loop_stride));
        };

-        for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
+        for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
        {
            f_copy(iloop);
        }
@@ -466,7 +464,7 @@ struct Blockwise2dTensorCopy3

        if(has_tail_d0)
        {
-            constexpr unsigned tail_d0 = L0 - nloop_d0 * thread_per_d0;
+            constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;

            if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
            {
@@ -475,18 +473,18 @@ struct Blockwise2dTensorCopy3
        }
    }

-    __device__ constexpr unsigned GetRegisterClipboardSize() const
+    __device__ constexpr index_t GetRegisterClipboardSize() const
    {
        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");

        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

-        constexpr unsigned L0 = CopyLengths{}.Get(I0);
-        constexpr unsigned L1 = CopyLengths{}.Get(I1);
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);

-        constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
-        constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;
+        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+        constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;

        return DataPerRead * (L0 + thread_per_d0 - 1) / thread_per_d0;
    }
@@ -497,13 +495,13 @@ struct Blockwise2dTensorCopy3
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

-        constexpr unsigned L0 = CopyLengths{}.Get(I0);
-        constexpr unsigned L1 = CopyLengths{}.Get(I1);
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);

-        constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
-        constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;
+        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+        constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;

-        constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;
+        constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;

        if(BlockSize > num_active_thread)
        {
@@ -513,18 +511,18 @@ struct Blockwise2dTensorCopy3
            }
        }

-        constexpr unsigned nloop_d0 = L0 / thread_per_d0;
+        constexpr index_t nloop_d0 = L0 / thread_per_d0;

-        constexpr unsigned src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
-        constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

-        auto f_copy = [&](unsigned iloop) {
+        auto f_copy = [&](index_t iloop) {
            *(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
                                                    iloop * src_loop_stride));
        };

-        for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
+        for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
        {
            f_copy(iloop);
        }
@@ -533,7 +531,7 @@ struct Blockwise2dTensorCopy3

        if(has_tail_d0)
        {
-            constexpr unsigned tail_d0 = L0 - nloop_d0 * thread_per_d0;
+            constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;

            if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
            {
@@ -548,13 +546,13 @@ struct Blockwise2dTensorCopy3
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};

-        constexpr unsigned L0 = CopyLengths{}.Get(I0);
-        constexpr unsigned L1 = CopyLengths{}.Get(I1);
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);

-        constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
-        constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;
+        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+        constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;

-        constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;
+        constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;

        if(BlockSize > num_active_thread)
        {
@@ -564,17 +562,17 @@ struct Blockwise2dTensorCopy3
            }
        }

-        constexpr unsigned nloop_d0 = L0 / thread_per_d0;
+        constexpr index_t nloop_d0 = L0 / thread_per_d0;

-        constexpr unsigned src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
-        constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

-        auto f_copy = [&](unsigned iloop) {
+        auto f_copy = [&](index_t iloop) {
            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
                *(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
        };

-        for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
+        for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
        {
            f_copy(iloop);
        }
@@ -583,7 +581,7 @@ struct Blockwise2dTensorCopy3

        if(has_tail_d0)
        {
-            constexpr unsigned tail_d0 = L0 - nloop_d0 * thread_per_d0;
+            constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;

            if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
            {