Gemm reduce max (#209)

* [What] Rename the example [Why] Prepare to add unary reduction * Add global operation to the parameter * Add atomicmax * Fix compile error * Support atomicMax (hip library) * Rename the reduction example * Fix target name * use p_d1_grid as the indicator directly * Prevent performance issue. Let passthrough handle it. * Implement the function template and specialize the float2 * No need to separate into two lines * Remove empty line * add comment * Fix compile error due to merge from develop * make the implementation of atomic_max / atomic_add explicit for each datatype * Refine typo * For future CI test * Fix compiler error in ckProfiler * Merge commit 'de2769e3a6695b38a20529261273ddc5cdaab2fe' * simply use remove_pointer * Rename type and var * Refine example * Modify reducemax example * Fix bug in reduction * Change initialize range * Implement F64 version of atomicMax * Move reduction code together * Add buffer atomic_max * Fix coding style by clang-format * Integrate new api of DeviceGemmReduce_Xdl_CShuffle * Integrate Batch gemm reduction * Fix example * fix example * clean up * Fix batch gemm tensor operation * Fix coding style * Fix template argument * Fix clang format * Keep the flexibility of a different stride for each D tensor * Fix compile error for ckProfiler * Fix typo * [What] Fix naming [Why] Prepare to add out elementop * Add DoutElementOp Co-authored-by: Chao Liu <chao.liu2@amd.com> Co-authored-by: rocking <chunylai@amd.com> [ROCm/composable_kernel commit: 0ffe956ab1]
2026-05-19 12:30:16 +00:00 · 2022-05-20 10:56:56 +08:00
parent 8bdd05f366
commit 7100ce8382
28 changed files with 1298 additions and 626 deletions
--- a/library/include/ck/library/host_tensor/device.hpp
+++ b/library/include/ck/library/host_tensor/device.hpp
@@ -10,6 +10,15 @@
 #include "stream_config.hpp"
 #include "ck/options.hpp"

+template <typename T> // T: element type of the buffer being filled
+__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size) // kernel: writes x into p[i] for every i < buffer_element_size
+{
+    for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x) // each thread starts at its threadIdx.x and advances by blockDim.x; blockIdx is never used, so additional blocks would only repeat the same writes
+    {
+        p[i] = x;
+    }
+}
+
 inline void hip_check_error(hipError_t x)
 {
    if(x != hipSuccess)
@@ -30,6 +39,16 @@ struct DeviceMem
    void ToDevice(const void* p);
    void FromDevice(void* p);
    void SetZero();
+    template <typename T> // T: element type used to interpret mpDeviceBuf
+    void SetValue(T x) // sets every T-sized element of the buffer to x by launching set_buffer_value
+    {
+        if(mMemSize % sizeof(T) != 0) // refuse sizes that are not a whole number of T elements (assumes mMemSize is a byte count -- TODO confirm)
+        {
+            throw std::runtime_error("wrong! not entire DeviceMem will be set");
+        }
+
+        set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T)); // launch configuration <<<1, 1024>>>: a single block of 1024 threads; the launch result is not checked here
+    }
    ~DeviceMem();

    void* mpDeviceBuf;
@@ -74,8 +93,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        printf("Warm up 1 time\n");

        // warm up
-        hipLaunchKernelGGL(
-            kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

        printf("Start running %d times...\n", nrepeat);

@@ -84,8 +102,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,

        for(int i = 0; i < nrepeat; ++i)
        {
-            hipLaunchKernelGGL(
-                kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
        }

        timer.End();
@@ -94,13 +111,12 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
    }
    else
    {
-        hipLaunchKernelGGL(
-            kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

        return 0;
    }
 #else
-    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

    return 0;
 #endif