Gemm reduce max (#209)

* [What] Rename the example
[Why] Prepare to add unary reduction

* Add global oparation to the parameter

* Add atomicmax

* Fix compile error

* Support atomicMax (hip library)

* Rename the reduction example

* Fix target name

* use p_d1_grid as the indicator directly

* Prevent performance issue. Let passthrough handle it.

* Implement the function template the specialize the float2

* No need to separate into two lines

* Remove empty line

* add comment

* Fix compile error due to merge from develop

* make the implementation of atomic_max / atomic_add explicit for each datatype

* Refine typo

* For future CI test

* Fix compiler error in ckProfiler

* Merge commit 'de2769e3a6695b38a20529261273ddc5cdaab2fe'

* simply use remove_pointer

* Rename type and var

* Refine example

* Modify reducemax example

* Fix bug in reduction

* Change initialize range

* Implement F64 version of atomicMax

* Move reduction  code together

* Add buffer atomic_max

* Fix coding style by clang-format

* Integrate new api of DeviceGemmReduce_Xdl_CShuffle

* Integrate Batch gemm reduction

* Fix example

* fix example

* clean up

* Fix batch gemm tensor operation

* Fix coding style

* Fix template augument

* Fix clang format

* Keep flexible of different stride for each D tensor

* Fix compile error for ckProfiler

* Fix typo

* [What] Fix naming
[Why] Prepare to add out elementop

* Add DoutElementOp

Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: rocking <chunylai@amd.com>

[ROCm/composable_kernel commit: 0ffe956ab1]
This commit is contained in:
rocking5566
2022-05-20 10:56:56 +08:00
committed by GitHub
parent 8bdd05f366
commit 7100ce8382
28 changed files with 1298 additions and 626 deletions

View File

@@ -10,6 +10,15 @@
#include "stream_config.hpp"
#include "ck/options.hpp"
template <typename T>
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
{
for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
{
p[i] = x;
}
}
inline void hip_check_error(hipError_t x)
{
if(x != hipSuccess)
@@ -30,6 +39,16 @@ struct DeviceMem
void ToDevice(const void* p);
void FromDevice(void* p);
void SetZero();
template <typename T>
void SetValue(T x)
{
if(mMemSize % sizeof(T) != 0)
{
throw std::runtime_error("wrong! not entire DeviceMem will be set");
}
set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
}
~DeviceMem();
void* mpDeviceBuf;
@@ -74,8 +93,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
printf("Warm up 1 time\n");
// warm up
hipLaunchKernelGGL(
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
printf("Start running %d times...\n", nrepeat);
@@ -84,8 +102,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
for(int i = 0; i < nrepeat; ++i)
{
hipLaunchKernelGGL(
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
}
timer.End();
@@ -94,13 +111,12 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
}
else
{
hipLaunchKernelGGL(
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
return 0;
}
#else
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
return 0;
#endif