mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-19 12:30:16 +00:00
Gemm reduce max (#209)
* [What] Rename the example
[Why] Prepare to add unary reduction
* Add global operation to the parameter
* Add atomicmax
* Fix compile error
* Support atomicMax (hip library)
* Rename the reduction example
* Fix target name
* use p_d1_grid as the indicator directly
* Prevent performance issue. Let passthrough handle it.
* Implement the function template to specialize float2
* No need to separate into two lines
* Remove empty line
* add comment
* Fix compile error due to merge from develop
* make the implementation of atomic_max / atomic_add explicit for each datatype
* Refine typo
* For future CI test
* Fix compiler error in ckProfiler
* Merge commit 'de2769e3a6695b38a20529261273ddc5cdaab2fe'
* simply use remove_pointer
* Rename type and var
* Refine example
* Modify reducemax example
* Fix bug in reduction
* Change initialize range
* Implement F64 version of atomicMax
* Move reduction code together
* Add buffer atomic_max
* Fix coding style by clang-format
* Integrate new api of DeviceGemmReduce_Xdl_CShuffle
* Integrate Batch gemm reduction
* Fix example
* fix example
* clean up
* Fix batch gemm tensor operation
* Fix coding style
* Fix template argument
* Fix clang format
* Keep the flexibility of a different stride for each D tensor
* Fix compile error for ckProfiler
* Fix typo
* [What] Fix naming
[Why] Prepare to add out elementop
* Add DoutElementOp
Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: rocking <chunylai@amd.com>
[ROCm/composable_kernel commit: 0ffe956ab1]
This commit is contained in:
@@ -10,6 +10,15 @@
|
||||
#include "stream_config.hpp"
|
||||
#include "ck/options.hpp"
|
||||
|
||||
template <typename T>
|
||||
__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
|
||||
{
|
||||
for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
|
||||
{
|
||||
p[i] = x;
|
||||
}
|
||||
}
|
||||
|
||||
inline void hip_check_error(hipError_t x)
|
||||
{
|
||||
if(x != hipSuccess)
|
||||
@@ -30,6 +39,16 @@ struct DeviceMem
|
||||
void ToDevice(const void* p);
|
||||
void FromDevice(void* p);
|
||||
void SetZero();
|
||||
template <typename T>
|
||||
void SetValue(T x)
|
||||
{
|
||||
if(mMemSize % sizeof(T) != 0)
|
||||
{
|
||||
throw std::runtime_error("wrong! not entire DeviceMem will be set");
|
||||
}
|
||||
|
||||
set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
|
||||
}
|
||||
~DeviceMem();
|
||||
|
||||
void* mpDeviceBuf;
|
||||
@@ -74,8 +93,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
|
||||
printf("Warm up 1 time\n");
|
||||
|
||||
// warm up
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
||||
|
||||
printf("Start running %d times...\n", nrepeat);
|
||||
|
||||
@@ -84,8 +102,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
|
||||
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
||||
}
|
||||
|
||||
timer.End();
|
||||
@@ -94,13 +111,12 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
|
||||
}
|
||||
else
|
||||
{
|
||||
hipLaunchKernelGGL(
|
||||
kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
||||
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user