mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 21:09:08 +00:00
Added bwd data v3r1 v4r1, tweaking v1 (#10)
* Added bwd data v3r1: breaking down compute into a series of load balanced GEMM, and launch in a single kernel * Added bwd data v4r1: like v3r1, but launch GEMMs in multiple kernels * Tweaked v1r1 and v1r2 (atomic) on AMD GPU
This commit is contained in:
@@ -49,6 +49,12 @@ struct integer_divide_ceiler
|
||||
}
|
||||
};
|
||||
|
||||
template <class X, class Y>
|
||||
__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
|
||||
{
|
||||
return x / y;
|
||||
}
|
||||
|
||||
template <class X, class Y>
|
||||
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user