mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 14:59:17 +00:00
implement device batched gemm b scale for wmma (#2825)
* rebased on top of develop * fixed missing shuffeling and wrong indexing * added tests for batched_b_scale * added missing files * fixed wrong stride computation and removed k batching (for now) due to precision issues * reinstated k-batching with PRNG constrained to -1..1 * added specialization of GeneratorTensor_3 for int4 and fixed internal overflow * added k-batching to reference and increased tolerances for test * changed gemm_b_scale and gemm_universal tests to use correct parameters * adressed review commentsd * ported fixes back to non-batched version of b_scale * adressed review comments * run clang-format on older commits * add type-conversion to AccDataType and then to CDataType to exactly mimic GPU's behavior * added newline at end of file * reflected changes from muitl-abd branch in batched b_scale * fixed gfx11 issue * changed range for pki4 to -1...1 (-0.5...0.5 never really made sense for i4 anyway and always should have caused compiler errors, but since there was no int4 specialization of GeneratorTensor3 until now, this passed * run clang format * set range of i4 generation to 0...1 for upstream tests to pass. This replicated previous behavior, which however means that it is NOT properly tested. * reduced range for pk_i4 even further to 0..0 * removed failing xld instances. Failure now uncovered now that tests were fixed * removed generation of int4 values entierly * divide B buffer by BPackedSize --------- Co-authored-by: Kevin Abraham <kevin.abraham@streamhpc.com>
This commit is contained in:
@@ -264,7 +264,7 @@ struct GeneratorTensor_2<ck::pk_i4_t>
|
||||
{
|
||||
int hi = std::rand() % (max_value - min_value) + min_value + 8;
|
||||
int lo = std::rand() % (max_value - min_value) + min_value + 8;
|
||||
ck::pk_i4_t r = ((hi << 4) + lo) & 0xff;
|
||||
ck::pk_i4_t r = (((hi & 0xf) << 4) + (lo & 0xf));
|
||||
return r;
|
||||
}
|
||||
};
|
||||
@@ -436,6 +436,22 @@ struct GeneratorTensor_3<ck::f4x2_pk_t>
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_3<ck::pk_i4_t>
|
||||
{
|
||||
int min_value = 0;
|
||||
int max_value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
ck::pk_i4_t operator()(Is...)
|
||||
{
|
||||
int hi = std::rand() % (max_value - min_value) + min_value + 8;
|
||||
int lo = std::rand() % (max_value - min_value) + min_value + 8;
|
||||
ck::pk_i4_t r = (((hi & 0xf) << 4) + (lo & 0xf));
|
||||
return r;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GeneratorTensor_3<ck::f6x32_pk_t>
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user