implement device batched gemm b scale for wmma (#2825)

* rebased on top of develop * fixed missing shuffling and wrong indexing * added tests for batched_b_scale * added missing files * fixed wrong stride computation and removed k batching (for now) due to precision issues * reinstated k-batching with PRNG constrained to -1..1 * added specialization of GeneratorTensor_3 for int4 and fixed internal overflow * added k-batching to reference and increased tolerances for test * changed gemm_b_scale and gemm_universal tests to use correct parameters * addressed review comments * ported fixes back to non-batched version of b_scale * addressed review comments * run clang-format on older commits * add type-conversion to AccDataType and then to CDataType to exactly mimic GPU's behavior * added newline at end of file * reflected changes from multi-abd branch in batched b_scale * fixed gfx11 issue * changed range for pki4 to -1...1 (-0.5...0.5 never really made sense for i4 anyway and always should have caused compiler errors, but since there was no int4 specialization of GeneratorTensor_3 until now, this passed) * run clang format * set range of i4 generation to 0...1 for upstream tests to pass. This replicated previous behavior, which however means that it is NOT properly tested. * reduced range for pk_i4 even further to 0..0 * removed failing xld instances. Failure uncovered now that tests were fixed * removed generation of int4 values entirely * divide B buffer by BPackedSize --------- Co-authored-by: Kevin Abraham <kevin.abraham@streamhpc.com> [ROCm/composable_kernel commit: c4b2da9cbd]
2026-05-20 04:49:54 +00:00 · 2025-10-16 20:00:42 +02:00
parent 62afd9eb14
commit 06d76b160e
22 changed files with 1352 additions and 97 deletions
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -67,7 +67,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
  list(APPEND PROFILER_OPS profile_gemm_add.cpp)
  list(APPEND PROFILER_OPS profile_gemm_bias_add_reduce.cpp)
  list(APPEND PROFILER_OPS profile_gemm_splitk.cpp)
-  list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp)
  list(APPEND PROFILER_OPS profile_gemm_universal_batched.cpp)
  list(APPEND PROFILER_OPS profile_gemm_universal_streamk.cpp)
  list(APPEND PROFILER_OPS profile_conv_fwd_bias_relu.cpp)
@@ -89,6 +88,7 @@ endif()
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")
  list(APPEND PROFILER_OPS profile_gemm_universal.cpp)
  list(APPEND PROFILER_OPS profile_batched_gemm.cpp)
+  list(APPEND PROFILER_OPS profile_batched_gemm_b_scale.cpp)
  list(APPEND PROFILER_OPS profile_gemm_b_scale.cpp)
  list(APPEND PROFILER_OPS profile_gemm_universal_reduce.cpp)
  list(APPEND PROFILER_OPS profile_grouped_conv_fwd.cpp)
@@ -191,7 +191,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
    list(APPEND DEVICE_INSTANCES device_gemm_mx_instance)
  endif()
  list(APPEND DEVICE_INSTANCES device_gemm_splitk_instance)
-  list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance)
  list(APPEND DEVICE_INSTANCES device_gemm_universal_batched_instance)
  list(APPEND DEVICE_INSTANCES device_gemm_universal_streamk_instance)
  list(APPEND DEVICE_INSTANCES device_gemm_add_multiply_instance)
@@ -229,6 +228,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[1
  list(APPEND DEVICE_INSTANCES device_batched_gemm_instance)
  list(APPEND DEVICE_INSTANCES device_gemm_b_scale_instance)
  list(APPEND DEVICE_INSTANCES device_gemm_universal_reduce_instance)
+  list(APPEND DEVICE_INSTANCES device_batched_gemm_b_scale_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv3d_fwd_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv2d_bwd_data_instance)
  list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_data_instance)
--- a/profiler/src/profile_batched_gemm_b_scale.cpp
+++ b/profiler/src/profile_batched_gemm_b_scale.cpp
@@ -57,7 +57,7 @@ int profile_batched_gemm_b_scale(int argc, char* argv[])
        printf("arg6: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg7: print tensor value (0: no; 1: yes)\n");
        printf("arg8: time kernel (0=no, 1=yes)\n");
-        printf("arg9 to 15: M, N, K, StrideA, StrideB, StrideC, BatachCount\n");
+        printf("arg9 to 15: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
        printf("arg16: split k into mulitiple batch\n");
        printf("optional:\n");
        printf("arg17: number of warm-up cycles (default 1)\n");