Added wmma support for gemm quantization: (#2841)

- profiler for gemm quantization for DL/XDL
- tests for gemm quantization for DL/XDL
- implementation for gemm quantization for WMMA
- profiler/tests for gemm quantization for WMMA

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
2026-04-20 06:49:15 +00:00 · 2025-09-17 01:23:29 +02:00
parent 2723dbd332
commit f97b2a3f5d
21 changed files with 1167 additions and 8 deletions
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_common.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#include <cstdint>
 #include <iostream>
 #include <sstream>

@@ -171,8 +172,8 @@ struct DeviceGemm_Wmma_CShuffleV3_Common
            // other hand, Split K for 16-bit outputs uses packed atomics so ScalarPerVectors cannot
            // be odd.
            constexpr bool AtomicsImplementationExists =
-                !(std::is_same_v<EDataType, ck::half_t> ||
-                  std::is_same_v<EDataType, ck::bhalf_t>) ||
+                !(std::is_same_v<EDataType, ck::half_t> || std::is_same_v<EDataType, ck::bhalf_t> ||
+                  std::is_same_v<EDataType, int8_t>) ||
                (CDEShuffleBlockTransferScalarPerVectors{}[0] % 2 == 0);

            if(has_main_k_block_loop)