CGEMM examples bf16, fp32, int8 (#332)

* Add int8 specialization for elementwise Add and Subtract.
* CGEMM examples bf16, fp32, int8
* Convert reference output to CDataType.
* Skip BF16 data type during testing.
* Lower K value to get rid of accumulation error.
* Fix merge artifact.
* Fix changed function name: GetElementSpaceSize()
* Fix merge artifact.

Co-authored-by: Adam Osewski <aosewski@amd.com>
2026-05-21 21:39:15 +00:00 · 2022-08-02 21:52:27 +02:00
parent 984b3722bf
commit fb0dc35861
9 changed files with 649 additions and 182 deletions
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
@@ -6,8 +6,9 @@
 #include <iostream>
 #include <sstream>

-#include "ck/tensor_operation/gpu/device/device_base.hpp"
 #include "ck/library/utility/host_tensor.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -91,7 +92,7 @@ struct ReferenceCGemm : public device::BaseOperator
                    v_c_real += v_a_real * v_b_real - v_a_imag * v_b_imag;
                }

-                arg.c_m_n_real_(m, n) = v_c_real;
+                arg.c_m_n_real_(m, n) = ck::type_convert<CDataType>(v_c_real);
            };

            auto f_mk_kn_mn_imag = [&](auto m, auto n) {
@@ -107,7 +108,7 @@ struct ReferenceCGemm : public device::BaseOperator
                    v_c_imag += v_a_real * v_b_imag + v_a_imag * v_b_real;
                }

-                arg.c_m_n_imag_(m, n) = v_c_imag;
+                arg.c_m_n_imag_(m, n) = ck::type_convert<CDataType>(v_c_imag);
            };

            make_ParallelTensorFunctor(f_mk_kn_mn_real,