Added low precision GEMM - bf16bf16f32of32

Feature Addition: Added a new variant of low-precision GEMM to the addon - BFloat16. The kernel takes bf16-type inputs and performs BF16 GEMM operations. The intermediate accumulation and the output are in float. 1. Compute kernels will perform computations only if the B matrix is reordered in accordance with the usage of the AVX-512 BF16 instruction - dpbf16_ps. 2. A kernel for packing the B matrix is provided. Change-Id: If5d08213068869eff060c9998596d2d2703a6793
2026-05-03 22:11:12 +00:00 · 2022-08-17 08:25:30 +00:00
parent 219c41ded9
commit 4e3e00fb7e
23 changed files with 7942 additions and 734 deletions
--- a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h
+++ b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h
@@ -37,6 +37,7 @@

 #include "lpgemm_types.h"
 #include "lpgemm_post_ops.h"
+#include "aocl_bf16_type.h"

 #define LPGEMM_5LOOP(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
@@ -64,4 +65,5 @@ void lpgemm_rowvar_ ## LP_SFX \
 LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32);
 LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16);
 LPGEMM_5LOOP(float,float,float,f32f32f32of32);
+LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32);
 #endif // LPGEMM_5LOOP_INTF_H