mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Bug Fixes for GEMV AVX2 BF16 to F32 path
- Added the correct strides to be used while unreordering/converting the B matrix in m=1 cases. - Modified zero-point vector loads to use the proper instructions. - Modified the bf16 store in the AVX2 GEMV M kernel. AMD Internal - [SWLCSG - 3602]
This commit is contained in:
@@ -1004,6 +1004,15 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
|
||||
dim_t jc_cur_loop_rem = 0;
|
||||
dim_t n_sub_updated = 0;
|
||||
|
||||
dim_t nc0_updated = make_multiple_of_n( nc0, packb_min_NR );
|
||||
mem_b_size_req = sizeof( float ) * nc0_updated * k_updated;
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_b_size_req, BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_b, rntm
|
||||
);
|
||||
|
||||
if (mtag_b == REORDERED)
|
||||
{
|
||||
get_B_panel_reordered_start_offset_width
|
||||
@@ -1021,17 +1030,6 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
|
||||
dim_t kc0_updated = kc0;
|
||||
kc0_updated += ( kc0_updated & 0x1 );
|
||||
|
||||
dim_t nc0_updated = make_multiple_of_n( nc0, packb_min_NR );
|
||||
mem_b_size_req = sizeof( float ) * nc0_updated * k_updated;
|
||||
|
||||
n_sub_updated = nc0_updated;
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_b_size_req, BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_b, rntm
|
||||
);
|
||||
|
||||
if( mtag_b == REORDERED )
|
||||
{
|
||||
float *b_unreorder =
|
||||
@@ -1043,8 +1041,8 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
|
||||
unpackb_nr64_bf16_f32
|
||||
(
|
||||
( b + ( jc_cur_loop * k_updated ) +
|
||||
( jc_cur_loop_rem * kc0_updated) ),
|
||||
( b_unreorder ),
|
||||
( jc_cur_loop_rem * kc0_updated) + ( n_sub_updated * pc )),
|
||||
( b_unreorder + (nc0 * pc)),
|
||||
kc0, nc0 ,
|
||||
rs_b_use, cs_b_use, FALSE
|
||||
);
|
||||
@@ -1059,7 +1057,7 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
|
||||
|
||||
cvt_bf16_f32
|
||||
(
|
||||
( cvt_b_buffer_bf16_f32 ),
|
||||
( cvt_b_buffer_bf16_f32 + (nc0 * pc) ),
|
||||
( b + ( rs_b * pc ) + ( cs_b * jc ) ), rs_b, cs_b,
|
||||
kc0, nc0,
|
||||
rs_b_use, cs_b_use
|
||||
|
||||
Reference in New Issue
Block a user