Bug fix in LPGEMV m=1 AVX2 kernel for post-ops

Details:
- Fixed loading of matadd and matmul pointers in GEMV
 lt16 kernel for AVX2 M=1 case.
- Hard-set the row stride of B to 1 (inside GEMV) when B has
   already been reordered.

AMD-Internal:CPUPL-7197, CPUPL-7221
Co-authored-by: Balasubramanian, Vignesh <Vignesh.Balasubramanian@amd.com>
This commit is contained in:
Vankadari, Meghana
2025-08-22 18:15:05 +05:30
committed by GitHub
parent d29f3f0b5e
commit 5044b69d3d
3 changed files with 22 additions and 11 deletions

View File

@@ -202,8 +202,14 @@ LPGEMV(float, float, float, f32f32f32of32)
}
#endif
// The vector is already contiguous if reordered.
if (mtag_b == REORDERED)
{
rs_b_use = 1;
cs_b_use = 1;
}
// Pack B matrix if rs_b > 1
if( rs_b != 1 )
else if (rs_b != 1)
{
mem_b_size_req = sizeof( float ) * k;

View File

@@ -179,9 +179,14 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
#ifdef BLIS_KERNELS_ZEN4
}
#endif
// Pack B matrix if rs_b > 1, ignoring the mtag_b here.
// The vector is already contiguous if reordered.
if (mtag_b == REORDERED)
{
rs_b_use = 1;
cs_b_use = 1;
}
// For tiny sizes, it is better to pack B if it affects output accuracy.
if( ( rs_b != 1 ) )
else if( ( rs_b != 1 ) )
{
siz_t mem_b_size_req = sizeof( float ) * k;
pack_b_buffer_f32f32f32of32 =

View File

@@ -502,10 +502,10 @@ POST_OPS_MATRIX_ADD_1x16F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
}
else
{
@@ -576,10 +576,10 @@ POST_OPS_MATRIX_MUL_1x16F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
}
else
{
@@ -1109,10 +1109,10 @@ POST_OPS_MATRIX_ADD_1x16F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
}
else
{
@@ -1183,10 +1183,10 @@ POST_OPS_MATRIX_MUL_1x16F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
}
else
{