mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Bug fix in LPGEMV m=1 AVX2 kernel for post-ops
Details: - Fixed loading of matadd and matmul pointers in the GEMV lt16 kernel for the AVX2 M=1 case. - Hard-set the row-stride of B to 1 (inside GEMV) when B has already been reordered. AMD-Internal: CPUPL-7197, CPUPL-7221 Co-authored-by: Balasubramanian, Vignesh <Vignesh.Balasubramanian@amd.com>
This commit is contained in:
committed by
GitHub
parent
d29f3f0b5e
commit
5044b69d3d
@@ -202,8 +202,14 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
}
|
||||
#endif
|
||||
|
||||
// The vector is already contiguous if reordered.
|
||||
if (mtag_b == REORDERED)
|
||||
{
|
||||
rs_b_use = 1;
|
||||
cs_b_use = 1;
|
||||
}
|
||||
// Pack B matrix if rs_b > 1
|
||||
if( rs_b != 1 )
|
||||
else if (rs_b != 1)
|
||||
{
|
||||
mem_b_size_req = sizeof( float ) * k;
|
||||
|
||||
|
||||
@@ -179,9 +179,14 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
}
|
||||
#endif
|
||||
// Pack B matrix if rs_b > 1, ignoring the mtag_b here.
|
||||
// The vector is already contiguous if reordered.
|
||||
if (mtag_b == REORDERED)
|
||||
{
|
||||
rs_b_use = 1;
|
||||
cs_b_use = 1;
|
||||
}
|
||||
// For tiny sizes, it is better to pack B if it affects output accuracy.
|
||||
if( ( rs_b != 1 ) )
|
||||
else if( ( rs_b != 1 ) )
|
||||
{
|
||||
siz_t mem_b_size_req = sizeof( float ) * k;
|
||||
pack_b_buffer_f32f32f32of32 =
|
||||
|
||||
@@ -502,10 +502,10 @@ POST_OPS_MATRIX_ADD_1x16F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -576,10 +576,10 @@ POST_OPS_MATRIX_MUL_1x16F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1109,10 +1109,10 @@ POST_OPS_MATRIX_ADD_1x16F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1183,10 +1183,10 @@ POST_OPS_MATRIX_MUL_1x16F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user