Fixed out-of-bound access in F32 matrix add/mul ops (#168)

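- Fixed an out-of-bounds access when loading the scale factors in the
 matrix-add and matrix-mul post-ops of the f32 AVX512_256 kernels.
 Each _mm256_loadu_ps reads 8 floats, so successive scale-factor loads
 must advance by 8 elements rather than 16.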

[ AMD-Internal : CPUPL-7261 ]
This commit is contained in:
V, Varsha
2025-09-01 16:40:03 +05:30
committed by GitHub
parent fb2a682725
commit c5bd1feabd
2 changed files with 48 additions and 48 deletions

View File

@@ -696,16 +696,16 @@ POST_OPS_MATRIX_ADD_5x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -849,16 +849,16 @@ POST_OPS_MATRIX_MUL_5x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -1704,16 +1704,16 @@ POST_OPS_MATRIX_ADD_4x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -1839,16 +1839,16 @@ POST_OPS_MATRIX_MUL_4x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -2576,16 +2576,16 @@ POST_OPS_MATRIX_ADD_3x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -2695,16 +2695,16 @@ POST_OPS_MATRIX_MUL_3x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -3316,16 +3316,16 @@ POST_OPS_MATRIX_ADD_2x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -3420,16 +3420,16 @@ POST_OPS_MATRIX_MUL_2x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -3922,16 +3922,16 @@ POST_OPS_MATRIX_ADD_1x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -4012,16 +4012,16 @@ POST_OPS_MATRIX_MUL_1x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{

View File

@@ -793,16 +793,16 @@ POST_OPS_MATRIX_ADD_6x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{
@@ -963,16 +963,16 @@ POST_OPS_MATRIX_MUL_6x32F:
{
scl_fctr1 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
scl_fctr2 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
scl_fctr3 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
scl_fctr4 =
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
}
else
{