mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Fixed out-of-bound access in F32 matrix add/mul ops (#168)
- Fixed the out-of-bounds access when loading scale factors in the matrix-add and matrix-mul post-ops of the f32 AVX512_256 kernels. [ AMD-Internal : CPUPL-7261 ]
This commit is contained in:
@@ -696,16 +696,16 @@ POST_OPS_MATRIX_ADD_5x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -849,16 +849,16 @@ POST_OPS_MATRIX_MUL_5x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1704,16 +1704,16 @@ POST_OPS_MATRIX_ADD_4x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1839,16 +1839,16 @@ POST_OPS_MATRIX_MUL_4x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -2576,16 +2576,16 @@ POST_OPS_MATRIX_ADD_3x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -2695,16 +2695,16 @@ POST_OPS_MATRIX_MUL_3x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -3316,16 +3316,16 @@ POST_OPS_MATRIX_ADD_2x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -3420,16 +3420,16 @@ POST_OPS_MATRIX_MUL_2x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -3922,16 +3922,16 @@ POST_OPS_MATRIX_ADD_1x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -4012,16 +4012,16 @@ POST_OPS_MATRIX_MUL_1x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -793,16 +793,16 @@ POST_OPS_MATRIX_ADD_6x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -963,16 +963,16 @@ POST_OPS_MATRIX_MUL_6x32F:
|
||||
{
|
||||
scl_fctr1 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 0 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 0 * 8 ) );
|
||||
scl_fctr2 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 1 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 1 * 8 ) );
|
||||
scl_fctr3 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 2 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 2 * 8 ) );
|
||||
scl_fctr4 =
|
||||
_mm256_loadu_ps( ( float* )post_ops_list_temp->scale_factor +
|
||||
post_ops_attr.post_op_c_j + ( 3 * 16 ) );
|
||||
post_ops_attr.post_op_c_j + ( 3 * 8 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user