Implemented a new set of f32 kernels that use 32 YMM registers

Details:
- These kernels are picked from the cntx when GEMM is invoked on a
  machine that supports AVX512 instructions and the AVX2 path is
  forced at run-time by setting AOCL_ENABLE_INSTRUCTIONS=AVX2.
- This path uses the same blocksizes and pack kernels as the AVX512
  path.
- GEMV is currently disabled on this path, as AVX2 kernels for GEMV
  are not implemented.

AMD-Internal: [SWLCSG-3519]
Change-Id: I75401fac48478fe99edb8e71fa44d36dd7513ae5
This commit is contained in:
Meghana Vankadari
2025-04-01 09:10:51 +00:00
parent 48c7452b08
commit 4745cf876e
11 changed files with 4722 additions and 192 deletions

View File

@@ -720,36 +720,36 @@ POST_OPS_MATRIX_ADD_5x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
// c[4:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
// c[4:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
}
}
else
@@ -760,36 +760,36 @@ POST_OPS_MATRIX_ADD_5x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -903,36 +903,36 @@ POST_OPS_MATRIX_MUL_5x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -1676,30 +1676,30 @@ POST_OPS_MATRIX_ADD_4x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
}
}
else
@@ -1710,30 +1710,30 @@ POST_OPS_MATRIX_ADD_4x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -1835,30 +1835,30 @@ POST_OPS_MATRIX_MUL_4x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -2495,24 +2495,24 @@ POST_OPS_MATRIX_ADD_3x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
}
}
else
@@ -2523,24 +2523,24 @@ POST_OPS_MATRIX_ADD_3x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -2630,24 +2630,24 @@ POST_OPS_MATRIX_MUL_3x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -3172,18 +3172,18 @@ POST_OPS_MATRIX_ADD_2x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
}
}
else
@@ -3194,18 +3194,18 @@ POST_OPS_MATRIX_ADD_2x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -3283,18 +3283,18 @@ POST_OPS_MATRIX_MUL_2x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -3723,12 +3723,12 @@ POST_OPS_MATRIX_ADD_1x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
}
}
else
@@ -3739,12 +3739,12 @@ POST_OPS_MATRIX_ADD_1x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -3813,12 +3813,12 @@ POST_OPS_MATRIX_MUL_1x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -4399,36 +4399,36 @@ POST_OPS_MATRIX_ADD_5x8F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,2,8);
// c[3:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,3,10);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,3,10);
// c[4:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,4,12);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,4,12);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr2,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr2,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr3,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr3,2,8);
// c[3:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr4,3,10);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr4,3,10);
// c[4:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr5,4,12);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr5,4,12);
}
}
else
@@ -5163,30 +5163,30 @@ POST_OPS_MATRIX_ADD_4x8F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,2,8);
// c[3:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,3,10);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,3,10);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr2,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr2,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr3,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr3,2,8);
// c[3:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr4,3,10);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr4,3,10);
}
}
else
@@ -5829,24 +5829,24 @@ POST_OPS_MATRIX_ADD_3x8F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,2,8);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr2,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr2,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr3,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr3,2,8);
}
}
else
@@ -6398,18 +6398,18 @@ POST_OPS_MATRIX_ADD_2x8F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,1,6);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr2,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr2,1,6);
}
}
else
@@ -6866,12 +6866,12 @@ POST_OPS_MATRIX_ADD_1x8F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
}
}
else

View File

@@ -266,6 +266,12 @@ multiply with Beta, and add to alpha*A*B*/
ymm ## r_ind0 = _mm256_add_ps( scr0, ymm ## r_ind0 ); \
ymm ## r_ind1 = _mm256_add_ps( scr1, ymm ## r_ind1 ); \
#define F32_MATRIX_ADD_4COL_YMM(scr0,scr1,scr2,scr3,m_ind,r_ind0,r_ind1,r_ind2,r_ind3) \
ymm ## r_ind0 = _mm256_add_ps( scr0, ymm ## r_ind0 ); \
ymm ## r_ind1 = _mm256_add_ps( scr1, ymm ## r_ind1 ); \
ymm ## r_ind2 = _mm256_add_ps( scr2, ymm ## r_ind2 ); \
ymm ## r_ind3 = _mm256_add_ps( scr3, ymm ## r_ind3 ); \
#define F32_F32_MATRIX_ADD_LOAD_XMM_1ELE(scr,scl_fct,m_ind,n_ind) \
scr = ( __m128 )_mm_load_ss \
( \
@@ -317,11 +323,18 @@ multiply with Beta, and add to alpha*A*B*/
#ifdef F32_F32_MATRIX_ADD_2COL
#undef F32_F32_MATRIX_ADD_2COL
#endif
#define F32_F32_MATRIX_ADD_2COL(scr0,scr1,scl_fct0,scl_fct1,m_ind,r_ind0,r_ind1) \
#define F32_F32_MATRIX_ADD_2COL_YMM(scr0,scr1,scl_fct0,scl_fct1,m_ind,r_ind0,r_ind1) \
F32_F32_MATRIX_ADD_LOAD_YMM(scr0,scl_fct0,m_ind,0); \
F32_F32_MATRIX_ADD_LOAD_YMM(scr1,scl_fct1,m_ind,1); \
F32_MATRIX_ADD_2COL_YMM(scr0,scr1,m_ind,r_ind0,r_ind1); \
#define F32_F32_MATRIX_ADD_4COL_YMM(scr0,scr1,scr2,scr3,scl_fct0,scl_fct1,scl_fct2,scl_fct3,m_ind,r_ind0,r_ind1,r_ind2,r_ind3) \
F32_F32_MATRIX_ADD_LOAD_YMM(scr0,scl_fct0,m_ind,0); \
F32_F32_MATRIX_ADD_LOAD_YMM(scr1,scl_fct1,m_ind,1); \
F32_F32_MATRIX_ADD_LOAD_YMM(scr2,scl_fct2,m_ind,2); \
F32_F32_MATRIX_ADD_LOAD_YMM(scr3,scl_fct3,m_ind,3); \
F32_MATRIX_ADD_4COL_YMM(scr0,scr1,scr2,scr3,m_ind,r_ind0,r_ind1, r_ind2, r_ind3); \
//Matrix-Add helpers for BF16 input.
#define BF16_F32_MATRIX_ADD_LOAD_YMM(scr,scl_fct,m_ind,n_ind) \
scr = (__m256)( _mm256_sllv_epi32 \
@@ -338,15 +351,12 @@ multiply with Beta, and add to alpha*A*B*/
); \
scr = _mm256_mul_ps( scr, scl_fct ); \
#ifdef BF16_F32_MATRIX_ADD_2COL
#undef BF16_F32_MATRIX_ADD_2COL
#endif
#define BF16_F32_MATRIX_ADD_2COL(scr0,scr1,scl_fct0,scl_fct1,m_ind,r_ind0,r_ind1) \
#define BF16_F32_MATRIX_ADD_2COL_YMM(scr0,scr1,scl_fct0,scl_fct1,m_ind,r_ind0,r_ind1) \
BF16_F32_MATRIX_ADD_LOAD_YMM(scr0,scl_fct0,m_ind,0); \
BF16_F32_MATRIX_ADD_LOAD_YMM(scr1,scl_fct1,m_ind,1); \
F32_MATRIX_ADD_2COL_YMM(scr0,scr1,m_ind,r_ind0,r_ind1); \
#define BF16_F32_MATRIX_ADD_1COL(scr0,scl_fct0,m_ind,r_ind0) \
#define BF16_F32_MATRIX_ADD_1COL_YMM(scr0,scl_fct0,m_ind,r_ind0) \
BF16_F32_MATRIX_ADD_LOAD_YMM(scr0,scl_fct0,m_ind,0); \
F32_MATRIX_ADD_1COL_YMM(scr0,m_ind,r_ind0); \
@@ -424,6 +434,12 @@ multiply with Beta, and add to alpha*A*B*/
ymm ## r_ind0 = _mm256_mul_ps( scr0, ymm ## r_ind0 ); \
ymm ## r_ind1 = _mm256_mul_ps( scr1, ymm ## r_ind1 ); \
#define F32_MATRIX_MUL_4COL_YMM(scr0,scr1,scr2,scr3,m_ind,r_ind0,r_ind1,r_ind2,r_ind3) \
ymm ## r_ind0 = _mm256_mul_ps( scr0, ymm ## r_ind0 ); \
ymm ## r_ind1 = _mm256_mul_ps( scr1, ymm ## r_ind1 ); \
ymm ## r_ind2 = _mm256_mul_ps( scr2, ymm ## r_ind2 ); \
ymm ## r_ind3 = _mm256_mul_ps( scr3, ymm ## r_ind3 ); \
#define F32_F32_MATRIX_MUL_LOAD_XMM_1ELE(scr,scl_fct,m_ind,n_ind) \
scr = ( __m128 )_mm_load_ss \
( \
@@ -472,14 +488,18 @@ multiply with Beta, and add to alpha*A*B*/
F32_F32_MATRIX_MUL_LOAD_YMM(scr0,scl_fct0,m_ind,0); \
F32_MATRIX_MUL_1COL_YMM(scr0,m_ind,r_ind0); \
#ifdef F32_F32_MATRIX_MUL_2COL
#undef F32_F32_MATRIX_MUL_2COL
#endif
#define F32_F32_MATRIX_MUL_2COL(scr0,scr1,scl_fct0,scl_fct1,m_ind,r_ind0,r_ind1) \
#define F32_F32_MATRIX_MUL_2COL_YMM(scr0,scr1,scl_fct0,scl_fct1,m_ind,r_ind0,r_ind1) \
F32_F32_MATRIX_MUL_LOAD_YMM(scr0,scl_fct0,m_ind,0); \
F32_F32_MATRIX_MUL_LOAD_YMM(scr1,scl_fct1,m_ind,1); \
F32_MATRIX_MUL_2COL_YMM(scr0,scr1,m_ind,r_ind0,r_ind1); \
#define F32_F32_MATRIX_MUL_4COL_YMM(scr0,scr1,scr2,scr3,scl_fct0,scl_fct1,scl_fct2,scl_fct3,m_ind,r_ind0,r_ind1,r_ind2,r_ind3) \
F32_F32_MATRIX_MUL_LOAD_YMM(scr0,scl_fct0,m_ind,0); \
F32_F32_MATRIX_MUL_LOAD_YMM(scr1,scl_fct1,m_ind,1); \
F32_F32_MATRIX_MUL_LOAD_YMM(scr2,scl_fct2,m_ind,2); \
F32_F32_MATRIX_MUL_LOAD_YMM(scr3,scl_fct3,m_ind,3); \
F32_MATRIX_MUL_4COL_YMM(scr0,scr1,scr2,scr3,m_ind,r_ind0,r_ind1, r_ind2,r_ind3); \
//BF16->F32 Matrix Mul Helpers
#define BF16_F32_MATRIX_MUL_LOAD_XMM_1ELE(scr,scl_fct,m_ind,n_ind) \
BF16_F32_MATRIX_ADD_LOAD_XMM_1ELE(scr,scl_fct,m_ind,n_ind) \
@@ -517,6 +537,7 @@ multiply with Beta, and add to alpha*A*B*/
BF16_F32_MATRIX_MUL_LOAD_YMM(scr1,scl_fct1,m_ind,1); \
F32_MATRIX_MUL_2COL_YMM(scr0,scr1,m_ind,r_ind0,r_ind1); \
// TANH
#define TANH_F32S_AVX2(reg, r, r2, x, z, dn, q) \
\

View File

@@ -919,42 +919,42 @@ POST_OPS_MATRIX_ADD_6x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
// c[4:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
// c[5:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,5,14,15);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,5,14,15);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
// c[4:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
// c[5:0-15]
BF16_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr6,scl_fctr6,5,14,15);
BF16_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr6,scl_fctr6,5,14,15);
}
}
else
@@ -965,42 +965,42 @@ POST_OPS_MATRIX_ADD_6x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
// c[5:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,5,14,15);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,5,14,15);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
// c[5:0-15]
F32_F32_MATRIX_ADD_2COL(ymm1,ymm2,scl_fctr6,scl_fctr6,5,14,15);
F32_F32_MATRIX_ADD_2COL_YMM(ymm1,ymm2,scl_fctr6,scl_fctr6,5,14,15);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -1126,42 +1126,42 @@ POST_OPS_MATRIX_MUL_6x16F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,4,12,13);
// c[5:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr2,5,14,15);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr2,5,14,15);
}
else
{
// c[0:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr1,scl_fctr1,0,4,5);
// c[1:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr2,scl_fctr2,1,6,7);
// c[2:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr3,scl_fctr3,2,8,9);
// c[3:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr4,scl_fctr4,3,10,11);
// c[4:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr5,scl_fctr5,4,12,13);
// c[5:0-15]
F32_F32_MATRIX_MUL_2COL(ymm1,ymm2,scl_fctr6,scl_fctr6,5,14,15);
F32_F32_MATRIX_MUL_2COL_YMM(ymm1,ymm2,scl_fctr6,scl_fctr6,5,14,15);
}
}
POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
@@ -1962,42 +1962,42 @@ POST_OPS_MATRIX_ADD_6x8F:
( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,2,8);
// c[3:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,3,10);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,3,10);
// c[4:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,4,12);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,4,12);
// c[5:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,5,14);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,5,14);
}
else
{
// c[0:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr1,0,4);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr1,0,4);
// c[1:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr2,1,6);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr2,1,6);
// c[2:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr3,2,8);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr3,2,8);
// c[3:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr4,3,10);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr4,3,10);
// c[4:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr5,4,12);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr5,4,12);
// c[5:0-15]
BF16_F32_MATRIX_ADD_1COL(ymm1,scl_fctr6,5,14);
BF16_F32_MATRIX_ADD_1COL_YMM(ymm1,scl_fctr6,5,14);
}
}
else

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff