Standardize Zen kernel names (2)

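Further changes to fix inconsistencies in the naming of Zen kernels: the zen4 dgemmsup edge-case kernels (24xN, 16xN and 8xN for N = 1..8, plus the 8x8m, 8x8m_lower/upper and 8x8m_lower/upper_mle8 variants) are renamed from `gemmsup_rv_zen4_asm_*` to `gemmsup_cv_zen4_asm_*`, and their call sites in the zen4/zen5 `cv` 24xNm kernels and their prototypes are updated to match.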

AMD-Internal: [CPUPL-6579]
This commit is contained in:
Smyth, Edward
2025-09-17 21:48:34 +01:00
committed by GitHub
parent e59eabaf58
commit e3b22f495e
12 changed files with 110 additions and 110 deletions

View File

@@ -50293,7 +50293,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x8(
bli_dgemmsup_cv_zen4_asm_24x8(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -50301,7 +50301,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x8(
bli_dgemmsup_cv_zen4_asm_16x8(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -50309,7 +50309,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x8(
bli_dgemmsup_cv_zen4_asm_8x8(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -51788,7 +51788,7 @@ void bli_dgemmsup_cv_zen4_asm_24x7m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x7(
bli_dgemmsup_cv_zen4_asm_24x7(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -51796,7 +51796,7 @@ void bli_dgemmsup_cv_zen4_asm_24x7m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x7(
bli_dgemmsup_cv_zen4_asm_16x7(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -51804,7 +51804,7 @@ void bli_dgemmsup_cv_zen4_asm_24x7m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x7(
bli_dgemmsup_cv_zen4_asm_8x7(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -53171,7 +53171,7 @@ void bli_dgemmsup_cv_zen4_asm_24x6m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x6(
bli_dgemmsup_cv_zen4_asm_24x6(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -53179,7 +53179,7 @@ void bli_dgemmsup_cv_zen4_asm_24x6m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x6(
bli_dgemmsup_cv_zen4_asm_16x6(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -53187,7 +53187,7 @@ void bli_dgemmsup_cv_zen4_asm_24x6m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x6(
bli_dgemmsup_cv_zen4_asm_8x6(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -54437,7 +54437,7 @@ void bli_dgemmsup_cv_zen4_asm_24x5m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x5(
bli_dgemmsup_cv_zen4_asm_24x5(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -54445,7 +54445,7 @@ void bli_dgemmsup_cv_zen4_asm_24x5m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x5(
bli_dgemmsup_cv_zen4_asm_16x5(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -54453,7 +54453,7 @@ void bli_dgemmsup_cv_zen4_asm_24x5m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x5(
bli_dgemmsup_cv_zen4_asm_8x5(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -55576,7 +55576,7 @@ void bli_dgemmsup_cv_zen4_asm_24x4m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x4(
bli_dgemmsup_cv_zen4_asm_24x4(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -55584,7 +55584,7 @@ void bli_dgemmsup_cv_zen4_asm_24x4m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x4(
bli_dgemmsup_cv_zen4_asm_16x4(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -55592,7 +55592,7 @@ void bli_dgemmsup_cv_zen4_asm_24x4m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x4(
bli_dgemmsup_cv_zen4_asm_8x4(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -56591,7 +56591,7 @@ void bli_dgemmsup_cv_zen4_asm_24x3m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x3(
bli_dgemmsup_cv_zen4_asm_24x3(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -56599,7 +56599,7 @@ void bli_dgemmsup_cv_zen4_asm_24x3m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x3(
bli_dgemmsup_cv_zen4_asm_16x3(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -56607,7 +56607,7 @@ void bli_dgemmsup_cv_zen4_asm_24x3m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x3(
bli_dgemmsup_cv_zen4_asm_8x3(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -57483,7 +57483,7 @@ void bli_dgemmsup_cv_zen4_asm_24x2m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x2(
bli_dgemmsup_cv_zen4_asm_24x2(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -57491,7 +57491,7 @@ void bli_dgemmsup_cv_zen4_asm_24x2m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x2(
bli_dgemmsup_cv_zen4_asm_16x2(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -57499,7 +57499,7 @@ void bli_dgemmsup_cv_zen4_asm_24x2m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x2(
bli_dgemmsup_cv_zen4_asm_8x2(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -58250,7 +58250,7 @@ void bli_dgemmsup_cv_zen4_asm_24x1m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x1(
bli_dgemmsup_cv_zen4_asm_24x1(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -58258,7 +58258,7 @@ void bli_dgemmsup_cv_zen4_asm_24x1m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x1(
bli_dgemmsup_cv_zen4_asm_16x1(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -58266,7 +58266,7 @@ void bli_dgemmsup_cv_zen4_asm_24x1m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x1(
bli_dgemmsup_cv_zen4_asm_8x1(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);

View File

@@ -304,7 +304,7 @@
else { STORE_COL_UPPER(M, n_rem) } \
c += 8 * rs_c; \
void bli_dgemmsup_rv_zen4_asm_8x8m
void bli_dgemmsup_cv_zen4_asm_8x8m
(
conj_t conja,
conj_t conjb,
@@ -353,7 +353,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8m
}
}
void bli_dgemmsup_rv_zen4_asm_8x8m_lower
void bli_dgemmsup_cv_zen4_asm_8x8m_lower
(
conj_t conja,
conj_t conjb,
@@ -402,7 +402,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8m_lower
}
}
void bli_dgemmsup_rv_zen4_asm_8x8m_upper
void bli_dgemmsup_cv_zen4_asm_8x8m_upper
(
conj_t conja,
conj_t conjb,
@@ -468,7 +468,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8m_upper
|********|
________
*/
void bli_dgemmsup_rv_zen4_asm_8x8m_lower_mle8
void bli_dgemmsup_cv_zen4_asm_8x8m_lower_mle8
(
conj_t conja,
conj_t conjb,
@@ -533,7 +533,7 @@ void bli_dgemmsup_rv_zen4_asm_8x8m_lower_mle8
|-------*|
________
*/
void bli_dgemmsup_rv_zen4_asm_8x8m_upper_mle8
void bli_dgemmsup_cv_zen4_asm_8x8m_upper_mle8
(
conj_t conja,
conj_t conjb,
@@ -691,7 +691,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m_lower_0
// call row major 8x8m upper diagonal kernel after
// inducing transpose to solve column major lower
// triangular GEMM
bli_dgemmsup_rv_zen4_asm_8x8m_upper_mle8
bli_dgemmsup_cv_zen4_asm_8x8m_upper_mle8
(
conjb,
conja,
@@ -801,7 +801,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m_lower_1
// call row major 8x8m upper diagonal kernel after
// inducing transpose to solve column major lower
// triangular GEMM
bli_dgemmsup_rv_zen4_asm_8x8m_upper_mle8
bli_dgemmsup_cv_zen4_asm_8x8m_upper_mle8
(
conjb,
conja,
@@ -911,7 +911,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m_lower_2
// call row major 8x8m upper diagonal kernel after
// inducing transpose to solve column major lower
// triangular GEMM
bli_dgemmsup_rv_zen4_asm_8x8m_upper_mle8
bli_dgemmsup_cv_zen4_asm_8x8m_upper_mle8
(
conjb,
conja,
@@ -1081,7 +1081,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m_upper_0
// call row major 8x8m lower diagonal kernel after
// inducing transpose to solve column major upper
// triangular GEMM
bli_dgemmsup_rv_zen4_asm_8x8m_lower_mle8
bli_dgemmsup_cv_zen4_asm_8x8m_lower_mle8
(
conjb,
conja,
@@ -1187,7 +1187,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m_upper_1
// call row major 8x8m lower diagonal kernel after
// inducing transpose to solve column major upper
// triangular GEMM
bli_dgemmsup_rv_zen4_asm_8x8m_lower_mle8
bli_dgemmsup_cv_zen4_asm_8x8m_lower_mle8
(
conjb,
conja,
@@ -1287,7 +1287,7 @@ void bli_dgemmsup_cv_zen4_asm_24x8m_upper_2
// call row major 8x8m lower diagonal kernel after
// inducing transpose to solve column major upper
// triangular GEMM
bli_dgemmsup_rv_zen4_asm_8x8m_lower_mle8
bli_dgemmsup_cv_zen4_asm_8x8m_lower_mle8
(
conjb,
conja,

View File

@@ -393,7 +393,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x1
void bli_dgemmsup_cv_zen4_asm_24x1
(
conj_t conja,
conj_t conjb,
@@ -1110,7 +1110,7 @@ void bli_dgemmsup_rv_zen4_asm_24x1
}
void bli_dgemmsup_rv_zen4_asm_16x1
void bli_dgemmsup_cv_zen4_asm_16x1
(
conj_t conja,
conj_t conjb,
@@ -1744,7 +1744,7 @@ void bli_dgemmsup_rv_zen4_asm_16x1
}
void bli_dgemmsup_rv_zen4_asm_8x1
void bli_dgemmsup_cv_zen4_asm_8x1
(
conj_t conja,
conj_t conjb,

View File

@@ -393,7 +393,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x2
void bli_dgemmsup_cv_zen4_asm_24x2
(
conj_t conja,
conj_t conjb,
@@ -1239,7 +1239,7 @@ void bli_dgemmsup_rv_zen4_asm_24x2
}
void bli_dgemmsup_rv_zen4_asm_16x2
void bli_dgemmsup_cv_zen4_asm_16x2
(
conj_t conja,
conj_t conjb,
@@ -1969,7 +1969,7 @@ void bli_dgemmsup_rv_zen4_asm_16x2
}
void bli_dgemmsup_rv_zen4_asm_8x2
void bli_dgemmsup_cv_zen4_asm_8x2
(
conj_t conja,
conj_t conjb,

View File

@@ -393,7 +393,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x3
void bli_dgemmsup_cv_zen4_asm_24x3
(
conj_t conja,
conj_t conjb,
@@ -1361,7 +1361,7 @@ void bli_dgemmsup_rv_zen4_asm_24x3
}
void bli_dgemmsup_rv_zen4_asm_16x3
void bli_dgemmsup_cv_zen4_asm_16x3
(
conj_t conja,
conj_t conjb,
@@ -2182,7 +2182,7 @@ void bli_dgemmsup_rv_zen4_asm_16x3
}
void bli_dgemmsup_rv_zen4_asm_8x3
void bli_dgemmsup_cv_zen4_asm_8x3
(
conj_t conja,
conj_t conjb,

View File

@@ -393,7 +393,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x4
void bli_dgemmsup_cv_zen4_asm_24x4
(
conj_t conja,
conj_t conjb,
@@ -1444,7 +1444,7 @@ void bli_dgemmsup_rv_zen4_asm_24x4
}
void bli_dgemmsup_rv_zen4_asm_16x4
void bli_dgemmsup_cv_zen4_asm_16x4
(
conj_t conja,
conj_t conjb,
@@ -2362,7 +2362,7 @@ void bli_dgemmsup_rv_zen4_asm_16x4
}
void bli_dgemmsup_rv_zen4_asm_8x4
void bli_dgemmsup_cv_zen4_asm_8x4
(
conj_t conja,
conj_t conjb,

View File

@@ -393,7 +393,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x5
void bli_dgemmsup_cv_zen4_asm_24x5
(
conj_t conja,
conj_t conjb,
@@ -1614,7 +1614,7 @@ void bli_dgemmsup_rv_zen4_asm_24x5
}
void bli_dgemmsup_rv_zen4_asm_16x5
void bli_dgemmsup_cv_zen4_asm_16x5
(
conj_t conja,
conj_t conjb,
@@ -2669,7 +2669,7 @@ void bli_dgemmsup_rv_zen4_asm_16x5
}
void bli_dgemmsup_rv_zen4_asm_8x5
void bli_dgemmsup_cv_zen4_asm_8x5
(
conj_t conja,
conj_t conjb,

View File

@@ -393,7 +393,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x6
void bli_dgemmsup_cv_zen4_asm_24x6
(
conj_t conja,
conj_t conjb,
@@ -1733,7 +1733,7 @@ void bli_dgemmsup_rv_zen4_asm_24x6
}
void bli_dgemmsup_rv_zen4_asm_16x6
void bli_dgemmsup_cv_zen4_asm_16x6
(
conj_t conja,
conj_t conjb,
@@ -2883,7 +2883,7 @@ void bli_dgemmsup_rv_zen4_asm_16x6
}
void bli_dgemmsup_rv_zen4_asm_8x6
void bli_dgemmsup_cv_zen4_asm_8x6
(
conj_t conja,
conj_t conjb,

View File

@@ -393,7 +393,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x7
void bli_dgemmsup_cv_zen4_asm_24x7
(
conj_t conja,
conj_t conjb,
@@ -1850,7 +1850,7 @@ void bli_dgemmsup_rv_zen4_asm_24x7
}
void bli_dgemmsup_rv_zen4_asm_16x7
void bli_dgemmsup_cv_zen4_asm_16x7
(
conj_t conja,
conj_t conjb,
@@ -3045,7 +3045,7 @@ void bli_dgemmsup_rv_zen4_asm_16x7
}
void bli_dgemmsup_rv_zen4_asm_8x7
void bli_dgemmsup_cv_zen4_asm_8x7
(
conj_t conja,
conj_t conjb,

View File

@@ -356,7 +356,7 @@
* Prefetch of A matrix is not done in edge-case kernels.
*/
void bli_dgemmsup_rv_zen4_asm_24x8
void bli_dgemmsup_cv_zen4_asm_24x8
(
conj_t conja,
conj_t conjb,
@@ -1930,7 +1930,7 @@ void bli_dgemmsup_rv_zen4_asm_24x8
}
void bli_dgemmsup_rv_zen4_asm_16x8
void bli_dgemmsup_cv_zen4_asm_16x8
(
conj_t conja,
conj_t conjb,
@@ -3208,7 +3208,7 @@ void bli_dgemmsup_rv_zen4_asm_16x8
}
void bli_dgemmsup_rv_zen4_asm_8x8
void bli_dgemmsup_cv_zen4_asm_8x8
(
conj_t conja,
conj_t conjb,

View File

@@ -304,12 +304,12 @@ GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x3m_new)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x2m_new)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x1m_new)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x8)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x8)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x8)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x8m)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x8m_lower)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x8m_upper)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x8)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x8)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x8)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x8m)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x8m_lower)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x8m_upper)
/* DGEMMT 24x8 triangular kernels */
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x8m_lower_0)
@@ -323,33 +323,33 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen4_asm_4x4m)
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen4_asm_4x4m_lower)
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen4_asm_4x4m_upper)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x7)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x7)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x7)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x7)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x7)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x7)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x6)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x6)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x6)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x6)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x6)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x6)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x5)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x5)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x5)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x5)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x5)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x5)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x4)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x4)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x4)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x4)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x4)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x4)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x3)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x3)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x3)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x3)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x3)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x3)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x2)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x2)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x2)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x2)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x2)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x2)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x1)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x1)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_24x1)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_16x1)
GEMMSUP_KER_PROT( double, d, gemmsup_cv_zen4_asm_8x1)
// Cgemm sup CV kernels
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x4m )

View File

@@ -1816,7 +1816,7 @@ void bli_dgemmsup_cv_zen5_asm_24x8m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x8(
bli_dgemmsup_cv_zen4_asm_24x8(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -1824,7 +1824,7 @@ void bli_dgemmsup_cv_zen5_asm_24x8m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x8(
bli_dgemmsup_cv_zen4_asm_16x8(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -1832,7 +1832,7 @@ void bli_dgemmsup_cv_zen5_asm_24x8m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x8(
bli_dgemmsup_cv_zen4_asm_8x8(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -3311,7 +3311,7 @@ void bli_dgemmsup_cv_zen5_asm_24x7m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x7(
bli_dgemmsup_cv_zen4_asm_24x7(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -3319,7 +3319,7 @@ void bli_dgemmsup_cv_zen5_asm_24x7m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x7(
bli_dgemmsup_cv_zen4_asm_16x7(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -3327,7 +3327,7 @@ void bli_dgemmsup_cv_zen5_asm_24x7m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x7(
bli_dgemmsup_cv_zen4_asm_8x7(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -4694,7 +4694,7 @@ void bli_dgemmsup_cv_zen5_asm_24x6m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x6(
bli_dgemmsup_cv_zen4_asm_24x6(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -4702,7 +4702,7 @@ void bli_dgemmsup_cv_zen5_asm_24x6m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x6(
bli_dgemmsup_cv_zen4_asm_16x6(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -4710,7 +4710,7 @@ void bli_dgemmsup_cv_zen5_asm_24x6m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x6(
bli_dgemmsup_cv_zen4_asm_8x6(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -5960,7 +5960,7 @@ void bli_dgemmsup_cv_zen5_asm_24x5m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x5(
bli_dgemmsup_cv_zen4_asm_24x5(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -5968,7 +5968,7 @@ void bli_dgemmsup_cv_zen5_asm_24x5m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x5(
bli_dgemmsup_cv_zen4_asm_16x5(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -5976,7 +5976,7 @@ void bli_dgemmsup_cv_zen5_asm_24x5m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x5(
bli_dgemmsup_cv_zen4_asm_8x5(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -7099,7 +7099,7 @@ void bli_dgemmsup_cv_zen5_asm_24x4m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x4(
bli_dgemmsup_cv_zen4_asm_24x4(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -7107,7 +7107,7 @@ void bli_dgemmsup_cv_zen5_asm_24x4m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x4(
bli_dgemmsup_cv_zen4_asm_16x4(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -7115,7 +7115,7 @@ void bli_dgemmsup_cv_zen5_asm_24x4m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x4(
bli_dgemmsup_cv_zen4_asm_8x4(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -8114,7 +8114,7 @@ void bli_dgemmsup_cv_zen5_asm_24x3m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x3(
bli_dgemmsup_cv_zen4_asm_24x3(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -8122,7 +8122,7 @@ void bli_dgemmsup_cv_zen5_asm_24x3m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x3(
bli_dgemmsup_cv_zen4_asm_16x3(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -8130,7 +8130,7 @@ void bli_dgemmsup_cv_zen5_asm_24x3m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x3(
bli_dgemmsup_cv_zen4_asm_8x3(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -9007,7 +9007,7 @@ void bli_dgemmsup_cv_zen5_asm_24x2m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x2(
bli_dgemmsup_cv_zen4_asm_24x2(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -9015,7 +9015,7 @@ void bli_dgemmsup_cv_zen5_asm_24x2m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x2(
bli_dgemmsup_cv_zen4_asm_16x2(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -9023,7 +9023,7 @@ void bli_dgemmsup_cv_zen5_asm_24x2m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x2(
bli_dgemmsup_cv_zen4_asm_8x2(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -9776,7 +9776,7 @@ void bli_dgemmsup_cv_zen5_asm_24x1m
// covers the range 16 < m_left <= 24 by using masked load/store instructions
if( 16 < m_left )
{
bli_dgemmsup_rv_zen4_asm_24x1(
bli_dgemmsup_cv_zen4_asm_24x1(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -9784,7 +9784,7 @@ void bli_dgemmsup_cv_zen5_asm_24x1m
// covers the range 8 < m_left <= 16 by using masked load/store instructions
else if( 8 < m_left )
{
bli_dgemmsup_rv_zen4_asm_16x1(
bli_dgemmsup_cv_zen4_asm_16x1(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);
@@ -9792,7 +9792,7 @@ void bli_dgemmsup_cv_zen5_asm_24x1m
// covers the range 0 < m_left <= 8 by using masked load/store instructions
else if( 0 < m_left )
{
bli_dgemmsup_rv_zen4_asm_8x1(
bli_dgemmsup_cv_zen4_asm_8x1(
conja, conjb, m_left, nr_cur, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
beta, cij, rs_c0, cs_c0, data, cntx);