Standardize Zen kernel names

Naming of Zen kernels and associated files was inconsistent with BLIS
conventions for other sub-configurations and between different Zen
generations. Other anomalies existed, e.g. the dgemmsup 24x
column-preferred kernels were named with _rv_ instead of _cv_. This
patch renames the kernels and their files to address these issues.

AMD-Internal: [CPUPL-6579]
This commit is contained in:
Smyth, Edward
2025-08-19 18:19:51 +01:00
committed by GitHub
parent aa95a8ce4a
commit 509aa07785
145 changed files with 2247 additions and 1984 deletions

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -118,8 +118,8 @@ void bli_cntx_init_haswell( cntx_t* cntx )
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
@@ -134,8 +134,8 @@ void bli_cntx_init_haswell( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
#endif
cntx
);

View File

@@ -89,8 +89,8 @@ void bli_cntx_init_knl( cntx_t* cntx )
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
@@ -103,8 +103,8 @@ void bli_cntx_init_knl( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
#endif
cntx
);

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2023 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -82,8 +82,8 @@ void bli_cntx_init_skx( cntx_t* cntx )
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
@@ -96,8 +96,8 @@ void bli_cntx_init_skx( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
#endif
cntx
);
@@ -133,23 +133,23 @@ void bli_cntx_init_skx( cntx_t* cntx )
bli_cntx_set_l3_sup_kers
(
30,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -114,22 +114,22 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpbyv
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_5,
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_5,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
@@ -138,14 +138,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int,
// scalv
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int,
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -129,22 +129,22 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpbyv
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_5,
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_10,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_5,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
@@ -153,14 +153,14 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int,
// scalv
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int,
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -132,22 +132,22 @@ void bli_cntx_init_zen3( cntx_t* cntx )
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpbyv
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_5,
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_10,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_5,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
@@ -156,14 +156,14 @@ void bli_cntx_init_zen3( cntx_t* cntx )
BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int,
// scalv
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int,
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,

View File

@@ -80,14 +80,14 @@ void bli_cntx_init_zen4( cntx_t* cntx )
13,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen4_asm_24x4, FALSE,
/*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4, FALSE,
// Different GEMM kernels are used for TRSM for zen4 architecture
BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_4x12, TRUE,
// gemmtrsm_l
@@ -156,29 +156,29 @@ void bli_cntx_init_zen4( cntx_t* cntx )
(
32,
// addv
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen_int_avx512,
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen4_int,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen4_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen4_int,
// axpbyv
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_avx512,
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen4_int,
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_avx512,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_avx512,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_avx512,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen4_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen4_int,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen4_int,
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_avx512,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_avx512,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_avx512,
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen4_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen4_int,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen4_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
@@ -186,27 +186,27 @@ void bli_cntx_init_zen4( cntx_t* cntx )
BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int,
// scalv
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen4_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen4_int,
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen4_int,
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen4_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm_avx512,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen4_asm_avx512,
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm_avx512,
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen4_asm,
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm,
// setv
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int_avx512,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int_avx512,
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen_int_avx512,
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen4_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen4_int,
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen4_int,
// scal2v
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen_int_avx512,
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen4_int,
BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int,
cntx
);
@@ -299,23 +299,23 @@ void bli_cntx_init_zen4( cntx_t* cntx )
bli_cntx_set_l3_sup_kers
(
32,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
BLIS_RRC, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
@@ -397,14 +397,14 @@ void bli_cntx_init_zen4( cntx_t* cntx )
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,

View File

@@ -82,14 +82,14 @@ void bli_cntx_init_zen5( cntx_t* cntx )
13,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen4_asm_24x4, FALSE,
/*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4, FALSE,
// Different GEMM kernels are used for TRSM for zen4 architecture
BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_4x12, TRUE,
// gemmtrsm_l
@@ -158,29 +158,29 @@ void bli_cntx_init_zen5( cntx_t* cntx )
(
32,
// addv
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen_int_avx512,
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen4_int,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen4_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen4_int,
// axpbyv
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_avx512,
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen4_int,
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_avx512,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_avx512,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_avx512,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen4_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen4_int,
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen4_int,
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_avx512,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_avx512,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_avx512,
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen4_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen4_int,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen4_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
@@ -188,27 +188,27 @@ void bli_cntx_init_zen5( cntx_t* cntx )
BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int,
// scalv
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int_avx512,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen4_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen4_int,
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen4_int,
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen4_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm_avx512,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen5_asm_avx512,
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm_avx512,
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen5_asm,
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm,
// setv
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int_avx512,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int_avx512,
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen_int_avx512,
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen4_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen4_int,
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen4_int,
// scal2v
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen_int_avx512,
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen4_int,
BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int,
cntx
);
@@ -301,23 +301,23 @@ void bli_cntx_init_zen5( cntx_t* cntx )
bli_cntx_set_l3_sup_kers
(
32,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
BLIS_RRC, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
@@ -398,14 +398,14 @@ void bli_cntx_init_zen5( cntx_t* cntx )
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,

View File

@@ -307,8 +307,8 @@ void bli_dgemv_unf_var1
case BLIS_ARCH_ZEN5:
#if defined(BLIS_KERNELS_ZEN5)
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen5_asm_avx512; // DCOPYV
scalv_kr_ptr = bli_dscalv_zen4_int; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen5_asm; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 12000;
#endif
@@ -318,8 +318,8 @@ void bli_dgemv_unf_var1
#if defined(BLIS_KERNELS_ZEN4)
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen4_asm_avx512; // DCOPYV
scalv_kr_ptr = bli_dscalv_zen4_int; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen4_asm; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 11000;
#endif
@@ -925,7 +925,7 @@ void bli_zgemv_unf_var1
factor of DOTXF kernel
*/
dotxf_kr_ptr = bli_zdotxf_zen_int_8_avx512;
dotxf_kr_ptr = bli_zdotxf_zen4_int_8;
b_fuse = 8;
scal2v_kr_ptr = bli_zscal2v_zen_int;

View File

@@ -330,7 +330,7 @@ void bli_dgemv_unf_var2 (
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
bli_dgemv_n_avx2(
bli_dgemv_n_zen(
transa,
conjx,
m,
@@ -630,7 +630,7 @@ void bli_sgemv_unf_var2
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
bli_sscalv_zen_int10
bli_sscalv_zen_int_10
(
BLIS_NO_CONJUGATE,
n_elem,
@@ -736,7 +736,7 @@ void bli_zgemv_unf_var2
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
axpyf_kr_ptr = bli_zaxpyf_zen_int_8_avx512;
axpyf_kr_ptr = bli_zaxpyf_zen4_int_8;
b_fuse = 8;
scal2v_kr_ptr = bli_zscal2v_zen_int;
@@ -745,7 +745,7 @@ void bli_zgemv_unf_var2
copyv_kr_ptr = bli_zcopyv_zen_int;
setv_kr_ptr = bli_zsetv_zen_int_avx512;
setv_kr_ptr = bli_zsetv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -308,7 +308,7 @@ void bli_dtrsv_unf_var1
#if defined(BLIS_KERNELS_ZEN4)
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
kfp_df = bli_ddotxf_zen_int_avx512;
kfp_df = bli_ddotxf_zen4_int;
b_fuse = 8;
break;
#endif

View File

@@ -313,12 +313,12 @@ void bli_dtrsv_unf_var2
{
if ( m < 2500 )
{
kfp_af = bli_daxpyf_zen_int8_avx512;
kfp_af = bli_daxpyf_zen4_int_8;
b_fuse = 8;
}
else
{
kfp_af = bli_daxpyf_zen_int12_avx512;
kfp_af = bli_daxpyf_zen4_int_12;
b_fuse = 12;
}
#if defined(BLIS_ENABLE_OPENMP)
@@ -331,7 +331,7 @@ void bli_dtrsv_unf_var2
// If NT == 1, don't use MT kernel.
if ( n_threads > 1 )
{
kfp_af = bli_daxpyf_zen_int32_avx512_mt;
kfp_af = bli_daxpyf_zen4_int_32_mt;
b_fuse = 32;
}
}

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -189,7 +189,7 @@ void bli_gemm_ker_var2
( bli_obj_is_real( b ) ) // check if B is real
)
{
bli_dgemm_avx512_asm_8x24_macro_kernel
bli_dgemm_zen4_asm_8x24_macro_kernel
(
n, m, k, buf_c, buf_a, buf_b, rs_c, buf_beta
);

View File

@@ -410,7 +410,7 @@ err_t bli_dgemm_tiny
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_FAMILY_ZEN5) || defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
return bli_dgemm_tiny_24x8
return bli_dgemm_tiny_zen4_24x8
(
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
@@ -431,7 +431,7 @@ err_t bli_dgemm_tiny
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
return bli_dgemm_tiny_6x8
return bli_dgemm_tiny_zen_6x8
(
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
@@ -466,7 +466,7 @@ err_t bli_dgemm_tiny
((m + k-n) < 1500) && ((n + k-m) < 1500) ) ||
((n <= 100) && (k <=100)))))
{
return bli_dgemm_tiny_24x8
return bli_dgemm_tiny_zen4_24x8
(
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
@@ -490,7 +490,7 @@ err_t bli_dgemm_tiny
case BLIS_ARCH_ZEN3:
if( ( (m <= 8) || ( (m <= 1000) && (n <= 24) && (k >= 4) ) ) && (k <= 1500) )
{
return bli_dgemm_tiny_6x8
return bli_dgemm_tiny_zen_6x8
(
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -81,17 +81,17 @@ typedef void (*gemmt_ker_ft)
#if defined(BLIS_KERNELS_ZEN4)
gemmt_ker_ft ker_fpus_zen4[3] =
{
bli_dgemmsup_rv_zen4_asm_24x8m_upper_0,
bli_dgemmsup_rv_zen4_asm_24x8m_upper_1,
bli_dgemmsup_rv_zen4_asm_24x8m_upper_2
bli_dgemmsup_cv_zen4_asm_24x8m_upper_0,
bli_dgemmsup_cv_zen4_asm_24x8m_upper_1,
bli_dgemmsup_cv_zen4_asm_24x8m_upper_2
};
//Look-up table for Gemmt Lower Variant Kernels
gemmt_ker_ft ker_fpls_zen4[3] =
{
bli_dgemmsup_rv_zen4_asm_24x8m_lower_0,
bli_dgemmsup_rv_zen4_asm_24x8m_lower_1,
bli_dgemmsup_rv_zen4_asm_24x8m_lower_2
bli_dgemmsup_cv_zen4_asm_24x8m_lower_0,
bli_dgemmsup_cv_zen4_asm_24x8m_lower_1,
bli_dgemmsup_cv_zen4_asm_24x8m_lower_2
};
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -58,7 +58,7 @@ void bli_trsm_front
#if 0
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl );
gint_t status = bli_trsm_small_zen( side, alpha, a, b, cntx, cntl );
if ( status == BLIS_SUCCESS ) return;
#endif
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@ void bli_trsm_front
);
#ifdef BLIS_ENABLE_SMALL_MATRIX
err_t bli_trsm_small
err_t bli_trsm_small_zen
(
side_t side,
obj_t* alpha,

View File

@@ -319,7 +319,7 @@ f77_int idamax_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
amaxv_fun_ptr = bli_damaxv_zen_int_avx512;
amaxv_fun_ptr = bli_damaxv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:

View File

@@ -215,14 +215,14 @@ void saxpy_blis_impl
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
axpyv_ker_ptr = bli_saxpyv_zen_int_avx512;
axpyv_ker_ptr = bli_saxpyv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
axpyv_ker_ptr = bli_saxpyv_zen_int10;
axpyv_ker_ptr = bli_saxpyv_zen_int_10;
break;
default:
@@ -354,7 +354,7 @@ void daxpy_blis_impl
{
case BLIS_ARCH_ZEN5:
#if defined(BLIS_KERNELS_ZEN4)
axpyv_ker_ptr = bli_daxpyv_zen_int_avx512;
axpyv_ker_ptr = bli_daxpyv_zen4_int;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 34000;
#endif
@@ -362,7 +362,7 @@ void daxpy_blis_impl
#endif
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
axpyv_ker_ptr = bli_daxpyv_zen_int_avx512;
axpyv_ker_ptr = bli_daxpyv_zen4_int;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 11000;
#endif
@@ -373,7 +373,7 @@ void daxpy_blis_impl
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
axpyv_ker_ptr = bli_daxpyv_zen_int10;
axpyv_ker_ptr = bli_daxpyv_zen_int_10;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 4000;
#endif
@@ -590,7 +590,7 @@ void caxpy_blis_impl
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
bli_caxpyv_zen_int5
bli_caxpyv_zen_int_5
(
BLIS_NO_CONJUGATE,
n_elem,
@@ -722,7 +722,7 @@ void zaxpy_blis_impl
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
axpyv_ker_ptr = bli_zaxpyv_zen_int_avx512;
axpyv_ker_ptr = bli_zaxpyv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:
@@ -730,7 +730,7 @@ void zaxpy_blis_impl
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
axpyv_ker_ptr = bli_zaxpyv_zen_int5;
axpyv_ker_ptr = bli_zaxpyv_zen_int_5;
break;
default:

View File

@@ -181,7 +181,7 @@ void scopy_blis_impl
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
copyv_ker_ptr = bli_scopyv_zen4_asm_avx512;
copyv_ker_ptr = bli_scopyv_zen4_asm;
break;
#endif
case BLIS_ARCH_ZEN:
@@ -311,7 +311,7 @@ void dcopy_blis_impl
case BLIS_ARCH_ZEN5:
#if defined(BLIS_KERNELS_ZEN5)
// For Zen4 and Zen5, kernel implemented in AVX512 is used
copyv_ker_ptr = bli_dcopyv_zen5_asm_avx512;
copyv_ker_ptr = bli_dcopyv_zen5_asm;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 43000;
#endif
@@ -320,7 +320,7 @@ void dcopy_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// For Zen4 and Zen5, kernel implemented in AVX512 is used
copyv_ker_ptr = bli_dcopyv_zen4_asm_avx512;
copyv_ker_ptr = bli_dcopyv_zen4_asm;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 3300;
#endif
@@ -558,7 +558,7 @@ void zcopy_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// For Zen4 and Zen5 architecture, kernel implemented in AVX512 is used
copyv_ker_ptr = bli_zcopyv_zen4_asm_avx512;
copyv_ker_ptr = bli_zcopyv_zen4_asm;
break;
#endif
case BLIS_ARCH_ZEN:

View File

@@ -206,7 +206,7 @@ float sdot_blis_impl
#if defined(BLIS_KERNELS_ZEN4)
// AVX-512 Kernel
dotv_ker_ptr = bli_sdotv_zen_int_avx512;
dotv_ker_ptr = bli_sdotv_zen4_int;
break;
#endif
@@ -215,7 +215,7 @@ float sdot_blis_impl
case BLIS_ARCH_ZEN3:
// AVX-2 Kernel
dotv_ker_ptr = bli_sdotv_zen_int10;
dotv_ker_ptr = bli_sdotv_zen_int_10;
break;
default:
@@ -347,7 +347,7 @@ double ddot_blis_impl
#if defined(BLIS_KERNELS_ZEN5)
// AVX-512 Kernel
dotv_ker_ptr = bli_ddotv_zen_int_avx512;
dotv_ker_ptr = bli_ddotv_zen4_int;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 6600;
#endif
@@ -358,7 +358,7 @@ double ddot_blis_impl
#if defined(BLIS_KERNELS_ZEN4)
// AVX-512 Kernel
dotv_ker_ptr = bli_ddotv_zen_int_avx512;
dotv_ker_ptr = bli_ddotv_zen4_int;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 5600;
#endif
@@ -370,7 +370,7 @@ double ddot_blis_impl
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
dotv_ker_ptr = bli_ddotv_zen_int10;
dotv_ker_ptr = bli_ddotv_zen_int_10;
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 2500;
#endif
@@ -686,7 +686,7 @@ scomplex cdotu_blis_impl
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
bli_cdotv_zen_int_5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
@@ -807,14 +807,14 @@ dcomplex zdotu_blis_impl
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
zdotv_ker_ptr = bli_zdotv_zen_int_avx512;
zdotv_ker_ptr = bli_zdotv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN3:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN:
zdotv_ker_ptr = bli_zdotv_zen_int5;
zdotv_ker_ptr = bli_zdotv_zen_int_5;
break;
default:
@@ -1097,7 +1097,7 @@ scomplex cdotc_blis_impl
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
/* Call BLIS kernel. */
bli_cdotv_zen_int5
bli_cdotv_zen_int_5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
@@ -1220,15 +1220,15 @@ dcomplex zdotc_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// Currently only the AVX512 intrinsic kernel is enabled.
zdotv_ker_ptr = bli_zdotv_zen_int_avx512;
// zdotv_ker_ptr = bli_zdotv_zen4_asm_avx512;
zdotv_ker_ptr = bli_zdotv_zen4_int;
// zdotv_ker_ptr = bli_zdotv_zen4_asm;
break;
#endif
case BLIS_ARCH_ZEN3:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN:
zdotv_ker_ptr = bli_zdotv_zen_int5;
zdotv_ker_ptr = bli_zdotv_zen_int_5;
break;
default:

View File

@@ -689,7 +689,7 @@ void dgemm_blis_impl
if ( arch_id == BLIS_ARCH_ZEN || arch_id == BLIS_ARCH_ZEN2 ||
arch_id == BLIS_ARCH_ZEN3 )
{
k1_status = bli_dgemm_8x6_avx2_k1_nn
k1_status = bli_dgemm_zen_int_8x6_k1_nn
(
m0, n0, k0,
(double*)alpha,
@@ -702,7 +702,7 @@ void dgemm_blis_impl
#if defined(BLIS_FAMILY_ZEN5) || defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
else if ( arch_id == BLIS_ARCH_ZEN5 || arch_id == BLIS_ARCH_ZEN4 )
{
k1_status = bli_dgemm_24x8_avx512_k1_nn
k1_status = bli_dgemm_zen4_int_24x8_k1_nn
(
m0, n0, k0,
(double*)alpha,
@@ -1179,7 +1179,7 @@ void zgemm_blis_impl
if ( arch_id == BLIS_ARCH_ZEN || arch_id == BLIS_ARCH_ZEN2 ||
arch_id == BLIS_ARCH_ZEN3 )
{
k1_status = bli_zgemm_4x4_avx2_k1_nn
k1_status = bli_zgemm_zen_int_4x4_k1_nn
(
m0, n0, k0,
(dcomplex*)alpha,
@@ -1197,7 +1197,7 @@ void zgemm_blis_impl
// This holds true irrespective of the broadcast direction( n0 )
if( m0 < 30 )
{
k1_status = bli_zgemm_4x4_avx2_k1_nn
k1_status = bli_zgemm_zen_int_4x4_k1_nn
(
m0, n0, k0,
(dcomplex*)alpha,
@@ -1209,7 +1209,7 @@ void zgemm_blis_impl
}
else
{
k1_status = bli_zgemm_16x4_avx512_k1_nn
k1_status = bli_zgemm_zen4_int_16x4_k1_nn
(
m0, n0, k0,
(dcomplex*)alpha,
@@ -1226,7 +1226,7 @@ void zgemm_blis_impl
// ( i.e, small or tiny sizes ), or if the load directon( m0 ) < 10
if( ( m0 < 30 && n0 < 30 ) || m0 < 10 )
{
k1_status = bli_zgemm_4x4_avx2_k1_nn
k1_status = bli_zgemm_zen_int_4x4_k1_nn
(
m0, n0, k0,
(dcomplex*)alpha,
@@ -1238,7 +1238,7 @@ void zgemm_blis_impl
}
else
{
k1_status = bli_zgemm_16x4_avx512_k1_nn
k1_status = bli_zgemm_zen4_int_16x4_k1_nn
(
m0, n0, k0,
(dcomplex*)alpha,
@@ -1740,7 +1740,7 @@ void cgemm_blis_impl
if ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) )
{
bli_cgemm_32x4_avx512_k1_nn
bli_cgemm_zen4_int_32x4_k1_nn
(
m0, n0, k0,
(scomplex*)alpha,

View File

@@ -739,7 +739,7 @@ void cgemv_blis_impl
scomplex rho;
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
bli_cdotv_zen_int5
bli_cdotv_zen_int_5
(
conja,
BLIS_NO_CONJUGATE,
@@ -987,7 +987,7 @@ void zgemv_blis_impl
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
bli_zdotv_zen_int5
bli_zdotv_zen_int_5
(
conja,
BLIS_NO_CONJUGATE,

View File

@@ -165,13 +165,13 @@ void sscal_blis_impl
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
scalv_ker_ptr = bli_sscalv_zen_int_avx512;
scalv_ker_ptr = bli_sscalv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
scalv_ker_ptr = bli_sscalv_zen_int10;
scalv_ker_ptr = bli_sscalv_zen_int_10;
break;
default:
@@ -257,7 +257,7 @@ void dscal_blis_impl
case BLIS_ARCH_ZEN5:
#if defined(BLIS_KERNELS_ZEN5)
// AVX512 Kernel
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
scalv_ker_ptr = bli_dscalv_zen4_int;
#ifdef BLIS_ENABLE_OPENMP
ST_THRESH = 63894;
#endif
@@ -266,7 +266,7 @@ void dscal_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
scalv_ker_ptr = bli_dscalv_zen4_int;
#ifdef BLIS_ENABLE_OPENMP
ST_THRESH = 27500;
#endif
@@ -277,7 +277,7 @@ void dscal_blis_impl
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
scalv_ker_ptr = bli_dscalv_zen_int10;
scalv_ker_ptr = bli_dscalv_zen_int_10;
#ifdef BLIS_ENABLE_OPENMP
ST_THRESH = 30000;
#endif
@@ -459,7 +459,7 @@ void zdscal_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
scalv_ker_ptr = bli_zdscalv_zen_int_avx512;
scalv_ker_ptr = bli_zdscalv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:
@@ -467,7 +467,7 @@ void zdscal_blis_impl
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
scalv_ker_ptr = bli_zdscalv_zen_int10;
scalv_ker_ptr = bli_zdscalv_zen_int_10;
break;
default:
@@ -627,7 +627,7 @@ void cscal_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
scalv_ker_ptr = bli_cscalv_zen_int_avx512;
scalv_ker_ptr = bli_cscalv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:
@@ -719,7 +719,7 @@ void zscal_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
scalv_ker_ptr = bli_zscalv_zen_int_avx512;
scalv_ker_ptr = bli_zscalv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:

View File

@@ -163,7 +163,7 @@ void sswap_blis_impl
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
/* Call BLIS kernel */
bli_sswapv_zen_int8
bli_sswapv_zen_int_8
(
n0,
x0, incx0,
@@ -264,7 +264,7 @@ void dswap_blis_impl
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
bli_dswapv_zen_int8
bli_dswapv_zen_int_8
(
n0,
x0, incx0,

View File

@@ -776,7 +776,7 @@ void strsm_blis_impl
(is_parallel && (m0+n0)<320))
{
err_t small_status;
small_status = bli_trsm_small
small_status = bli_trsm_small_zen
(
blis_side,
&alphao,
@@ -1181,22 +1181,22 @@ void dtrsm_blis_impl
{
if ( m0 <= 120 )
{
ker_ft = bli_trsm_small_AVX512;
ker_ft = bli_trsm_small_zen4;
}
else if ( (log10(n0) + (0.65*log10(m0)) ) < 4.4 )
{
ker_ft = bli_trsm_small_ZEN5;
ker_ft = bli_trsm_small_zen5;
}
}
else //if ( blis_side == BLIS_RIGHT )
{
if ( (log10(m0) + (3.2*log10(n0)) ) < 7 )
{
ker_ft = bli_trsm_small_AVX512;
ker_ft = bli_trsm_small_zen4;
}
else if ( (log10(m0) + (0.85*log10(n0)) ) < 5 )
{
ker_ft = bli_trsm_small_ZEN5;
ker_ft = bli_trsm_small_zen5;
}
}
break;
@@ -1210,11 +1210,11 @@ void dtrsm_blis_impl
except for sizes where n is multiple of 8.*/
if (((n0 % 8 == 0) && (n0 < 50)) || ((m0 > 50) && (n0 > 50)))
{
ker_ft = bli_trsm_small_AVX512;
ker_ft = bli_trsm_small_zen4;
}
else
{
ker_ft = bli_trsm_small;
ker_ft = bli_trsm_small_zen;
}
}
break;
@@ -1226,7 +1226,7 @@ void dtrsm_blis_impl
if ((!is_parallel && ((dim_a < 1500) && (size_b < 5e6)) ) ||
(is_parallel && (m0+n0)<200))
{
ker_ft = bli_trsm_small;
ker_ft = bli_trsm_small_zen;
}
break;
}
@@ -1242,7 +1242,7 @@ void dtrsm_blis_impl
{
if ( n0 < 4300 )
{
ker_ft = bli_trsm_small_mt_ZEN5;
ker_ft = bli_trsm_small_zen5_mt;
}
else
{
@@ -1253,7 +1253,7 @@ void dtrsm_blis_impl
{
if ( (n0 < 1812 || m0 < 3220) && (m0 < 14000) )
{
ker_ft = bli_trsm_small_mt_ZEN5;
ker_ft = bli_trsm_small_zen5_mt;
}
else
{
@@ -1268,7 +1268,7 @@ void dtrsm_blis_impl
if( (ker_ft == NULL) && (is_parallel) &&
((dim_a < 2500) && (size_b < 5e6)) )
{
ker_ft = bli_trsm_small_mt_AVX512;
ker_ft = bli_trsm_small_zen4_mt;
}
break;
#endif// BLIS_KERNELS_ZEN4
@@ -1279,7 +1279,7 @@ void dtrsm_blis_impl
if( (ker_ft == NULL) && (is_parallel) &&
((dim_a < 2500) && (size_b < 5e6)) )
{
ker_ft = bli_trsm_small_mt;
ker_ft = bli_trsm_small_zen_mt;
}
break;
}
@@ -1723,7 +1723,7 @@ void ztrsm_blis_impl
{
if (!bli_obj_has_conj(&ao)) // if transa == 'C', go to native code path
{
ker_ft = bli_trsm_small_mt_ZEN5; // 12x4 non fused kernel for ZEN5
ker_ft = bli_trsm_small_zen5_mt; // 12x4 non fused kernel for ZEN5
}
}
break;
@@ -1735,7 +1735,7 @@ void ztrsm_blis_impl
{
if (!bli_obj_has_conj(&ao))
{
ker_ft = bli_trsm_small_mt_AVX512; // 4x4 fused kernel for ZEN4
ker_ft = bli_trsm_small_zen4_mt; // 4x4 fused kernel for ZEN4
}
else
{
@@ -1744,7 +1744,7 @@ void ztrsm_blis_impl
// better accuracy in large sizes
if (dim_a <= 500)
#endif
ker_ft = bli_trsm_small_mt;
ker_ft = bli_trsm_small_zen_mt;
}
}
break;
@@ -1772,22 +1772,22 @@ void ztrsm_blis_impl
{
if ( m0 <= 88 )
{
ker_ft = bli_trsm_small_AVX512;
ker_ft = bli_trsm_small_zen4;
}
else if ( (log10(n0) + (0.15*log10(m0)) ) < 2.924 )
{
ker_ft = bli_trsm_small_ZEN5;
ker_ft = bli_trsm_small_zen5;
}
}
else //if ( blis_side == BLIS_RIGHT )
{
if ( (log10(m0) + (2.8*log10(n0)) ) < 6 )
{
ker_ft = bli_trsm_small_AVX512;
ker_ft = bli_trsm_small_zen4;
}
else if ( (log10(m0) + (1.058*log10(n0)) ) < 5.373 )
{
ker_ft = bli_trsm_small_ZEN5;
ker_ft = bli_trsm_small_zen5;
}
}
break;
@@ -1800,7 +1800,7 @@ void ztrsm_blis_impl
// conjugate
if (!bli_obj_has_conj(&ao))
{
ker_ft = bli_trsm_small_AVX512;
ker_ft = bli_trsm_small_zen4;
}
else
{
@@ -1809,7 +1809,7 @@ void ztrsm_blis_impl
// better accuracy in large sizes
if (dim_a <= 500)
#endif
ker_ft = bli_trsm_small;
ker_ft = bli_trsm_small_zen;
}
}
break;
@@ -1823,7 +1823,7 @@ void ztrsm_blis_impl
// better accuracy in large sizes
if (dim_a <= 500)
#endif
ker_ft = bli_trsm_small;
ker_ft = bli_trsm_small_zen;
break;
}
}
@@ -2229,7 +2229,7 @@ void ctrsm_blis_impl
(is_parallel && (m0+n0)<320))
{
err_t small_status;
small_status = bli_trsm_small
small_status = bli_trsm_small_zen
(
blis_side,
&alphao,

View File

@@ -192,7 +192,7 @@ void saxpby_blis_impl
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
axpbyv_ker_ptr = bli_saxpbyv_zen_int10;
axpbyv_ker_ptr = bli_saxpbyv_zen_int_10;
break;
default:
@@ -324,14 +324,14 @@ void daxpby_blis_impl
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
axpbyv_ker_ptr = bli_daxpbyv_zen_int_avx512;
axpbyv_ker_ptr = bli_daxpbyv_zen4_int;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
axpbyv_ker_ptr = bli_daxpbyv_zen_int10;
axpbyv_ker_ptr = bli_daxpbyv_zen_int_10;
break;
default:

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -353,7 +353,7 @@ void bli_cnormfv_unb_var1
size_t buffer_size = n * sizeof( scomplex );
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" );
printf( "bli_scnorm2fv_zen_int_unb_var1(): get mem pool block\n" );
#endif
// Acquire a Buffer(n*size(scomplex)) from the memory broker
@@ -378,12 +378,12 @@ void bli_cnormfv_unb_var1
incx_buf = 1;
}
bli_scnorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
bli_scnorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
if ( bli_mem_is_alloc( &mem_buf_X ) )
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
printf( "bli_scnorm2fv_zen_int_unb_var1(): releasing mem pool block\n" );
#endif
// Return the buffer to pool.
bli_pba_release( &rntm_l , &mem_buf_X );
@@ -392,7 +392,7 @@ void bli_cnormfv_unb_var1
else
{
// Call the kernel with the unit-strided vector x
bli_scnorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
bli_scnorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
}
break;
@@ -470,8 +470,8 @@ void bli_znormfv_unb_var1
case BLIS_ARCH_ZEN:
#ifdef BLIS_KERNELS_ZEN
norm_fp = bli_dznorm2fv_unb_var1_avx2;
reduce_fp = bli_dnorm2fv_unb_var1_avx2;
norm_fp = bli_dznorm2fv_zen_int_unb_var1;
reduce_fp = bli_dnorm2fv_zen_int_unb_var1;
fast_path_thresh = 2000;
#ifdef BLIS_ENABLE_OPENMP
@@ -947,7 +947,7 @@ void bli_snormfv_unb_var1
size_t buffer_size = n * sizeof( float );
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" );
printf( "bli_snorm2fv_zen_int_unb_var1(): get mem pool block\n" );
#endif
// Acquire a Buffer(n*size(float)) from the memory broker
@@ -972,12 +972,12 @@ void bli_snormfv_unb_var1
incx_buf = 1;
}
bli_snorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
bli_snorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
if ( bli_mem_is_alloc( &mem_buf_X ) )
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
printf( "bli_snorm2fv_zen_int_unb_var1(): releasing mem pool block\n" );
#endif
// Return the buffer to pool.
bli_pba_release( &rntm_l , &mem_buf_X );
@@ -986,7 +986,7 @@ void bli_snormfv_unb_var1
else
{
// Call the kernel with the unit-strided vector x
bli_snorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
bli_snorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
}
break;
@@ -1065,9 +1065,9 @@ void bli_dnormfv_unb_var1
#if defined(BLIS_KERNELS_ZEN4)
if( n <= 30 )
norm_fp = bli_dnorm2fv_unb_var1_avx2;
norm_fp = bli_dnorm2fv_zen_int_unb_var1;
else
norm_fp = bli_dnorm2fv_unb_var1_avx512;
norm_fp = bli_dnorm2fv_zen4_int_unb_var1;
#ifdef __clang__
fast_path_thresh = 6000;
@@ -1085,9 +1085,9 @@ void bli_dnormfv_unb_var1
#if defined(BLIS_KERNELS_ZEN4)
if( n <= 250 )
norm_fp = bli_dnorm2fv_unb_var1_avx2;
norm_fp = bli_dnorm2fv_zen_int_unb_var1;
else
norm_fp = bli_dnorm2fv_unb_var1_avx512;
norm_fp = bli_dnorm2fv_zen4_int_unb_var1;
fast_path_thresh = 4000;
@@ -1102,7 +1102,7 @@ void bli_dnormfv_unb_var1
case BLIS_ARCH_ZEN:
#ifdef BLIS_KERNELS_ZEN
norm_fp = bli_dnorm2fv_unb_var1_avx2;
norm_fp = bli_dnorm2fv_zen_int_unb_var1;
fast_path_thresh = 4000;
#ifdef BLIS_ENABLE_OPENMP

View File

@@ -328,14 +328,14 @@ set(CAN_TEST_INFO_VALUE ON)
# The following part will be used to set up macros that relate to the version
# of BLIS library being tested.
if(ENABLE_THREADING STREQUAL "openmp")
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/get_version.cpp
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/get_version.cpp
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} OpenMP::OpenMP_CXX ${ASAN_FLAGS} ${COVERAGE_FLAGS}
RUN_OUTPUT_VARIABLE TEST_BLIS_VERSION
COMPILE_OUTPUT_VARIABLE COMP_VAR_VERSION
)
else()
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/get_version.cpp
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/get_version.cpp
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} ${ASAN_FLAGS} ${COVERAGE_FLAGS}
RUN_OUTPUT_VARIABLE TEST_BLIS_VERSION
@@ -387,14 +387,14 @@ endif()
# This way, kernel tests won't be compiled/run for shared versions of BLIS.
if(BLIS_LINKING_TYPE STREQUAL "static")
if(ENABLE_THREADING STREQUAL "openmp")
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/config_ukr_tests.cpp
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/config_ukr_tests.cpp
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} OpenMP::OpenMP_CXX ${ASAN_FLAGS} ${COVERAGE_FLAGS}
RUN_OUTPUT_VARIABLE UKR_CONFIG
COMPILE_OUTPUT_VARIABLE COMP_VAR
)
else()
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/config_ukr_tests.cpp
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/config_ukr_tests.cpp
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} ${ASAN_FLAGS} ${COVERAGE_FLAGS}
RUN_OUTPUT_VARIABLE UKR_CONFIG

View File

@@ -39,41 +39,150 @@
*/
#ifdef AOCL_DEV
#define K_zen4_int_40x2_mt zen4_int_40x2_mt
#define K_zen4_int_40x8_mt zen4_int_40x8_mt
#define K_zen4_int_40x2_st zen4_int_40x2_st
#define K_zen4_int_40x8_st zen4_int_40x8_st
#define K_bli_zdotv_zen_int_5 bli_zdotv_zen_int_5
#define K_bli_cdotv_zen_int_5 bli_cdotv_zen_int_5
#define K_bli_zaxpyv_zen_int_5 bli_zaxpyv_zen_int_5
#define K_bli_caxpyv_zen_int_5 bli_caxpyv_zen_int_5
#define K_bli_sswapv_zen_int_8 bli_sswapv_zen_int_8
#define K_bli_dswapv_zen_int_8 bli_dswapv_zen_int_8
#define K_bli_zdscalv_zen_int_10 bli_zdscalv_zen_int_10
#define K_bli_sscalv_zen_int_10 bli_sscalv_zen_int_10
#define K_bli_dscalv_zen_int_10 bli_dscalv_zen_int_10
#define K_bli_sdotv_zen_int_10 bli_sdotv_zen_int_10
#define K_bli_ddotv_zen_int_10 bli_ddotv_zen_int_10
#define K_bli_saxpyv_zen_int_10 bli_saxpyv_zen_int_10
#define K_bli_daxpyv_zen_int_10 bli_daxpyv_zen_int_10
#define K_bli_saxpbyv_zen_int_10 bli_saxpbyv_zen_int_10
#define K_bli_daxpbyv_zen_int_10 bli_daxpbyv_zen_int_10
#define K_bli_dgemmsup_cv_zen5_asm_24x8m bli_dgemmsup_cv_zen5_asm_24x8m
#define K_bli_dgemmsup_cv_zen4_asm_24x8m bli_dgemmsup_cv_zen4_asm_24x8m
#define K_bli_dgemmsup_cv_zen4_asm_24x8m_new bli_dgemmsup_cv_zen4_asm_24x8m_new
#define K_bli_dgemm_tiny_zen4_24x8 bli_dgemm_tiny_zen4_24x8
#define K_bli_dgemm_tiny_zen_6x8 bli_dgemm_tiny_zen_6x8
#define K_bli_zaxpyf_zen4_int_8 bli_zaxpyf_zen4_int_8
#define K_bli_daxpyf_zen4_int bli_daxpyf_zen4_int
#define K_bli_ddotxf_zen4_int bli_ddotxf_zen4_int
#define K_bli_dgemm_zen4_asm_8x24 bli_dgemm_zen4_asm_8x24
#define K_bli_ztrsm_small_zen_int_pack bli_ztrsm_small_zen_int_pack
#define K_bli_ctrsm_small_zen_int_pack bli_ctrsm_small_zen_int_pack
#define K_bli_strsm_small_zen_int_pack bli_strsm_small_zen_int_pack
#define K_bli_dtrsm_small_zen_int_pack bli_dtrsm_small_zen_int_pack
#define K_bli_ztrsm_small_zen5 bli_ztrsm_small_zen5
#define K_bli_dtrsm_small_zen4_int_pack bli_dtrsm_small_zen4_int_pack
#define K_bli_trsm_small_ref bli_trsm_small_ref
#define K_bli_trsm_small_zen bli_trsm_small_zen
#define K_bli_trsm_small_zen bli_trsm_small_zen
#define K_bli_trsm_small_zen5_mt bli_trsm_small_zen5_mt
#define K_bli_trsm_small_zen5 bli_trsm_small_zen5
#define K_bli_trsm_small_zen4_mt bli_trsm_small_zen4_mt
#define K_bli_trsm_small_zen4 bli_trsm_small_zen4
#define K_bli_zsetv_zen4_int bli_zsetv_zen4_int
#define K_bli_dsetv_zen4_int bli_dsetv_zen4_int
#define K_bli_ssetv_zen4_int bli_ssetv_zen4_int
#define K_bli_dgemv_n_zen4_int_32x8_st bli_dgemv_n_zen4_int_32x8_st
#define K_scalv_zen4_int scalv_zen4_int
#define K_scalv_zen4_int scalv_zen4_int
#define K_bli_zscalv_zen4_int bli_zscalv_zen4_int
#define K_bli_cscalv_zen4_int bli_cscalv_zen4_int
#define K_bli_zdscalv_zen4_int bli_zdscalv_zen4_int
#define K_bli_dscalv_zen4_int bli_dscalv_zen4_int
#define K_bli_sscalv_zen4_int bli_sscalv_zen4_int
#define K_bli_dscal2v_zen4_int bli_dscal2v_zen4_int
#define K_bli_zdotxv_zen4_int bli_zdotxv_zen4_int
#define K_bli_zdotv_zen4_asm bli_zdotv_zen4_asm
#define K_bli_zdotv_zen4_int bli_zdotv_zen4_int
#define K_bli_ddotv_zen4_int bli_ddotv_zen4_int
#define K_bli_sdotv_zen4_int bli_sdotv_zen4_int
#define K_bli_dcopyv_zen5_asm bli_dcopyv_zen5_asm
#define K_bli_zcopyv_zen4_int bli_zcopyv_zen4_int
#define K_bli_dcopyv_zen4_int bli_dcopyv_zen4_int
#define K_bli_scopyv_zen4_int bli_scopyv_zen4_int
#define K_bli_zcopyv_zen4_asm bli_zcopyv_zen4_asm
#define K_bli_dcopyv_zen4_asm bli_dcopyv_zen4_asm
#define K_bli_scopyv_zen4_asm bli_scopyv_zen4_asm
#define K_bli_zaxpyv_zen4_int bli_zaxpyv_zen4_int
#define K_bli_daxpyv_zen4_int bli_daxpyv_zen4_int
#define K_bli_saxpyv_zen4_int bli_saxpyv_zen4_int
#define K_bli_daxpbyv_zen4_int bli_daxpbyv_zen4_int
#define K_bli_damaxv_zen4_int bli_damaxv_zen4_int
#define K_bli_samaxv_zen4_int bli_samaxv_zen4_int
#define K_bli_daddv_zen4_int bli_daddv_zen4_int
#define K_bli_dnorm2fv_zen4_int_unb_var1 bli_dnorm2fv_zen4_int_unb_var1
#define K_bli_snorm2fv_zen_int_unb_var1 bli_snorm2fv_zen_int_unb_var1
#define K_bli_scnorm2fv_zen_int_unb_var1 bli_scnorm2fv_zen_int_unb_var1
#define K_bli_dznorm2fv_zen_int_unb_var1 bli_dznorm2fv_zen_int_unb_var1
#define K_bli_dnorm2fv_zen_int_unb_var1 bli_dnorm2fv_zen_int_unb_var1
#define K_bli_sgemmsup_rd_zen4_asm_6x64n bli_sgemmsup_rd_zen4_asm_6x64n
#define K_bli_sgemmsup_rd_zen4_asm_6x64m bli_sgemmsup_rd_zen4_asm_6x64m
#define K_bli_sgemmsup_rv_zen4_asm_6x64n bli_sgemmsup_rv_zen4_asm_6x64n
#define K_bli_sgemmsup_rv_zen4_asm_6x64m bli_sgemmsup_rv_zen4_asm_6x64m
#define K_bli_sgemmsup_rv_zen4_asm_6x64n bli_sgemmsup_rv_zen4_asm_6x64n
#define K_bli_sgemmsup_rv_zen4_asm_6x64m bli_sgemmsup_rv_zen4_asm_6x64m
#define K_bli_dgemmtrsm_u_zen4_asm_16x14 bli_dgemmtrsm_u_zen4_asm_16x14
#define K_bli_dgemmtrsm_l_zen4_asm_16x14 bli_dgemmtrsm_l_zen4_asm_16x14
#define K_bli_dgemv_n_zen bli_dgemv_n_zen
#define K_bli_dgemv_t_zen_int_16x1m bli_dgemv_t_zen_int_16x1m
#define K_bli_dgemv_t_zen_int_16x2m bli_dgemv_t_zen_int_16x2m
#define K_bli_dgemv_t_zen_int_16x3m bli_dgemv_t_zen_int_16x3m
#define K_bli_dgemv_t_zen_int_16x4m bli_dgemv_t_zen_int_16x4m
#define K_bli_dgemv_t_zen_int_16x5m bli_dgemv_t_zen_int_16x5m
#define K_bli_dgemv_t_zen_int_16x6m bli_dgemv_t_zen_int_16x6m
#define K_bli_dgemv_t_zen_int_16x7m bli_dgemv_t_zen_int_16x7m
#define K_bli_dgemv_t_zen_int bli_dgemv_t_zen_int
#define K_bli_dgemv_t_zen4_int_32x1m bli_dgemv_t_zen4_int_32x1m
#define K_bli_dgemv_t_zen4_int_32x2m bli_dgemv_t_zen4_int_32x2m
#define K_bli_dgemv_t_zen4_int_32x3m bli_dgemv_t_zen4_int_32x3m
#define K_bli_dgemv_t_zen4_int_32x4m bli_dgemv_t_zen4_int_32x4m
#define K_bli_dgemv_t_zen4_int_32x5m bli_dgemv_t_zen4_int_32x5m
#define K_bli_dgemv_t_zen4_int_32x6m bli_dgemv_t_zen4_int_32x6m
#define K_bli_dgemv_t_zen4_int_32x7m bli_dgemv_t_zen4_int_32x7m
#define K_bli_dgemv_t_zen4_int bli_dgemv_t_zen4_int
#define K_bli_dgemv_n_zen4_int_m_leftx1n bli_dgemv_n_zen4_int_m_leftx1n
#define K_bli_dgemv_n_zen4_int_8x1n bli_dgemv_n_zen4_int_8x1n
#define K_bli_dgemv_n_zen4_int_16x1n bli_dgemv_n_zen4_int_16x1n
#define K_bli_dgemv_n_zen4_int_32x1n bli_dgemv_n_zen4_int_32x1n
#define K_bli_dgemv_n_zen4_int_m_leftx2n bli_dgemv_n_zen4_int_m_leftx2n
#define K_bli_dgemv_n_zen4_int_8x2n bli_dgemv_n_zen4_int_8x2n
#define K_bli_dgemv_n_zen4_int_16x2n bli_dgemv_n_zen4_int_16x2n
#define K_bli_dgemv_n_zen4_int_32x2n bli_dgemv_n_zen4_int_32x2n
#define K_bli_dgemv_n_zen4_int_m_leftx3n bli_dgemv_n_zen4_int_m_leftx3n
#define K_bli_dgemv_n_zen4_int_8x3n bli_dgemv_n_zen4_int_8x3n
#define K_bli_dgemv_n_zen4_int_16x3n bli_dgemv_n_zen4_int_16x3n
#define K_bli_dgemv_n_zen4_int_32x3n bli_dgemv_n_zen4_int_32x3n
#define K_bli_dgemv_n_zen4_int_m_leftx4n bli_dgemv_n_zen4_int_m_leftx4n
#define K_bli_dgemv_n_zen4_int_8x4n bli_dgemv_n_zen4_int_8x4n
#define K_bli_dgemv_n_zen4_int_16x4n bli_dgemv_n_zen4_int_16x4n
#define K_bli_dgemv_n_zen4_int_32x4n bli_dgemv_n_zen4_int_32x4n
#define K_bli_dgemv_n_zen4_int_m_leftx8n bli_dgemv_n_zen4_int_m_leftx8n
#define K_bli_dgemv_n_zen4_int_8x8n bli_dgemv_n_zen4_int_8x8n
#define K_bli_dgemv_n_zen4_int_16x8n bli_dgemv_n_zen4_int_16x8n
#define K_bli_dgemv_n_zen4_int_32x8n bli_dgemv_n_zen4_int_32x8n
#define K_bli_dgemv_n_zen4_int_16mx1 bli_dgemv_n_zen4_int_16mx1
#define K_bli_dgemv_n_zen4_int_16mx2 bli_dgemv_n_zen4_int_16mx2
#define K_bli_dgemv_n_zen4_int_16mx3 bli_dgemv_n_zen4_int_16mx3
#define K_bli_dgemv_n_zen4_int_16mx4 bli_dgemv_n_zen4_int_16mx4
#define K_bli_dgemv_n_zen4_int_16mx5 bli_dgemv_n_zen4_int_16mx5
#define K_bli_dgemv_n_zen4_int_16mx6 bli_dgemv_n_zen4_int_16mx6
#define K_bli_dgemv_n_zen4_int_16mx7 bli_dgemv_n_zen4_int_16mx7
#define K_bli_dgemv_n_zen4_int_16mx8 bli_dgemv_n_zen4_int_16mx8
#define K_bli_dgemv_n_zen4_int bli_dgemv_n_zen4_int
#define K_bli_cgemm_zen4_int_32x4_k1_nn bli_cgemm_zen4_int_32x4_k1_nn
#define K_bli_zgemm_zen4_int_16x4_k1_nn bli_zgemm_zen4_int_16x4_k1_nn
#define K_bli_dgemm_zen4_int_24x8_k1_nn bli_dgemm_zen4_int_24x8_k1_nn
#define K_bli_zgemm_zen_int_4x4_k1_nn bli_zgemm_zen_int_4x4_k1_nn
#define K_bli_dgemm_zen_int_8x6_k1_nn bli_dgemm_zen_int_8x6_k1_nn
#define AOCL_51
#define K_bli_dgemv_n_zen4_int 1
#define K_bli_dgemv_n_zen4_40x2_int_st 1
#define K_bli_dgemv_n_zen4_40x2_int_mt 1
#define K_bli_dgemv_m_zen4_40x8_int_st 1
#define K_bli_dgemv_m_zen4_40x8_int_mt_Ndiv 1
#define K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv 1
#define K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv 1
#endif
#ifdef AOCL_51
#define K_bli_zgemmsup_cv_zen4_asm_fx1 1
#define K_bli_zgemmsup_cv_zen4_asm_fx2 1
#define K_bli_zgemmsup_cv_zen4_asm_fx3 1
#define K_bli_zgemmsup_cv_zen4_asm_fx4 1
#define K_bli_cgemm_32x4_avx512_k1_nn 1
#define K_bli_cgemmsup_cv_zen4_asm_24x4m 1
#define K_bli_cgemmsup_cv_zen4_asm_24x3m 1
#define K_bli_cgemmsup_cv_zen4_asm_24x2m 1
#define K_bli_cgemmsup_cv_zen4_asm_24x1m 1
#define K_bli_cgemmsup_cv_zen4_asm_16x4 1
#define K_bli_cgemmsup_cv_zen4_asm_16x3 1
#define K_bli_cgemmsup_cv_zen4_asm_16x2 1
#define K_bli_cgemmsup_cv_zen4_asm_16x1 1
#define K_bli_cgemmsup_cv_zen4_asm_8x4 1
#define K_bli_cgemmsup_cv_zen4_asm_8x3 1
#define K_bli_cgemmsup_cv_zen4_asm_8x2 1
#define K_bli_cgemmsup_cv_zen4_asm_8x1 1
#define K_bli_cgemmsup_cv_zen4_asm_fx4 1
#define K_bli_cgemmsup_cv_zen4_asm_fx3 1
#define K_bli_cgemmsup_cv_zen4_asm_fx2 1
#define K_bli_cgemmsup_cv_zen4_asm_fx1 1
#define K_bli_cgemm_zen4_asm_24x4 1
#define K_bli_cgemm_zen4_asm_4x24 1
#define K_bli_zgemmsup_cd_zen4_asm_12x2m 1
#define K_bli_zgemmsup_cd_zen4_asm_12x4m 1
#define K_bli_zgemmsup_cd_zen4_asm_2x2 1
@@ -82,54 +191,117 @@
#define K_bli_zgemmsup_cd_zen4_asm_4x4 1
#define K_bli_zgemmsup_cd_zen4_asm_8x2 1
#define K_bli_zgemmsup_cd_zen4_asm_8x4 1
#define K_bli_dgemmsup_rv_zen4_asm_24x8m_new 1
#define K_bli_dgemv_t_zen_int 1
#define K_bli_dgemv_t_zen_int_16x7m 1
#define K_bli_dgemv_t_zen_int_16x6m 1
#define K_bli_dgemv_t_zen_int_16x5m 1
#define K_bli_dgemv_t_zen_int_16x4m 1
#define K_bli_dgemv_t_zen_int_16x3m 1
#define K_bli_dgemv_t_zen_int_16x2m 1
#define K_bli_dgemv_t_zen_int_16x1m 1
#define K_bli_dgemv_t_zen4_int 1
#define K_bli_dgemv_t_zen4_int_32x7m 1
#define K_bli_dgemv_t_zen4_int_32x6m 1
#define K_bli_dgemv_t_zen4_int_32x5m 1
#define K_bli_dgemv_t_zen4_int_32x4m 1
#define K_bli_dgemv_t_zen4_int_32x3m 1
#define K_bli_dgemv_t_zen4_int_32x2m 1
#define K_bli_dgemv_t_zen4_int_32x1m 1
#define K_bli_ztrsm_small_ZEN5 1
#define K_bli_dgemv_n_zen_int_16mx8_avx512 1
#define K_bli_dgemv_n_zen_int_16mx7_avx512 1
#define K_bli_dgemv_n_zen_int_16mx6_avx512 1
#define K_bli_dgemv_n_zen_int_16mx5_avx512 1
#define K_bli_dgemv_n_zen_int_16mx4_avx512 1
#define K_bli_dgemv_n_zen_int_16mx3_avx512 1
#define K_bli_dgemv_n_zen_int_16mx2_avx512 1
#define K_bli_dgemv_n_zen_int_16mx1_avx512 1
#define K_bli_dgemv_n_zen_int_32x8n_avx512 1
#define K_bli_dgemv_n_zen_int_16x8n_avx512 1
#define K_bli_dgemv_n_zen_int_8x8n_avx512 1
#define K_bli_dgemv_n_zen_int_m_leftx8n_avx512 1
#define K_bli_dgemv_n_zen_int_32x4n_avx512 1
#define K_bli_dgemv_n_zen_int_16x4n_avx512 1
#define K_bli_dgemv_n_zen_int_8x4n_avx512 1
#define K_bli_dgemv_n_zen_int_m_leftx4n_avx512 1
#define K_bli_dgemv_n_zen_int_32x3n_avx512 1
#define K_bli_dgemv_n_zen_int_16x3n_avx512 1
#define K_bli_dgemv_n_zen_int_8x3n_avx512 1
#define K_bli_dgemv_n_zen_int_m_leftx3n_avx512 1
#define K_bli_dgemv_n_zen_int_32x2n_avx512 1
#define K_bli_dgemv_n_zen_int_16x2n_avx512 1
#define K_bli_dgemv_n_zen_int_8x2n_avx512 1
#define K_bli_dgemv_n_zen_int_m_leftx2n_avx512 1
#define K_bli_dgemv_n_zen_int_32x1n_avx512 1
#define K_bli_dgemv_n_zen_int_16x1n_avx512 1
#define K_bli_dgemv_n_zen_int_8x1n_avx512 1
#define K_bli_dgemv_n_zen_int_m_leftx1n_avx512 1
#define K_bli_dcopyv_zen4_asm_avx512_biway 1
#define K_bli_dcopyv_zen5_asm_avx512 1
// Fallback aliases: each #ifndef guard defines the K_ macro only when it has
// not been defined earlier, and maps it to a kernel symbol whose name differs
// from the macro name.
// dgemmsup 24x8m_new: the _cv_ macro name maps to the _rv_ symbol.
#ifndef K_bli_dgemmsup_cv_zen4_asm_24x8m_new
#define K_bli_dgemmsup_cv_zen4_asm_24x8m_new bli_dgemmsup_rv_zen4_asm_24x8m_new
#endif
// dgemv_t zen: maps to the symbol carrying the _avx2 suffix.
#ifndef K_bli_dgemv_t_zen_int
#define K_bli_dgemv_t_zen_int bli_dgemv_t_zen_int_avx2
#endif
// Presence flags (value 1) for the mx1..mx7 _avx2 kernels; no alias is
// provided for these in this block.
#define K_bli_dgemv_t_zen_int_mx7_avx2 1
#define K_bli_dgemv_t_zen_int_mx6_avx2 1
#define K_bli_dgemv_t_zen_int_mx5_avx2 1
#define K_bli_dgemv_t_zen_int_mx4_avx2 1
#define K_bli_dgemv_t_zen_int_mx3_avx2 1
#define K_bli_dgemv_t_zen_int_mx2_avx2 1
#define K_bli_dgemv_t_zen_int_mx1_avx2 1
// dgemv_t zen4: maps to the symbol carrying the _avx512 suffix.
#ifndef K_bli_dgemv_t_zen4_int
#define K_bli_dgemv_t_zen4_int bli_dgemv_t_zen_int_avx512
#endif
// Presence flags (value 1) for the mx1..mx7 _avx512 kernels.
#define K_bli_dgemv_t_zen_int_mx7_avx512 1
#define K_bli_dgemv_t_zen_int_mx6_avx512 1
#define K_bli_dgemv_t_zen_int_mx5_avx512 1
#define K_bli_dgemv_t_zen_int_mx4_avx512 1
#define K_bli_dgemv_t_zen_int_mx3_avx512 1
#define K_bli_dgemv_t_zen_int_mx2_avx512 1
#define K_bli_dgemv_t_zen_int_mx1_avx512 1
// ztrsm_small: lower-case _zen5 macro name maps to the upper-case _ZEN5 symbol.
#ifndef K_bli_ztrsm_small_zen5
#define K_bli_ztrsm_small_zen5 bli_ztrsm_small_ZEN5
#endif
// Fallback aliases for the dgemv_n kernels. Every triple below follows the
// same pattern: if K_bli_dgemv_n_zen4_int_<shape> is not already defined,
// define it as bli_dgemv_n_zen_int_<shape>_avx512 (i.e. the "zen4" macro name
// is mapped onto the symbol that uses the "zen ... _avx512" spelling).
// Because of the #ifndef guards, any definition made before this point wins.
// 16mx8 .. 16mx1 variants.
#ifndef K_bli_dgemv_n_zen4_int_16mx8
#define K_bli_dgemv_n_zen4_int_16mx8 bli_dgemv_n_zen_int_16mx8_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16mx7
#define K_bli_dgemv_n_zen4_int_16mx7 bli_dgemv_n_zen_int_16mx7_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16mx6
#define K_bli_dgemv_n_zen4_int_16mx6 bli_dgemv_n_zen_int_16mx6_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16mx5
#define K_bli_dgemv_n_zen4_int_16mx5 bli_dgemv_n_zen_int_16mx5_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16mx4
#define K_bli_dgemv_n_zen4_int_16mx4 bli_dgemv_n_zen_int_16mx4_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16mx3
#define K_bli_dgemv_n_zen4_int_16mx3 bli_dgemv_n_zen_int_16mx3_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16mx2
#define K_bli_dgemv_n_zen4_int_16mx2 bli_dgemv_n_zen_int_16mx2_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16mx1
#define K_bli_dgemv_n_zen4_int_16mx1 bli_dgemv_n_zen_int_16mx1_avx512
#endif
// <rows>x8n variants (32, 16, 8, m_left).
#ifndef K_bli_dgemv_n_zen4_int_32x8n
#define K_bli_dgemv_n_zen4_int_32x8n bli_dgemv_n_zen_int_32x8n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16x8n
#define K_bli_dgemv_n_zen4_int_16x8n bli_dgemv_n_zen_int_16x8n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_8x8n
#define K_bli_dgemv_n_zen4_int_8x8n bli_dgemv_n_zen_int_8x8n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_m_leftx8n
#define K_bli_dgemv_n_zen4_int_m_leftx8n bli_dgemv_n_zen_int_m_leftx8n_avx512
#endif
// <rows>x4n variants.
#ifndef K_bli_dgemv_n_zen4_int_32x4n
#define K_bli_dgemv_n_zen4_int_32x4n bli_dgemv_n_zen_int_32x4n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16x4n
#define K_bli_dgemv_n_zen4_int_16x4n bli_dgemv_n_zen_int_16x4n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_8x4n
#define K_bli_dgemv_n_zen4_int_8x4n bli_dgemv_n_zen_int_8x4n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_m_leftx4n
#define K_bli_dgemv_n_zen4_int_m_leftx4n bli_dgemv_n_zen_int_m_leftx4n_avx512
#endif
// <rows>x3n variants.
#ifndef K_bli_dgemv_n_zen4_int_32x3n
#define K_bli_dgemv_n_zen4_int_32x3n bli_dgemv_n_zen_int_32x3n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16x3n
#define K_bli_dgemv_n_zen4_int_16x3n bli_dgemv_n_zen_int_16x3n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_8x3n
#define K_bli_dgemv_n_zen4_int_8x3n bli_dgemv_n_zen_int_8x3n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_m_leftx3n
#define K_bli_dgemv_n_zen4_int_m_leftx3n bli_dgemv_n_zen_int_m_leftx3n_avx512
#endif
// <rows>x2n variants.
#ifndef K_bli_dgemv_n_zen4_int_32x2n
#define K_bli_dgemv_n_zen4_int_32x2n bli_dgemv_n_zen_int_32x2n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16x2n
#define K_bli_dgemv_n_zen4_int_16x2n bli_dgemv_n_zen_int_16x2n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_8x2n
#define K_bli_dgemv_n_zen4_int_8x2n bli_dgemv_n_zen_int_8x2n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_m_leftx2n
#define K_bli_dgemv_n_zen4_int_m_leftx2n bli_dgemv_n_zen_int_m_leftx2n_avx512
#endif
// <rows>x1n variants.
#ifndef K_bli_dgemv_n_zen4_int_32x1n
#define K_bli_dgemv_n_zen4_int_32x1n bli_dgemv_n_zen_int_32x1n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_16x1n
#define K_bli_dgemv_n_zen4_int_16x1n bli_dgemv_n_zen_int_16x1n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_8x1n
#define K_bli_dgemv_n_zen4_int_8x1n bli_dgemv_n_zen_int_8x1n_avx512
#endif
#ifndef K_bli_dgemv_n_zen4_int_m_leftx1n
#define K_bli_dgemv_n_zen4_int_m_leftx1n bli_dgemv_n_zen_int_m_leftx1n_avx512
#endif
#define AOCL_50
#endif
@@ -141,35 +313,75 @@
#define K_bli_ccopyv_zen_int 1
#define K_bli_cscal2v_zen_int 1
#define K_bli_cscalv_zen_int 1
#define K_bli_cscalv_zen_int_avx512 1
#ifndef K_bli_cscalv_zen4_int
#define K_bli_cscalv_zen4_int bli_cscalv_zen_int_avx512
#endif
#define K_bli_csetv_zen_int 1
#define K_bli_daddv_zen_int 1
#define K_bli_daddv_zen_int_avx512 1
#define K_bli_daxpbyv_zen_int_avx512 1
#define K_bli_daxpyf_zen_int_avx512 1
#define K_bli_dcopyv_zen4_asm_avx512 1
#define K_bli_dgemm_avx512_asm_8x24 1
#define K_bli_dnorm2fv_unb_var1_avx512 1
#ifndef K_bli_daddv_zen4_int
#define K_bli_daddv_zen4_int bli_daddv_zen_int_avx512
#endif
#ifndef K_bli_daxpbyv_zen4_int
#define K_bli_daxpbyv_zen4_int bli_daxpbyv_zen_int_avx512
#endif
#ifndef K_bli_daxpyf_zen4_int
#define K_bli_daxpyf_zen4_int bli_daxpyf_zen_int_avx512
#endif
#ifndef K_bli_dcopyv_zen4_asm
#define K_bli_dcopyv_zen4_asm bli_dcopyv_zen4_asm_avx512
#endif
#ifndef K_bli_dgemm_zen4_asm_8x24
#define K_bli_dgemm_zen4_asm_8x24 bli_dgemm_avx512_asm_8x24
#endif
#ifndef K_bli_dnorm2fv_zen4_int_unb_var1
#define K_bli_dnorm2fv_zen4_int_unb_var1 bli_dnorm2fv_unb_var1_avx512
#endif
#define K_bli_dscal2v_zen_int 1
#define K_bli_dscal2v_zen_int_avx512 1
#define K_bli_dsetv_zen_int_avx512 1
#ifndef K_bli_dscal2v_zen4_int
#define K_bli_dscal2v_zen4_int bli_dscal2v_zen_int_avx512
#endif
#ifndef K_bli_dsetv_zen4_int
#define K_bli_dsetv_zen4_int bli_dsetv_zen_int_avx512
#endif
#define K_bli_saddv_zen_int 1
#define K_bli_scopyv_zen4_asm_avx512 1
#ifndef K_bli_scopyv_zen4_asm
#define K_bli_scopyv_zen4_asm bli_scopyv_zen4_asm_avx512
#endif
#define K_bli_sscal2v_zen_int 1
#define K_bli_ssetv_zen_int_avx512 1
#ifndef K_bli_ssetv_zen4_int
#define K_bli_ssetv_zen4_int bli_ssetv_zen_int_avx512
#endif
#define K_bli_zaddv_zen_int 1
#define K_bli_zaxpyf_zen_int_8_avx512 1
#define K_bli_zaxpyv_zen_int_avx512 1
#define K_bli_zcopyv_zen4_asm_avx512 1
#define K_bli_zdotv_zen4_asm_avx512 1
#define K_bli_zdotv_zen_int_avx512 1
#define K_bli_zgemm_16x4_avx512_k1_nn 1
#define K_bli_zscalv_zen_int_avx512 1
#ifndef K_bli_zaxpyf_zen4_int_8
#define K_bli_zaxpyf_zen4_int_8 bli_zaxpyf_zen_int_8_avx512
#endif
#ifndef K_bli_zaxpyv_zen4_int
#define K_bli_zaxpyv_zen4_int bli_zaxpyv_zen_int_avx512
#endif
#ifndef K_bli_zcopyv_zen4_asm
#define K_bli_zcopyv_zen4_asm bli_zcopyv_zen4_asm_avx512
#endif
#ifndef K_bli_zdotv_zen4_asm
#define K_bli_zdotv_zen4_asm bli_zdotv_zen4_asm_avx512
#endif
#ifndef K_bli_zdotv_zen4_int
#define K_bli_zdotv_zen4_int bli_zdotv_zen_int_avx512
#endif
#ifndef K_bli_zgemm_zen4_int_16x4_k1_nn
#define K_bli_zgemm_zen4_int_16x4_k1_nn bli_zgemm_16x4_avx512_k1_nn
#endif
#ifndef K_bli_zscalv_zen4_int
#define K_bli_zscalv_zen4_int bli_zscalv_zen_int_avx512
#endif
#define K_bli_zsetv_zen_int 1
#define K_bli_zsetv_zen_int_avx512 1
#ifndef K_bli_zsetv_zen4_int
#define K_bli_zsetv_zen4_int bli_zsetv_zen_int_avx512
#endif
// In AOCL 4.2 but interface changed at 5.0
// The first line is a presence flag (value 1) for the _avx2-suffixed name.
// The guarded define then maps K_bli_zgemm_zen_int_4x4_k1_nn onto the
// bli_zgemm_4x4_avx2_k1_nn symbol, unless that macro was already defined.
#define K_bli_zgemm_4x4_avx2_k1_nn 1
#ifndef K_bli_zgemm_zen_int_4x4_k1_nn
#define K_bli_zgemm_zen_int_4x4_k1_nn bli_zgemm_4x4_avx2_k1_nn
#endif
#define AOCL_42
@@ -180,13 +392,19 @@
#define E_GEMM_COMPUTE
// Presence flags (value 1) for the two _avx512-named kernels.
#define K_bli_dgemm_24x8_avx512_k1_nn 1
#define K_bli_zdscalv_zen_int_avx512 1
// Fallback aliases: map the zen4-style macro names onto the _avx512-named
// symbols, unless the macros were already defined.
#ifndef K_bli_dgemm_zen4_int_24x8_k1_nn
#define K_bli_dgemm_zen4_int_24x8_k1_nn bli_dgemm_24x8_avx512_k1_nn
#endif
#ifndef K_bli_zdscalv_zen4_int
#define K_bli_zdscalv_zen4_int bli_zdscalv_zen_int_avx512
#endif
#define K_bli_zgemm_zen4_asm_4x12 1
#define K_bli_zgemm_zen_asm_2x6 1
// In AOCL 4.1 but interface changed at 4.2
// The first line is a presence flag (value 1) for the _avx2-suffixed name.
// The guarded define then maps K_bli_dgemm_zen_int_8x6_k1_nn onto the
// bli_dgemm_8x6_avx2_k1_nn symbol, unless that macro was already defined.
#define K_bli_dgemm_8x6_avx2_k1_nn 1
#ifndef K_bli_dgemm_zen_int_8x6_k1_nn
#define K_bli_dgemm_zen_int_8x6_k1_nn bli_dgemm_8x6_avx2_k1_nn
#endif
#define AOCL_41
@@ -196,7 +414,9 @@
#ifdef AOCL_41
#define K_bli_caxpbyv_zen_int 1
#define K_bli_caxpyv_zen_int5 1
#ifndef K_bli_caxpyv_zen_int_5
#define K_bli_caxpyv_zen_int_5 bli_caxpyv_zen_int5
#endif
#define K_bli_cgemm_haswell_asm_3x8 1
#define K_bli_cgemmsup_rv_zen_asm_1x2 1
#define K_bli_cgemmsup_rv_zen_asm_1x4 1
@@ -213,68 +433,128 @@
#define K_bli_cgemmsup_rv_zen_asm_3x8m 1
#define K_bli_cgemmsup_rv_zen_asm_3x8n 1
#define K_bli_damaxv_zen_int 1
#define K_bli_damaxv_zen_int_avx512 1
#ifndef K_bli_damaxv_zen4_int
#define K_bli_damaxv_zen4_int bli_damaxv_zen_int_avx512
#endif
#define K_bli_daxpbyv_zen_int 1
#define K_bli_daxpbyv_zen_int10 1
#ifndef K_bli_daxpbyv_zen_int_10
#define K_bli_daxpbyv_zen_int_10 bli_daxpbyv_zen_int10
#endif
#define K_bli_daxpyv_zen_int 1
#define K_bli_daxpyv_zen_int10 1
#define K_bli_daxpyv_zen_int_avx512 1
#ifndef K_bli_daxpyv_zen_int_10
#define K_bli_daxpyv_zen_int_10 bli_daxpyv_zen_int10
#endif
#ifndef K_bli_daxpyv_zen4_int
#define K_bli_daxpyv_zen4_int bli_daxpyv_zen_int_avx512
#endif
#define K_bli_dcopyv_zen_int 1
#define K_bli_ddotv_zen_int 1
#define K_bli_ddotv_zen_int10 1
#define K_bli_ddotv_zen_int_avx512 1
#ifndef K_bli_ddotv_zen_int_10
#define K_bli_ddotv_zen_int_10 bli_ddotv_zen_int10
#endif
#ifndef K_bli_ddotv_zen4_int
#define K_bli_ddotv_zen4_int bli_ddotv_zen_int_avx512
#endif
#define K_bli_dgemm_haswell_asm_6x8 1
#define K_bli_dgemm_zen4_asm_32x6 1
#define K_bli_dgemm_zen4_asm_8x24 1
#ifndef K_bli_dgemm_zen4_asm_8x24
#define K_bli_dgemm_zen4_asm_8x24 bli_dgemm_zen4_asm_8x24
#endif
#define K_bli_dgemmsup_rd_haswell_asm_6x8m 1
#define K_bli_dgemmsup_rd_haswell_asm_6x8n 1
#define K_bli_dgemmsup_rv_haswell_asm_6x8m 1
#define K_bli_dgemmsup_rv_haswell_asm_6x8n 1
#define K_bli_dgemmsup_rv_zen4_asm_24x8m 1
#define K_bli_dgemmsup_rv_zen5_asm_24x8m 1
#ifndef K_bli_dgemmsup_cv_zen4_asm_24x8m
#define K_bli_dgemmsup_cv_zen4_asm_24x8m bli_dgemmsup_rv_zen4_asm_24x8m
#endif
#ifndef K_bli_dgemmsup_cv_zen5_asm_24x8m
#define K_bli_dgemmsup_cv_zen5_asm_24x8m bli_dgemmsup_rv_zen5_asm_24x8m
#endif
#define K_bli_dgemmtrsm_l_haswell_asm_6x8 1
#define K_bli_dgemmtrsm_l_zen4_asm_8x24 1
#define K_bli_dgemmtrsm_u_haswell_asm_6x8 1
#define K_bli_dgemmtrsm_u_zen4_asm_8x24 1
#define K_bli_dnorm2fv_unb_var1_avx2 1
#ifndef K_bli_dnorm2fv_zen_int_unb_var1
#define K_bli_dnorm2fv_zen_int_unb_var1 bli_dnorm2fv_unb_var1_avx2
#endif
#define K_bli_dscalv_zen_int 1
#define K_bli_dscalv_zen_int10 1
#define K_bli_dscalv_zen_int_avx512 1
#ifndef K_bli_dscalv_zen_int_10
#define K_bli_dscalv_zen_int_10 bli_dscalv_zen_int10
#endif
#ifndef K_bli_dscalv_zen4_int
#define K_bli_dscalv_zen4_int bli_dscalv_zen_int_avx512
#endif
#define K_bli_dsetv_zen_int 1
#define K_bli_dswapv_zen_int8 1
#define K_bli_dznorm2fv_unb_var1_avx2 1
#ifndef K_bli_dswapv_zen_int_8
#define K_bli_dswapv_zen_int_8 bli_dswapv_zen_int8
#endif
#ifndef K_bli_dznorm2fv_zen_int_unb_var1
#define K_bli_dznorm2fv_zen_int_unb_var1 bli_dznorm2fv_unb_var1_avx2
#endif
#define K_bli_samaxv_zen_int 1
#define K_bli_samaxv_zen_int_avx512 1
#ifndef K_bli_samaxv_zen4_int
#define K_bli_samaxv_zen4_int bli_samaxv_zen_int_avx512
#endif
#define K_bli_saxpbyv_zen_int 1
#define K_bli_saxpbyv_zen_int10 1
#ifndef K_bli_saxpbyv_zen_int_10
#define K_bli_saxpbyv_zen_int_10 bli_saxpbyv_zen_int10
#endif
#define K_bli_saxpyv_zen_int 1
#define K_bli_saxpyv_zen_int10 1
#define K_bli_saxpyv_zen_int_avx512 1
#define K_bli_scnorm2fv_unb_var1_avx2 1
#ifndef K_bli_saxpyv_zen_int_10
#define K_bli_saxpyv_zen_int_10 bli_saxpyv_zen_int10
#endif
#ifndef K_bli_saxpyv_zen4_int
#define K_bli_saxpyv_zen4_int bli_saxpyv_zen_int_avx512
#endif
#ifndef K_bli_scnorm2fv_zen_int_unb_var1
#define K_bli_scnorm2fv_zen_int_unb_var1 bli_scnorm2fv_unb_var1_avx2
#endif
#define K_bli_scopyv_zen_int 1
#define K_bli_sgemm_haswell_asm_6x16 1
#define K_bli_sgemm_skx_asm_32x12_l2 1
#define K_bli_sgemmsup_rd_zen_asm_6x16m 1
#define K_bli_sgemmsup_rd_zen_asm_6x16n 1
#define K_bli_sgemmsup_rd_zen_asm_6x64m_avx512 1
#define K_bli_sgemmsup_rd_zen_asm_6x64n_avx512 1
#ifndef K_bli_sgemmsup_rd_zen4_asm_6x64m
#define K_bli_sgemmsup_rd_zen4_asm_6x64m bli_sgemmsup_rd_zen_asm_6x64m_avx512
#endif
#ifndef K_bli_sgemmsup_rd_zen4_asm_6x64n
#define K_bli_sgemmsup_rd_zen4_asm_6x64n bli_sgemmsup_rd_zen_asm_6x64n_avx512
#endif
#define K_bli_sgemmsup_rv_zen_asm_6x16m 1
#define K_bli_sgemmsup_rv_zen_asm_6x16n 1
#define K_bli_sgemmsup_rv_zen_asm_6x64m_avx512 1
#define K_bli_sgemmsup_rv_zen_asm_6x64n_avx512 1
#ifndef K_bli_sgemmsup_rv_zen4_asm_6x64m
#define K_bli_sgemmsup_rv_zen4_asm_6x64m bli_sgemmsup_rv_zen_asm_6x64m_avx512
#endif
#ifndef K_bli_sgemmsup_rv_zen4_asm_6x64n
#define K_bli_sgemmsup_rv_zen4_asm_6x64n bli_sgemmsup_rv_zen_asm_6x64n_avx512
#endif
#define K_bli_sgemmtrsm_l_haswell_asm_6x16 1
#define K_bli_sgemmtrsm_u_haswell_asm_6x16 1
#define K_bli_snorm2fv_unb_var1_avx2 1
#ifndef K_bli_snorm2fv_zen_int_unb_var1
#define K_bli_snorm2fv_zen_int_unb_var1 bli_snorm2fv_unb_var1_avx2
#endif
#define K_bli_sscalv_zen_int 1
#define K_bli_sscalv_zen_int10 1
#ifndef K_bli_sscalv_zen_int_10
#define K_bli_sscalv_zen_int_10 bli_sscalv_zen_int10
#endif
#define K_bli_ssetv_zen_int 1
#define K_bli_sswapv_zen_int8 1
#define K_bli_trsm_small 1
#define K_bli_trsm_small_AVX512 1
#ifndef K_bli_sswapv_zen_int_8
#define K_bli_sswapv_zen_int_8 bli_sswapv_zen_int8
#endif
// Fallback aliases for the small-trsm routines: the _zen macro maps to the
// unsuffixed bli_trsm_small symbol and the _zen4 macro maps to
// bli_trsm_small_AVX512. Each is defined only if not already defined.
#ifndef K_bli_trsm_small_zen
#define K_bli_trsm_small_zen bli_trsm_small
#endif
#ifndef K_bli_trsm_small_zen4
#define K_bli_trsm_small_zen4 bli_trsm_small_AVX512
#endif
#define K_bli_zaxpbyv_zen_int 1
#define K_bli_zaxpyv_zen_int5 1
#ifndef K_bli_zaxpyv_zen_int_5
#define K_bli_zaxpyv_zen_int_5 bli_zaxpyv_zen_int5
#endif
#define K_bli_zcopyv_zen_int 1
#define K_bli_zdscalv_zen_int10 1
#ifndef K_bli_zdscalv_zen_int_10
#define K_bli_zdscalv_zen_int_10 bli_zdscalv_zen_int10
#endif
#define K_bli_zgemm_haswell_asm_3x4 1
#define K_bli_zgemm_zen4_asm_12x4 1
#define K_bli_zgemmsup_cv_zen4_asm_12x1m 1
@@ -322,6 +602,6 @@
// If kernels have been removed, we need to undefine them here.
#ifdef AOCL_51
#undef K_bli_dgemm_zen4_asm_8x24
#endif
//#ifdef AOCL_51
// #undef K_bli_dgemm_zen4_asm_8x24
//#endif

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Portions of this file consist of AI-generated content.
Redistribution and use in source and binary forms, with or without
@@ -146,8 +146,8 @@ INSTANTIATE_TEST_SUITE_P(
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_daddv_zen_int_avx512 kernel.
The code structure for bli_daddv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_daddv_zen4_int kernel.
The code structure for bli_daddv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 64 --> L64
Fringe loops : In blocks of 32 --> L32
@@ -157,12 +157,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
#ifdef K_bli_daddv_zen_int_avx512
#ifdef K_bli_daddv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daddv_zen_int_avx512_unitStrides,
bli_daddv_zen4_int_unitStrides,
daddvGeneric,
::testing::Combine(
::testing::Values(bli_daddv_zen_int_avx512), // kernel address
::testing::Values(K_bli_daddv_zen4_int), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(64), // size n, for L64
@@ -179,12 +179,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_daddv_zen_int_avx512
#ifdef K_bli_daddv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daddv_zen_int_avx512_nonUnitStrides,
bli_daddv_zen4_int_nonUnitStrides,
daddvGeneric,
::testing::Combine(
::testing::Values(bli_daddv_zen_int_avx512), // kernel address
::testing::Values(K_bli_daddv_zen4_int), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(7), // size n, for LScalar

View File

@@ -141,8 +141,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_damaxv_zen_int_avx512 kernel.
The code structure for bli_damaxv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_damaxv_zen4_int kernel.
The code structure for bli_damaxv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 64 --> L64
@@ -154,12 +154,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_damaxv_zen_int_avx512
#ifdef K_bli_damaxv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_damaxv_zen_int_avx512_unitStrides,
bli_damaxv_zen4_int_unitStrides,
damaxvGeneric,
::testing::Combine(
::testing::Values(bli_damaxv_zen_int_avx512), // kernel address
::testing::Values(K_bli_damaxv_zen4_int), // kernel address
::testing::Values(gtint_t(64), // for size n, L64
gtint_t(32), // L32
gtint_t(16), // L16
@@ -174,13 +174,13 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_damaxv_zen_int_avx512
#ifdef K_bli_damaxv_zen4_int
// Unit testing with non-unit strides.
INSTANTIATE_TEST_SUITE_P(
bli_damaxv_zen_int_avx512_nonUnitStrides,
bli_damaxv_zen4_int_nonUnitStrides,
damaxvGeneric,
::testing::Combine(
::testing::Values(bli_damaxv_zen_int_avx512), // kernel address
::testing::Values(K_bli_damaxv_zen4_int), // kernel address
::testing::Values(gtint_t(10), // n, size of the vector
gtint_t(25)),
::testing::Values(gtint_t(5)), // incx

View File

@@ -126,8 +126,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_samaxv_zen_int_avx512 kernel.
The code structure for bli_samaxv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_samaxv_zen4_int kernel.
The code structure for bli_samaxv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 128 --> L128
@@ -139,12 +139,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_samaxv_zen_int_avx512
#ifdef K_bli_samaxv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_samaxv_zen_int_avx512_unitStrides,
bli_samaxv_zen4_int_unitStrides,
samaxvGeneric,
::testing::Combine(
::testing::Values(bli_samaxv_zen_int_avx512), // kernel address
::testing::Values(K_bli_samaxv_zen4_int), // kernel address
::testing::Values(gtint_t(128), // for size n, L128
gtint_t(64), // L64
gtint_t(32), // L32
@@ -160,12 +160,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides.
#ifdef K_bli_samaxv_zen_int_avx512
#ifdef K_bli_samaxv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_samaxv_zen_int_avx512_nonUnitStrides,
bli_samaxv_zen4_int_nonUnitStrides,
samaxvGeneric,
::testing::Combine(
::testing::Values(bli_samaxv_zen_int_avx512), // kernel address
::testing::Values(K_bli_samaxv_zen4_int), // kernel address
::testing::Values(gtint_t(10), // n, size of the vector
gtint_t(25)),
::testing::Values(gtint_t(5)), // incx

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -113,8 +113,8 @@ TEST_P( daxpbyvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_daxpbyv_zen_int10 kernel.
The code structure for bli_daxpbyv_zen_int10( ... ) is as follows :
Unit testing for functionality of bli_daxpbyv_zen_int_10 kernel.
The code structure for bli_daxpbyv_zen_int_10( ... ) is as follows :
For unit strides :
Main loop : In blocks of 40 --> L40
Fringe loops : In blocks of 20 --> L20
@@ -126,12 +126,12 @@ TEST_P( daxpbyvGeneric, UKR )
*/
// Unit testing with unit stride, across all loops.
#ifdef K_bli_daxpbyv_zen_int10
#ifdef K_bli_daxpbyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_daxpbyv_zen_int10_unitStrides,
bli_daxpbyv_zen_int_10_unitStrides,
daxpbyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpbyv_zen_int10), // kernel address
::testing::Values(K_bli_daxpbyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(40), // size n, for L40
@@ -160,12 +160,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_daxpbyv_zen_int10
#ifdef K_bli_daxpbyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_daxpbyv_zen_int10_nonUnitStrides,
bli_daxpbyv_zen_int_10_nonUnitStrides,
daxpbyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpbyv_zen_int10), // kernel address
::testing::Values(K_bli_daxpbyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(gtint_t(10), // n, size of the vector
gtint_t(25)),
@@ -244,8 +244,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_daxpbyv_zen_int_avx512 kernel.
The code structure for bli_daxpbyv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_daxpbyv_zen4_int kernel.
The code structure for bli_daxpbyv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 64 --> L64
Fringe loops : In blocks of 32 --> L32
@@ -257,12 +257,12 @@ INSTANTIATE_TEST_SUITE_P(
*/
// Unit testing with unit stride, across all loops.
#ifdef K_bli_daxpbyv_zen_int_avx512
#ifdef K_bli_daxpbyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daxpbyv_zen_int_avx512_unitStrides,
bli_daxpbyv_zen4_int_unitStrides,
daxpbyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpbyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_daxpbyv_zen4_int), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(64), // size n, for L64
@@ -286,12 +286,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_daxpbyv_zen_int_avx512
#ifdef K_bli_daxpbyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daxpbyv_zen_int_avx512_nonUnitStrides,
bli_daxpbyv_zen4_int_nonUnitStrides,
daxpbyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpbyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_daxpbyv_zen4_int), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(gtint_t(10), // n, size of the vector
gtint_t(25)),

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -114,8 +114,8 @@ TEST_P( saxpbyvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_saxpbyv_zen_int10 kernel.
The code structure for bli_saxpbyv_zen_int10( ... ) is as follows :
Unit testing for functionality of bli_saxpbyv_zen_int_10 kernel.
The code structure for bli_saxpbyv_zen_int_10( ... ) is as follows :
For unit strides :
Main loop : In blocks of 80 --> L80
Fringe loops : In blocks of 40 --> L40
@@ -128,12 +128,12 @@ TEST_P( saxpbyvGeneric, UKR )
*/
// Unit testing with unit stride, across all loops.
#ifdef K_bli_saxpbyv_zen_int10
#ifdef K_bli_saxpbyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_saxpbyv_zen_int10_unitStride,
bli_saxpbyv_zen_int_10_unitStride,
saxpbyvGeneric,
::testing::Combine(
::testing::Values(bli_saxpbyv_zen_int10), // kernel address
::testing::Values(K_bli_saxpbyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(80), // size n, for L80
@@ -161,12 +161,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_saxpbyv_zen_int10
#ifdef K_bli_saxpbyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_saxpbyv_zen_int_unitStride,
saxpbyvGeneric,
::testing::Combine(
::testing::Values(bli_saxpbyv_zen_int10), // kernel address
::testing::Values(K_bli_saxpbyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(gtint_t(10), // n, size of the vector
gtint_t(25)),

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Portions of this file consist of AI-generated content.
Redistribution and use in source and binary forms, with or without
@@ -125,15 +125,15 @@ TEST_P( daxpyfGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_daxpyf_zen_int_avx512 kernel.
Unit testing for functionality of bli_daxpyf_zen4_int kernel.
*/
// Unit testing with unit strides, across all fuse-factors.
#ifdef K_bli_daxpyf_zen_int_avx512
#ifdef K_bli_daxpyf_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daxpyf_zen_int_avx512_unitStrides,
bli_daxpyf_zen4_int_unitStrides,
daxpyfGeneric,
::testing::Combine(
::testing::Values(bli_daxpyf_zen_int_avx512), // kernel address
::testing::Values(K_bli_daxpyf_zen4_int), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(gtint_t(1),
@@ -167,12 +167,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides, across all fuse-factors.
#ifdef K_bli_daxpyf_zen_int_avx512
#ifdef K_bli_daxpyf_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daxpyf_zen_int_avx512_nonUnitStrides,
bli_daxpyf_zen4_int_nonUnitStrides,
daxpyfGeneric,
::testing::Combine(
::testing::Values(bli_daxpyf_zen_int_avx512), // kernel address
::testing::Values(K_bli_daxpyf_zen4_int), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(gtint_t(15), gtint_t(27)), // for size n

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Portions of this file consist of AI-generated content.
Redistribution and use in source and binary forms, with or without
@@ -121,12 +121,12 @@ TEST_P( zaxpyfGeneric, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_zaxpyf_zen_int_8_avx512
#ifdef K_bli_zaxpyf_zen4_int_8
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyf_zen_int_2_avx512_unitStrides,
zaxpyfGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
::testing::Values('n'
#if defined(TEST_BLIS_TYPED)
,'c'
@@ -156,12 +156,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides, across all loops.
#ifdef K_bli_zaxpyf_zen_int_8_avx512
#ifdef K_bli_zaxpyf_zen4_int_8
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyf_zen_int_2_avx512_nonUnitStrides,
zaxpyfGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
::testing::Values('n'
#if defined(TEST_BLIS_TYPED)
,'c'
@@ -195,12 +195,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_zaxpyf_zen_int_8_avx512
#ifdef K_bli_zaxpyf_zen4_int_8
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyf_zen_int_4_avx512_unitStrides,
zaxpyfGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
::testing::Values('n'
#if defined(TEST_BLIS_TYPED)
,'c'
@@ -230,12 +230,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides, across all loops.
#ifdef K_bli_zaxpyf_zen_int_8_avx512
#ifdef K_bli_zaxpyf_zen4_int_8
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyf_zen_int_4_avx512_nonUnitStrides,
zaxpyfGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
::testing::Values('n'
#if defined(TEST_BLIS_TYPED)
,'c'
@@ -259,8 +259,8 @@ INSTANTIATE_TEST_SUITE_P(
#endif
/*
Unit testing for functionality of bli_zaxpyf_zen_int_8_avx512 kernel.
The code structure for bli_zaxpyf_zen_int_8_avx512( ... ) is as follows :
Unit testing for functionality of bli_zaxpyf_zen4_int_8 kernel.
The code structure for bli_zaxpyf_zen4_int_8( ... ) is as follows :
For unit strides :
Main loop : In blocks of 8 --> L8
Fringe loops : In blocks of 4 --> L4
@@ -269,12 +269,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_zaxpyf_zen_int_8_avx512
#ifdef K_bli_zaxpyf_zen4_int_8
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyf_zen_int_8_avx512_unitStrides,
bli_zaxpyf_zen4_int_8_unitStrides,
zaxpyfGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
::testing::Values('n'
#if defined(TEST_BLIS_TYPED)
,'c'
@@ -304,12 +304,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides, across all loops.
#ifdef K_bli_zaxpyf_zen_int_8_avx512
#ifdef K_bli_zaxpyf_zen4_int_8
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyf_zen_int_8_avx512_nonUnitStrides,
bli_zaxpyf_zen4_int_8_nonUnitStrides,
zaxpyfGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
::testing::Values('n'
#if defined(TEST_BLIS_TYPED)
,'c'

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Portions of this file consist of AI-generated content.
Redistribution and use in source and binary forms, with or without
@@ -94,8 +94,8 @@ TEST_P( caxpyvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_caxpyv_zen_int5 kernel.
The code structure for bli_caxpyv_zen_int5( ... ) is as follows :
Unit testing for functionality of bli_caxpyv_zen_int_5 kernel.
The code structure for bli_caxpyv_zen_int_5( ... ) is as follows :
For unit strides :
Main loop : In blocks of 20 --> L20
Fringe loops : In blocks of 8 --> L8
@@ -105,12 +105,12 @@ TEST_P( caxpyvGeneric, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_caxpyv_zen_int5
#ifdef K_bli_caxpyv_zen_int_5
INSTANTIATE_TEST_SUITE_P(
bli_caxpyv_zen_int5_unitStrides,
bli_caxpyv_zen_int_5_unitStrides,
caxpyvGeneric,
::testing::Combine(
::testing::Values(bli_caxpyv_zen_int5), // kernel address
::testing::Values(K_bli_caxpyv_zen_int_5), // kernel address
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // conjx
@@ -138,12 +138,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_caxpyv_zen_int5
#ifdef K_bli_caxpyv_zen_int_5
INSTANTIATE_TEST_SUITE_P(
bli_caxpyv_zen_int5_nonUnitStrides,
bli_caxpyv_zen_int_5_nonUnitStrides,
caxpyvGeneric,
::testing::Combine(
::testing::Values(bli_caxpyv_zen_int5), // kernel address
::testing::Values(K_bli_caxpyv_zen_int_5), // kernel address
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // conjx

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -92,8 +92,8 @@ TEST_P( daxpyvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_daxpyv_zen_int10 kernel.
The code structure for bli_daxpyv_zen_int10( ... ) is as follows :
Unit testing for functionality of bli_daxpyv_zen_int_10 kernel.
The code structure for bli_daxpyv_zen_int_10( ... ) is as follows :
For unit strides :
Main loop : In blocks of 52 --> L52
Fringe loops : In blocks of 40 --> L40
@@ -106,12 +106,12 @@ TEST_P( daxpyvGeneric, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_daxpyv_zen_int10
#ifdef K_bli_daxpyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_daxpyv_zen_int10_unitStrides,
bli_daxpyv_zen_int_10_unitStrides,
daxpyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpyv_zen_int10), // kernel address
::testing::Values(K_bli_daxpyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(52), // size n, for L52
@@ -141,12 +141,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_daxpyv_zen_int10
#ifdef K_bli_daxpyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_daxpyv_zen_int10_nonUnitStrides,
bli_daxpyv_zen_int_10_nonUnitStrides,
daxpyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpyv_zen_int10), // kernel address
::testing::Values(K_bli_daxpyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(gtint_t(10), // n, size of the vector
gtint_t(25)),
@@ -163,7 +163,7 @@ INSTANTIATE_TEST_SUITE_P(
/*
Unit testing for functionality of bli_daxpyv_zen_int kernel.
The code structure for bli_daxpyv_zen_int10( ... ) is as follows :
The code structure for bli_daxpyv_zen_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 16 --> L16
Element wise loop post all these loops.
@@ -216,8 +216,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_daxpyv_zen_int_avx512 kernel.
The code structure for bli_daxpyv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_daxpyv_zen4_int kernel.
The code structure for bli_daxpyv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 64 --> L64
Fringe loops : In blocks of 32 --> L32
@@ -229,12 +229,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_daxpyv_zen_int_avx512
#ifdef K_bli_daxpyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daxpyv_zen_int_avx512_unitStrides,
bli_daxpyv_zen4_int_unitStrides,
daxpyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_daxpyv_zen4_int),// kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(64), // size n, for L64
@@ -262,12 +262,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_daxpyv_zen_int_avx512
#ifdef K_bli_daxpyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_daxpyv_zen_int_avx512_nonUnitStrides,
bli_daxpyv_zen4_int_nonUnitStrides,
daxpyvGeneric,
::testing::Combine(
::testing::Values(bli_daxpyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_daxpyv_zen4_int),// kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(gtint_t(10), // n, size of the vector
gtint_t(25)),

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Portions of this file consist of AI-generated content.
Redistribution and use in source and binary forms, with or without
@@ -84,8 +84,8 @@ TEST_P( saxpyvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_saxpyv_zen_int10 kernel.
The code structure for bli_saxpyv_zen_int10( ... ) is as follows :
Unit testing for functionality of bli_saxpyv_zen_int_10 kernel.
The code structure for bli_saxpyv_zen_int_10( ... ) is as follows :
For unit strides :
Main loop : In blocks of 120 --> L120
Fringe loops : In blocks of 80 --> L80
@@ -98,12 +98,12 @@ TEST_P( saxpyvGeneric, UKR )
For non-unit strides : A single loop, to process element wise.
*/
#ifdef K_bli_saxpyv_zen_int10
#ifdef K_bli_saxpyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_saxpyv_zen_int10_unitStrides,
bli_saxpyv_zen_int_10_unitStrides,
saxpyvGeneric,
::testing::Combine(
::testing::Values(bli_saxpyv_zen_int10), // kernel address
::testing::Values(K_bli_saxpyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(120), // size n, for L120
@@ -128,12 +128,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_saxpyv_zen_int10
#ifdef K_bli_saxpyv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_saxpyv_zen_int10_nonUnitStrides,
bli_saxpyv_zen_int_10_nonUnitStrides,
saxpyvGeneric,
::testing::Combine(
::testing::Values(bli_saxpyv_zen_int10), // kernel address
::testing::Values(K_bli_saxpyv_zen_int_10), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(7), // size n, for LScalar
@@ -205,8 +205,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_saxpyv_zen_int_avx512 kernel.
The code structure for bli_saxpyv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_saxpyv_zen4_int kernel.
The code structure for bli_saxpyv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 128 --> L128
Fringe loops : In blocks of 64 --> L64
@@ -218,12 +218,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
#ifdef K_bli_saxpyv_zen_int_avx512
#ifdef K_bli_saxpyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_saxpyv_zen_int_avx512_unitStrides,
bli_saxpyv_zen4_int_unitStrides,
saxpyvGeneric,
::testing::Combine(
::testing::Values(bli_saxpyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_saxpyv_zen4_int), // kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(128), // size n, for L128
@@ -244,12 +244,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_saxpyv_zen_int_avx512
#ifdef K_bli_saxpyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_saxpyv_zen_int_avx512_nonUnitStrides,
bli_saxpyv_zen4_int_nonUnitStrides,
saxpyvGeneric,
::testing::Combine(
::testing::Values(bli_saxpyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_saxpyv_zen4_int),// kernel address
::testing::Values('n'), // use x, not conj(x) (since it is real)
::testing::Values(// Testing the loops standalone
gtint_t(7), // size n, for LScalar

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Portions of this file consist of AI-generated content.
Redistribution and use in source and binary forms, with or without
@@ -94,7 +94,7 @@ TEST_P( zaxpyvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_zaxpyv_zen_int5 kernel.
Unit testing for functionality of bli_zaxpyv_zen_int_5 kernel.
The code structure for bli_zaxpyv_zen_int_5( ... ) is as follows :
For unit strides :
Main loop : In blocks of 14 --> L14
@@ -107,12 +107,12 @@ TEST_P( zaxpyvGeneric, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_zaxpyv_zen_int5
#ifdef K_bli_zaxpyv_zen_int_5
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyv_zen_int5_unitStrides,
bli_zaxpyv_zen_int_5_unitStrides,
zaxpyvGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyv_zen_int5), // kernel address
::testing::Values(K_bli_zaxpyv_zen_int_5), // kernel address
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // conjx
@@ -144,12 +144,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_zaxpyv_zen_int5
#ifdef K_bli_zaxpyv_zen_int_5
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyv_zen_int5_nonUnitStrides,
bli_zaxpyv_zen_int_5_nonUnitStrides,
zaxpyvGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyv_zen_int5), // kernel address
::testing::Values(K_bli_zaxpyv_zen_int_5), // kernel address
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // conjx
@@ -171,8 +171,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_zaxpyv_zen_int_avx512 kernel.
The code structure for bli_zaxpyv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_zaxpyv_zen4_int kernel.
The code structure for bli_zaxpyv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 32 --> L32
Fringe loops : In blocks of 16 --> L16
@@ -183,12 +183,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_zaxpyv_zen_int_avx512
#ifdef K_bli_zaxpyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyv_zen_int_avx512_unitStrides,
bli_zaxpyv_zen4_int_unitStrides,
zaxpyvGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_zaxpyv_zen4_int), // kernel address
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // conjx
@@ -219,12 +219,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing for non unit strides
#ifdef K_bli_zaxpyv_zen_int_avx512
#ifdef K_bli_zaxpyv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zaxpyv_zen_int_avx512_nonUnitStrides,
bli_zaxpyv_zen4_int_nonUnitStrides,
zaxpyvGeneric,
::testing::Combine(
::testing::Values(bli_zaxpyv_zen_int_avx512), // kernel address
::testing::Values(K_bli_zaxpyv_zen4_int), // kernel address
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // conjx

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -136,8 +136,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_dcopyv_zen4_asm_avx512 kernel.
The code structure for bli_dcopyv_zen4_asm_avx512( ... ) is as follows :
Unit testing for functionality of bli_dcopyv_zen4_asm kernel.
The code structure for bli_dcopyv_zen4_asm( ... ) is as follows :
For unit strides :
Main loop : In blocks of 128 --> L128
Fringe loops : In blocks of 64 --> L64
@@ -149,12 +149,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with Unit Strides(US), across all loops.
#ifdef K_bli_dcopyv_zen4_asm_avx512
#ifdef K_bli_dcopyv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
bli_dcopyv_zen4_asm_avx512_unitStrides,
bli_dcopyv_zen4_asm_unitStrides,
dcopyvGeneric,
::testing::Combine(
::testing::Values(bli_dcopyv_zen4_asm_avx512),
::testing::Values(K_bli_dcopyv_zen4_asm),
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
::testing::Values(// Testing the loops standalone
gtint_t(128), // size n, for L128
@@ -179,12 +179,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with Non-Unit Strides(NUS), across all loops.
#ifdef K_bli_dcopyv_zen4_asm_avx512
#ifdef K_bli_dcopyv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
bli_dcopyv_zen4_asm_avx512_nonUnitStrides,
bli_dcopyv_zen4_asm_nonUnitStrides,
dcopyvGeneric,
::testing::Combine(
::testing::Values(bli_dcopyv_zen4_asm_avx512),
::testing::Values(K_bli_dcopyv_zen4_asm),
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
::testing::Values(gtint_t(5)), // stride size for x
@@ -196,8 +196,8 @@ INSTANTIATE_TEST_SUITE_P(
#endif
/*
Unit testing for functionality of bli_dcopyv_zen4_asm_avx512_biway kernel.
The code structure for bli_dcopyv_zen4_asm_avx512_biway( ... ) is as follows :
Unit testing for functionality of bli_dcopyv_zen4_asm_biway kernel.
The code structure for bli_dcopyv_zen4_asm_biway( ... ) is as follows :
For unit strides :
Main loop : In blocks of 128 --> L128
Fringe loops : In blocks of 64 --> L64
@@ -209,12 +209,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with Unit Strides(US), across all loops.
#ifdef K_bli_dcopyv_zen4_asm_avx512_biway
#ifdef K_bli_dcopyv_zen4_asm_biway
INSTANTIATE_TEST_SUITE_P(
bli_dcopyv_zen4_asm_avx512_biway_unitStrides,
bli_dcopyv_zen4_asm_biway_unitStrides,
dcopyvGeneric,
::testing::Combine(
::testing::Values(bli_dcopyv_zen4_asm_avx512_biway),
::testing::Values(bli_dcopyv_zen4_asm_biway),
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
::testing::Values(// Testing the loops standalone
gtint_t(128), // size n, for L128
@@ -239,12 +239,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with Non-Unit Strides(NUS), across all loops.
#ifdef K_bli_dcopyv_zen4_asm_avx512_biway
#ifdef K_bli_dcopyv_zen4_asm_biway
INSTANTIATE_TEST_SUITE_P(
bli_dcopyv_zen4_asm_avx512_biway_nonUnitStrides,
bli_dcopyv_zen4_asm_biway_nonUnitStrides,
dcopyvGeneric,
::testing::Combine(
::testing::Values(bli_dcopyv_zen4_asm_avx512_biway),
::testing::Values(bli_dcopyv_zen4_asm_biway),
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
::testing::Values(gtint_t(5)), // stride size for x
@@ -258,8 +258,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_dcopyv_zen4_asm_avx512 kernel.
The code structure for bli_dcopyv_zen5_asm_avx512( ... ) is as follows :
Unit testing for functionality of bli_dcopyv_zen5_asm kernel.
The code structure for bli_dcopyv_zen5_asm( ... ) is as follows :
For unit strides :
Main loop : In blocks of 128 --> L128
Fringe loops : In blocks of 64 --> L64
@@ -271,12 +271,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with Unit Strides(US), across all loops.
#ifdef K_bli_dcopyv_zen5_asm_avx512
#ifdef K_bli_dcopyv_zen5_asm
INSTANTIATE_TEST_SUITE_P(
bli_dcopyv_zen5_asm_avx512_unitStrides,
bli_dcopyv_zen5_asm_unitStrides,
dcopyvGeneric,
::testing::Combine(
::testing::Values(bli_dcopyv_zen5_asm_avx512),
::testing::Values(K_bli_dcopyv_zen5_asm),
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
::testing::Values(// Testing the loops standalone
gtint_t(128), // size n, for L128
@@ -301,12 +301,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with Non-Unit Strides(NUS), across all loops.
#ifdef K_bli_dcopyv_zen5_asm_avx512
#ifdef K_bli_dcopyv_zen5_asm
INSTANTIATE_TEST_SUITE_P(
bli_dcopyv_zen5_asm_avx512_nonUnitStrides,
bli_dcopyv_zen5_asm_nonUnitStrides,
dcopyvGeneric,
::testing::Combine(
::testing::Values(bli_dcopyv_zen5_asm_avx512),
::testing::Values(K_bli_dcopyv_zen5_asm),
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
::testing::Values(gtint_t(5)), // stride size for x

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -136,8 +136,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_scopyv_zen4_asm_avx512 kernel.
The code structure for bli_scopyv_zen4_asm_avx512( ... ) is as follows :
Unit testing for functionality of bli_scopyv_zen4_asm kernel.
The code structure for bli_scopyv_zen4_asm( ... ) is as follows :
For unit strides :
Main loop : In blocks of 512 --> L512
Fringe loops : In blocks of 256 --> L256
@@ -150,12 +150,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with Unit Strides(US), across all loops.
#ifdef K_bli_scopyv_zen4_asm_avx512
#ifdef K_bli_scopyv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
bli_scopyv_zen4_asm_avx512_unitStrides,
bli_scopyv_zen4_asm_unitStrides,
scopyvGeneric,
::testing::Combine(
::testing::Values(bli_scopyv_zen4_asm_avx512),
::testing::Values(K_bli_scopyv_zen4_asm),
::testing::Values('n'), // conjugate parameter, 'n' for scopyv
::testing::Values(// Testing the loops standalone
gtint_t(512), // size n, for L512
@@ -182,12 +182,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with Non-Unit Strides(NUS), across all loops.
#ifdef K_bli_scopyv_zen4_asm_avx512
#ifdef K_bli_scopyv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
bli_scopyv_zen4_asm_avx512_nonUnitStrides,
bli_scopyv_zen4_asm_nonUnitStrides,
scopyvGeneric,
::testing::Combine(
::testing::Values(bli_scopyv_zen4_asm_avx512),
::testing::Values(K_bli_scopyv_zen4_asm),
::testing::Values('n'), // conjugate parameter, 'n' for scopyv
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
::testing::Values(gtint_t(5)), // stride size for x

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -141,8 +141,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_zcopyv_zen4_asm_avx512 kernel.
The code structure for bli_zcopyv_zen4_asm_avx512( ... ) is as follows :
Unit testing for functionality of bli_zcopyv_zen4_asm kernel.
The code structure for bli_zcopyv_zen4_asm( ... ) is as follows :
For unit strides :
Main loop : In blocks of 128 --> L128
Fringe loops : In blocks of 64 --> L64
@@ -155,12 +155,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with Unit Strides(US), across all loops.
#ifdef K_bli_zcopyv_zen4_asm_avx512
#ifdef K_bli_zcopyv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
bli_zcopyv_zen4_asm_avx512_unitStrides,
bli_zcopyv_zen4_asm_unitStrides,
zcopyvGeneric,
::testing::Combine(
::testing::Values(bli_zcopyv_zen4_asm_avx512),
::testing::Values(K_bli_zcopyv_zen4_asm),
::testing::Values('n' // n: use x, c: use conj(x)
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.
@@ -191,12 +191,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with Non-Unit Strides(NUS), across all loops.
#ifdef K_bli_zcopyv_zen4_asm_avx512
#ifdef K_bli_zcopyv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
bli_zcopyv_zen4_asm_avx512_nonUnitStrides,
bli_zcopyv_zen4_asm_nonUnitStrides,
zcopyvGeneric,
::testing::Combine(
::testing::Values(bli_zcopyv_zen4_asm_avx512),
::testing::Values(K_bli_zcopyv_zen4_asm),
::testing::Values('n' // n: use x, c: use conj(x)
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.

View File

@@ -163,7 +163,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
// Tests for bli_ddotv_zen_int10 (AVX2) kernel.
// Tests for bli_ddotv_zen_int_10 (AVX2) kernel.
/**
* Loops:
* L20 - Main loop, handles 20 elements
@@ -174,12 +174,12 @@ INSTANTIATE_TEST_SUITE_P(
*
* LNUnit - loop for non-unit increments
*/
#ifdef K_bli_ddotv_zen_int10
#ifdef K_bli_ddotv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_ddotv_zen_int10_unitStride,
bli_ddotv_zen_int_10_unitStride,
ddotvGeneric,
::testing::Combine(
::testing::Values(bli_ddotv_zen_int10),
::testing::Values(K_bli_ddotv_zen_int_10),
// conj(x): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// conj(y): uses n (no_conjugate) since it is real.
@@ -220,12 +220,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_ddotv_zen_int10
#ifdef K_bli_ddotv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_ddotv_zen_int10_nonUnitPositiveStrides,
bli_ddotv_zen_int_10_nonUnitPositiveStrides,
ddotvGeneric,
::testing::Combine(
::testing::Values(bli_ddotv_zen_int10),
::testing::Values(K_bli_ddotv_zen_int_10),
// conj(x): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// conj(y): uses n (no_conjugate) since it is real.
@@ -258,7 +258,7 @@ INSTANTIATE_TEST_SUITE_P(
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Tests for bli_ddotv_zen_int_avx512 (AVX512) kernel.
// Tests for bli_ddotv_zen4_int (AVX512) kernel.
/**
* Loops & If conditions:
* L40 - Main loop, handles 40 elements
@@ -268,12 +268,12 @@ INSTANTIATE_TEST_SUITE_P(
*
* LNUnit - loop for non-unit increments
*/
#ifdef K_bli_ddotv_zen_int_avx512
#ifdef K_bli_ddotv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_ddotv_zen_int_avx512_unitStride,
bli_ddotv_zen4_int_unitStride,
ddotvGeneric,
::testing::Combine(
::testing::Values(bli_ddotv_zen_int_avx512),
::testing::Values(K_bli_ddotv_zen4_int),
// conj(x): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// conj(y): uses n (no_conjugate) since it is real.
@@ -313,12 +313,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_ddotv_zen_int_avx512
#ifdef K_bli_ddotv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_ddotv_zen_int_avx512_nonUnitPositiveStrides,
bli_ddotv_zen4_int_nonUnitPositiveStrides,
ddotvGeneric,
::testing::Combine(
::testing::Values(bli_ddotv_zen_int_avx512),
::testing::Values(K_bli_ddotv_zen4_int),
// conj(x): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// conj(y): uses n (no_conjugate) since it is real.

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -90,7 +90,7 @@ TEST_P( zdotvGeneric, UKR )
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Tests for bli_zdotv_zen_int_avx512 (AVX512) kernel.
// Tests for bli_zdotv_zen4_int (AVX512) kernel.
/**
* Loops & If conditions:
* L32 - Main loop, handles 32 elements
@@ -102,12 +102,12 @@ TEST_P( zdotvGeneric, UKR )
*
* LNUnit - loop for non-unit increments
*/
#ifdef K_bli_zdotv_zen_int_avx512
#ifdef K_bli_zdotv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zdotv_zen_int_avx512_unitStride,
bli_zdotv_zen4_int_unitStride,
zdotvGeneric,
::testing::Combine(
::testing::Values(bli_zdotv_zen_int_avx512),
::testing::Values(K_bli_zdotv_zen4_int),
// conj(x): use n (no_conjugate) or c (conjugate).
::testing::Values('n', 'c'),
// conj(y): use n (no_conjugate) or c (conjugate).
@@ -147,12 +147,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_zdotv_zen_int_avx512
#ifdef K_bli_zdotv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zdotv_zen_int_avx512_nonUnitPositiveStrides,
bli_zdotv_zen4_int_nonUnitPositiveStrides,
zdotvGeneric,
::testing::Combine(
::testing::Values(bli_zdotv_zen_int_avx512),
::testing::Values(K_bli_zdotv_zen4_int),
// conj(x): only n (no_conjugate) is tested here.
::testing::Values('n'),
// conj(y): uses n (no_conjugate).
@@ -176,7 +176,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
// Tests for bli_zdotv_zen_int_avx512 (AVX512) kernel.
// Tests for bli_zdotv_zen4_asm (AVX512) kernel.
/**
* Loops & If conditions:
* L32 - Main loop, handles 32 elements
@@ -188,12 +188,12 @@ INSTANTIATE_TEST_SUITE_P(
*
* LNUnit - loop for non-unit increments
*/
#ifdef K_bli_zdotv_zen4_asm_avx512
#ifdef K_bli_zdotv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
DISABLED_bli_zdotv_zen4_asm_avx512_unitStride,
DISABLED_bli_zdotv_zen4_asm_unitStride,
zdotvGeneric,
::testing::Combine(
::testing::Values(bli_zdotv_zen4_asm_avx512),
::testing::Values(K_bli_zdotv_zen4_asm),
// conj(x): use n (no_conjugate) or c (conjugate).
::testing::Values('n', 'c'),
// conj(y): use n (no_conjugate) or c (conjugate).
@@ -233,12 +233,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_zdotv_zen4_asm_avx512
#ifdef K_bli_zdotv_zen4_asm
INSTANTIATE_TEST_SUITE_P(
bli_zdotv_zen4_asm_avx512_nonUnitPositiveStrides,
bli_zdotv_zen4_asm_nonUnitPositiveStrides,
zdotvGeneric,
::testing::Combine(
::testing::Values(bli_zdotv_zen4_asm_avx512),
::testing::Values(K_bli_zdotv_zen4_asm),
// conj(x): only n (no_conjugate) is tested here.
::testing::Values('n'),
// conj(y): uses n (no_conjugate).

View File

@@ -1643,9 +1643,9 @@ public:
};
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
#ifdef K_bli_cgemm_32x4_avx512_k1_nn
#ifdef K_bli_cgemm_zen4_int_32x4_k1_nn
INSTANTIATE_TEST_SUITE_P(
bli_cgemm_32x4_avx512_k1_nn,
bli_cgemm_zen4_int_32x4_k1_nn,
cgemmUkrk1,
::testing::Combine(
@@ -1656,7 +1656,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('c'), // storage
::testing::Range(gtint_t(1), gtint_t(65), 1), // values of m
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
::testing::Values(bli_cgemm_32x4_avx512_k1_nn),
::testing::Values(K_bli_cgemm_zen4_int_32x4_k1_nn),
::testing::Values(true, false) // memory test
),
::cgemmUkrk1Print()

View File

@@ -271,9 +271,9 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m
INSTANTIATE_TEST_SUITE_P(
bli_dgemmsup_rv_zen4_asm_24x8m_col_stored_c,
bli_dgemmsup_cv_zen4_asm_24x8m_col_stored_c,
dgemmGenericSUP,
::testing::Combine(
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
@@ -282,7 +282,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('c'), // storage of c
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m), // dgemm_sup kernel
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m), // dgemm_sup kernel
::testing::Values(gtint_t(8)), // Micro kernel block MR
::testing::Values('n'), // transa
::testing::Values('n'), // transb
@@ -293,9 +293,9 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m
INSTANTIATE_TEST_SUITE_P(
bli_dgemmsup_rv_zen4_asm_24x8m_row_stored_c,
bli_dgemmsup_cv_zen4_asm_24x8m_row_stored_c,
dgemmGenericSUP,
::testing::Combine(
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
@@ -304,7 +304,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r'), // storage of c
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m), // dgemm_sup kernel
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m), // dgemm_sup kernel
::testing::Values(gtint_t(8)), // Micro kernel block MR
::testing::Values('t'), // transa
::testing::Values('n'), // transb
@@ -315,9 +315,9 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m_new
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m_new
INSTANTIATE_TEST_SUITE_P(
bli_dgemmsup_rv_zen4_asm_24x8m_new_col_stored_c,
bli_dgemmsup_cv_zen4_asm_24x8m_new_col_stored_c,
dgemmGenericSUP,
::testing::Combine(
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
@@ -326,7 +326,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('c'), // storage of c
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m_new), // dgemm_sup kernel
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m_new), // dgemm_sup kernel
::testing::Values(gtint_t(8)), // Micro kernel block MR
::testing::Values('n'), // transa
::testing::Values('n'), // transb
@@ -337,9 +337,9 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m_new
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m_new
INSTANTIATE_TEST_SUITE_P(
bli_dgemmsup_rv_zen4_asm_24x8m_new_row_stored_c,
bli_dgemmsup_cv_zen4_asm_24x8m_new_row_stored_c,
dgemmGenericSUP,
::testing::Combine(
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
@@ -348,7 +348,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r'), // storage of c
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m_new), // dgemm_sup kernel
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m_new), // dgemm_sup kernel
::testing::Values(gtint_t(8)), // Micro kernel block MR
::testing::Values('t'), // transa
::testing::Values('n'), // transb
@@ -363,9 +363,9 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
#ifdef K_bli_dgemmsup_rv_zen5_asm_24x8m
#ifdef K_bli_dgemmsup_cv_zen5_asm_24x8m
INSTANTIATE_TEST_SUITE_P(
bli_dgemmsup_rv_zen5_asm_24x8m_col_stored_c,
bli_dgemmsup_cv_zen5_asm_24x8m_col_stored_c,
dgemmGenericSUP,
::testing::Combine(
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
@@ -374,7 +374,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('c'), // storage of c
::testing::Values(bli_dgemmsup_rv_zen5_asm_24x8m), // dgemm_sup kernel
::testing::Values(K_bli_dgemmsup_cv_zen5_asm_24x8m), // dgemm_sup kernel
::testing::Values(gtint_t(8)), // Micro kernel block MR
::testing::Values('n'), // transa
::testing::Values('n'), // transb
@@ -385,9 +385,9 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemmsup_rv_zen5_asm_24x8m
#ifdef K_bli_dgemmsup_cv_zen5_asm_24x8m
INSTANTIATE_TEST_SUITE_P(
bli_dgemmsup_rv_zen5_asm_24x8m_row_stored_c,
bli_dgemmsup_cv_zen5_asm_24x8m_row_stored_c,
dgemmGenericSUP,
::testing::Combine(
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
@@ -396,7 +396,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r'), // storage of c
::testing::Values(bli_dgemmsup_rv_zen5_asm_24x8m), // dgemm_sup kernel
::testing::Values(K_bli_dgemmsup_cv_zen5_asm_24x8m), // dgemm_sup kernel
::testing::Values(gtint_t(8)), // Micro kernel block MR
::testing::Values('t'), // transa
::testing::Values('n'), // transb
@@ -510,26 +510,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemm_avx512_asm_8x24
INSTANTIATE_TEST_SUITE_P(
bli_dgemm_avx512_asm_8x24,
dgemmGenericNat,
::testing::Combine(
::testing::Range(gtint_t(0), gtint_t(17), 1), // values of k
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r', 'c'), // storage
::testing::Values(8), // values of m
::testing::Values(24), // values of n
::testing::Values(bli_dgemm_avx512_asm_8x24),
::testing::Values(true, false) // memory test
),
::dgemmGenericNatPrint()
);
#endif
#ifdef K_bli_dgemm_zen4_asm_8x24
// Old version of bli_dgemm_avx512_asm_8x24 kernel, removed in 5.1
INSTANTIATE_TEST_SUITE_P(
bli_dgemm_zen4_asm_8x24,
dgemmGenericNat,
@@ -540,7 +521,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('r', 'c'), // storage
::testing::Values(8), // values of m
::testing::Values(24), // values of n
::testing::Values(bli_dgemm_zen4_asm_8x24),
::testing::Values(K_bli_dgemm_zen4_asm_8x24),
::testing::Values(true, false) // memory test
),
::dgemmGenericNatPrint()
@@ -665,9 +646,9 @@ public:
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
#ifdef K_bli_dgemm_24x8_avx512_k1_nn
#ifdef K_bli_dgemm_zen4_int_24x8_k1_nn
INSTANTIATE_TEST_SUITE_P(
bli_dgemm_24x8_avx512_k1_nn,
bli_dgemm_zen4_int_24x8_k1_nn,
dgemmGenericK1,
::testing::Combine(
@@ -676,7 +657,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('c'), // storage
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
::testing::Values(bli_dgemm_24x8_avx512_k1_nn),
::testing::Values(K_bli_dgemm_zen4_int_24x8_k1_nn),
::testing::Values(true, false) // memory test
),
::dgemmGenericK1Print()
@@ -688,9 +669,9 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
#ifdef K_bli_dgemm_8x6_avx2_k1_nn
#ifdef K_bli_dgemm_zen_int_8x6_k1_nn
INSTANTIATE_TEST_SUITE_P(
bli_dgemm_8x6_avx2_k1_nn,
bli_dgemm_zen_int_8x6_k1_nn,
dgemmGenericK1,
::testing::Combine(
::testing::Values(2.0, 1.0, -1.0), // alpha value
@@ -698,7 +679,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('c'), // storage
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of m
::testing::Range(gtint_t(1), gtint_t(7), 1), // values of n
::testing::Values(bli_dgemm_8x6_avx2_k1_nn),
::testing::Values(K_bli_dgemm_zen_int_8x6_k1_nn),
::testing::Values(true, false) // memory test
),
::dgemmGenericK1Print()

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -270,7 +270,7 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
#ifdef K_bli_sgemmsup_rv_zen_asm_6x64m_avx512
#ifdef K_bli_sgemmsup_rv_zen4_asm_6x64m
INSTANTIATE_TEST_SUITE_P(
bli_sgemmsup_rv_zen_asm_6x64m_row_stored_c,
sgemmGenericSUP,
@@ -281,7 +281,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r'), // storage of c
::testing::Values(bli_sgemmsup_rv_zen_asm_6x64m_avx512), // sgemm_sup kernel
::testing::Values(K_bli_sgemmsup_rv_zen4_asm_6x64m), // sgemm_sup kernel
::testing::Values(gtint_t(6)), // Micro kernel block MR
::testing::Values('t'), // transa
::testing::Values('n'), // transb
@@ -292,7 +292,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_sgemmsup_rv_zen_asm_6x64m_avx512
#ifdef K_bli_sgemmsup_rv_zen4_asm_6x64m
INSTANTIATE_TEST_SUITE_P(
bli_sgemmsup_rv_zen_asm_6x64m_col_stored_c,
sgemmGenericSUP,
@@ -303,7 +303,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('c'), // storage of c
::testing::Values(bli_sgemmsup_rv_zen_asm_6x64m_avx512), // sgemm_sup_kernel
::testing::Values(K_bli_sgemmsup_rv_zen4_asm_6x64m), // sgemm_sup_kernel
::testing::Values(gtint_t(6)), // Micro kernel block MR
::testing::Values('n'), // transa
::testing::Values('t'), // transb
@@ -315,7 +315,7 @@ INSTANTIATE_TEST_SUITE_P(
#endif
/*
The bli_sgemmsup_rd_zen_asm_6x64m_avx512(standalone), accepts inputs with the
The bli_sgemmsup_rd_zen4_asm_6x64m(standalone), accepts inputs with the
following contingency for n.
n <= NR, where NR is 64
The code structure for the sgemm_sup rd kernels(m-var) are as follows:
@@ -336,7 +336,7 @@ INSTANTIATE_TEST_SUITE_P(
*/
// Checking with row storage of C
#ifdef K_bli_sgemmsup_rd_zen_asm_6x64m_avx512
#ifdef K_bli_sgemmsup_rd_zen4_asm_6x64m
INSTANTIATE_TEST_SUITE_P(
bli_sgemmsup_rd_zen_asm_6x64m_row_stored_c,
sgemmGenericSUP,
@@ -357,7 +357,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r'), // storage of c
::testing::Values(bli_sgemmsup_rd_zen_asm_6x64m_avx512), // sgemm_sup_kernel
::testing::Values(K_bli_sgemmsup_rd_zen4_asm_6x64m), // sgemm_sup_kernel
::testing::Values(gtint_t(6)), // Micro kernel block MR
::testing::Values('n'), // transa, has to be N for row storage
::testing::Values('t'), // transb, has to be T for row storage
@@ -371,7 +371,7 @@ INSTANTIATE_TEST_SUITE_P(
// Checking with col storage of C
// NOTE : Since we are inducing transpose at operation level, for code coverage, we
// have to interchange m and n instantiations
#ifdef K_bli_sgemmsup_rd_zen_asm_6x64m_avx512
#ifdef K_bli_sgemmsup_rd_zen4_asm_6x64m
INSTANTIATE_TEST_SUITE_P(
bli_sgemmsup_rd_zen_asm_6x64m_col_stored_c,
sgemmGenericSUP,
@@ -392,7 +392,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('c'), // storage of c
::testing::Values(bli_sgemmsup_rd_zen_asm_6x64m_avx512), // sgemm_sup_kernel
::testing::Values(K_bli_sgemmsup_rd_zen4_asm_6x64m), // sgemm_sup_kernel
::testing::Values(gtint_t(6)), // Micro kernel block MR
::testing::Values('t'), // transa, has to be T for col storage
::testing::Values('n'), // transb, has to be N for col storage
@@ -403,7 +403,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_sgemmsup_rv_zen_asm_6x64n_avx512
#ifdef K_bli_sgemmsup_rv_zen4_asm_6x64n
INSTANTIATE_TEST_SUITE_P(
bli_sgemmsup_rv_zen_asm_6x64n_row_stored_c,
sgemmGenericSUP,
@@ -414,7 +414,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r'), // storage of c
::testing::Values(bli_sgemmsup_rv_zen_asm_6x64n_avx512), // sgemm_sup_kernel
::testing::Values(K_bli_sgemmsup_rv_zen4_asm_6x64n), // sgemm_sup_kernel
::testing::Values(gtint_t(6)), // Micro kernel block MR
::testing::Values('t'), // transa
::testing::Values('n'), // transb
@@ -425,7 +425,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_sgemmsup_rd_zen_asm_6x64n_avx512
#ifdef K_bli_sgemmsup_rd_zen4_asm_6x64n
INSTANTIATE_TEST_SUITE_P(
bli_sgemmsup_rd_zen_asm_6x64n_row_stored_c,
sgemmGenericSUP,
@@ -436,7 +436,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2.0, 1.0, -1.0), // alpha value
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
::testing::Values('r'), // storage of c
::testing::Values(bli_sgemmsup_rd_zen_asm_6x64n_avx512), // sgemm_sup_kernel
::testing::Values(K_bli_sgemmsup_rd_zen4_asm_6x64n), // sgemm_sup_kernel
::testing::Values(gtint_t(6)), // Micro kernel block MR
::testing::Values('n'), // transa
::testing::Values('t'), // transb

View File

@@ -1816,9 +1816,9 @@ public:
};
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
#ifdef K_bli_zgemm_16x4_avx512_k1_nn
#ifdef K_bli_zgemm_zen4_int_16x4_k1_nn
INSTANTIATE_TEST_SUITE_P(
bli_zgemm_16x4_avx512_k1_nn,
bli_zgemm_zen4_int_16x4_k1_nn,
zgemmUkrk1,
::testing::Combine(
@@ -1829,7 +1829,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('c'), // storage
::testing::Range(gtint_t(1), gtint_t(33), 1), // values of m
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
::testing::Values(bli_zgemm_16x4_avx512_k1_nn),
::testing::Values(K_bli_zgemm_zen4_int_16x4_k1_nn),
::testing::Values(true, false) // memory test
),
::zgemmUkrk1Print()
@@ -1838,9 +1838,9 @@ INSTANTIATE_TEST_SUITE_P(
#endif
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
#ifdef K_bli_zgemm_4x4_avx2_k1_nn
#ifdef K_bli_zgemm_zen_int_4x4_k1_nn
INSTANTIATE_TEST_SUITE_P(
bli_zgemm_4x4_avx2_k1_nn,
bli_zgemm_zen_int_4x4_k1_nn,
zgemmUkrk1,
::testing::Combine(
::testing::Values(dcomplex{1.0, 0.0}, dcomplex{-1.0, 0.0},
@@ -1850,7 +1850,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('c'), // storage
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of m
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
::testing::Values(bli_zgemm_4x4_avx2_k1_nn),
::testing::Values(K_bli_zgemm_zen_int_4x4_k1_nn),
::testing::Values(true, false) // memory test
),
::zgemmUkrk1Print()

View File

@@ -687,14 +687,14 @@ INSTANTIATE_TEST_SUITE_P(
// -------------------------------
static dgemv_ker_ft_conja m_ker_fp[8] =
{
bli_dgemv_n_zen_int_16mx1_avx512, // n = 1
bli_dgemv_n_zen_int_16mx2_avx512, // n = 2
bli_dgemv_n_zen_int_16mx3_avx512, // n = 3
bli_dgemv_n_zen_int_16mx4_avx512, // n = 4
bli_dgemv_n_zen_int_16mx5_avx512, // n = 5
bli_dgemv_n_zen_int_16mx6_avx512, // n = 6
bli_dgemv_n_zen_int_16mx7_avx512, // n = 7
bli_dgemv_n_zen_int_16mx8_avx512, // n = 8; base kernel
bli_dgemv_n_zen4_int_16mx1, // n = 1
bli_dgemv_n_zen4_int_16mx2, // n = 2
bli_dgemv_n_zen4_int_16mx3, // n = 3
bli_dgemv_n_zen4_int_16mx4, // n = 4
bli_dgemv_n_zen4_int_16mx5, // n = 5
bli_dgemv_n_zen4_int_16mx6, // n = 6
bli_dgemv_n_zen4_int_16mx7, // n = 7
bli_dgemv_n_zen4_int_16mx8, // n = 8; base kernel
};
#define DGEMV_TEST_M(N) \
@@ -721,45 +721,45 @@ static dgemv_ker_ft_conja m_ker_fp[8] =
), \
(::gemvUKRPrint<double, dgemv_ker_ft_conja>()) \
);
#ifdef K_bli_dgemv_n_zen_int_16mx8_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx8
DGEMV_TEST_M(8)
#endif
#ifdef K_bli_dgemv_n_zen_int_16mx7_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx7
DGEMV_TEST_M(7)
#endif
#ifdef K_bli_dgemv_n_zen_int_16mx6_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx6
DGEMV_TEST_M(6)
#endif
#ifdef K_bli_dgemv_n_zen_int_16mx5_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx5
DGEMV_TEST_M(5)
#endif
#ifdef K_bli_dgemv_n_zen_int_16mx4_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx4
DGEMV_TEST_M(4)
#endif
#ifdef K_bli_dgemv_n_zen_int_16mx3_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx3
DGEMV_TEST_M(3)
#endif
#ifdef K_bli_dgemv_n_zen_int_16mx2_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx2
DGEMV_TEST_M(2)
#endif
#ifdef K_bli_dgemv_n_zen_int_16mx1_avx512
#ifdef K_bli_dgemv_n_zen4_int_16mx1
DGEMV_TEST_M(1)
#endif
// 32x8n kernel will handle case where m >= 32.
#ifdef K_bli_dgemv_n_zen_int_32x8n_avx512
#ifdef K_bli_dgemv_n_zen4_int_32x8n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_32x8n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_32x8n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_32x8n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -778,12 +778,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 16x8n kernel will handle case where m = [16, 32).
#ifdef K_bli_dgemv_n_zen_int_16x8n_avx512
#ifdef K_bli_dgemv_n_zen4_int_16x8n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_16x8n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_16x8n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_16x8n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -802,12 +802,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 8x8n kernel will handle case where m = [8, 16).
#ifdef K_bli_dgemv_n_zen_int_8x8n_avx512
#ifdef K_bli_dgemv_n_zen4_int_8x8n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_8x8n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_8x8n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_8x8n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -826,12 +826,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// m_leftx8n kernel will handle case where m = [1, 8).
#ifdef K_bli_dgemv_n_zen_int_m_leftx8n_avx512
#ifdef K_bli_dgemv_n_zen4_int_m_leftx8n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_m_leftx8n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_m_leftx8n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx8n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -850,12 +850,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 32x4n kernel will handle case where m >= 32 and n = 4.
#ifdef K_bli_dgemv_n_zen_int_32x4n_avx512
#ifdef K_bli_dgemv_n_zen4_int_32x4n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_32x4n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_32x4n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_32x4n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -874,12 +874,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 16x4n kernel will handle case where m = [16, 32) and n = 4.
#ifdef K_bli_dgemv_n_zen_int_16x4n_avx512
#ifdef K_bli_dgemv_n_zen4_int_16x4n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_16x4n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_16x4n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_16x4n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -897,12 +897,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 8x4n kernel will handle case where m = [8, 16) and n = 4.
#ifdef K_bli_dgemv_n_zen_int_8x4n_avx512
#ifdef K_bli_dgemv_n_zen4_int_8x4n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_8x4n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_8x4n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_8x4n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -920,12 +920,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// m_leftx4n kernel will handle case where m = [1, 8) and n = 4.
#ifdef K_bli_dgemv_n_zen_int_m_leftx4n_avx512
#ifdef K_bli_dgemv_n_zen4_int_m_leftx4n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_m_leftx4n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_m_leftx4n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx4n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -943,12 +943,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 32x3n kernel will handle case where m >= 32 and n = 3.
#ifdef K_bli_dgemv_n_zen_int_32x3n_avx512
#ifdef K_bli_dgemv_n_zen4_int_32x3n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_32x3n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_32x3n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_32x3n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -967,12 +967,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 16x3n kernel will handle case where m = [16, 32) and n = 3.
#ifdef K_bli_dgemv_n_zen_int_16x3n_avx512
#ifdef K_bli_dgemv_n_zen4_int_16x3n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_16x3n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_16x3n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_16x3n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -990,12 +990,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 8x3n kernel will handle case where m = [8, 16) and n = 3.
#ifdef K_bli_dgemv_n_zen_int_8x3n_avx512
#ifdef K_bli_dgemv_n_zen4_int_8x3n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_8x3n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_8x3n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_8x3n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1013,12 +1013,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// m_leftx3n kernel will handle case where m = [1, 8) and n = 3.
#ifdef K_bli_dgemv_n_zen_int_m_leftx3n_avx512
#ifdef K_bli_dgemv_n_zen4_int_m_leftx3n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_m_leftx3n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_m_leftx3n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx3n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1036,12 +1036,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 32x2n kernel will handle case where m >= 32 and n = 2.
#ifdef K_bli_dgemv_n_zen_int_32x2n_avx512
#ifdef K_bli_dgemv_n_zen4_int_32x2n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_32x2n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_32x2n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_32x2n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1060,12 +1060,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 16x2n kernel will handle case where m = [16, 32) and n = 2.
#ifdef K_bli_dgemv_n_zen_int_16x2n_avx512
#ifdef K_bli_dgemv_n_zen4_int_16x2n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_16x2n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_16x2n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_16x2n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1083,12 +1083,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 8x2n kernel will handle case where m = [8, 16) and n = 2.
#ifdef K_bli_dgemv_n_zen_int_8x2n_avx512
#ifdef K_bli_dgemv_n_zen4_int_8x2n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_8x2n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_8x2n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_8x2n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1106,12 +1106,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// m_leftx2n kernel will handle case where m = [1, 8) and n = 2.
#ifdef K_bli_dgemv_n_zen_int_m_leftx2n_avx512
#ifdef K_bli_dgemv_n_zen4_int_m_leftx2n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_m_leftx2n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_m_leftx2n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx2n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1129,12 +1129,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 32x1n kernel will handle case where m >= 32 and n = 1.
#ifdef K_bli_dgemv_n_zen_int_32x1n_avx512
#ifdef K_bli_dgemv_n_zen4_int_32x1n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_32x1n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_32x1n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_32x1n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1153,12 +1153,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 16x1n kernel will handle case where m = [16, 32) and n = 1.
#ifdef K_bli_dgemv_n_zen_int_16x1n_avx512
#ifdef K_bli_dgemv_n_zen4_int_16x1n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_16x1n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_16x1n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_16x1n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1176,12 +1176,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// 8x1n kernel will handle case where m = [8, 16) and n = 1.
#ifdef K_bli_dgemv_n_zen_int_8x1n_avx512
#ifdef K_bli_dgemv_n_zen4_int_8x1n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_8x1n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_8x1n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_8x1n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1199,12 +1199,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// m_leftx1n kernel will handle case where m = [1, 8) and n = 1.
#ifdef K_bli_dgemv_n_zen_int_m_leftx1n_avx512
#ifdef K_bli_dgemv_n_zen4_int_m_leftx1n
INSTANTIATE_TEST_SUITE_P(
dgemv_n_m_leftx1n_avx512,
dgemvGenericConja,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen_int_m_leftx1n_avx512),
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx1n),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1247,12 +1247,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
#ifdef K_bli_dgemv_n_zen4_40x2_int_st
#ifdef K_bli_dgemv_n_zen4_int_40x2_st
INSTANTIATE_TEST_SUITE_P(
bli_dgemv_n_zen4_40x2_int_st,
bli_dgemv_n_zen4_int_40x2_st,
dgemvGenericTransa,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen4_40x2_int_st),
::testing::Values(bli_dgemv_n_zen4_int_40x2_st),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1271,12 +1271,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_n_zen4_40x2_int_mt)
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_n_zen4_int_40x2_mt)
INSTANTIATE_TEST_SUITE_P(
bli_dgemv_n_zen4_40x2_int_mt,
bli_dgemv_n_zen4_int_40x2_mt,
dgemvGenericTransa,
::testing::Combine(
::testing::Values(bli_dgemv_n_zen4_40x2_int_mt),
::testing::Values(bli_dgemv_n_zen4_int_40x2_mt),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1295,12 +1295,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_m_zen4_40x8_int_st
#ifdef K_bli_dgemv_m_zen4_int_40x8_st
INSTANTIATE_TEST_SUITE_P(
bli_dgemv_m_zen4_40x8_int_st,
bli_dgemv_m_zen4_int_40x8_st,
dgemvGenericTransa,
::testing::Combine(
::testing::Values(bli_dgemv_m_zen4_40x8_int_st),
::testing::Values(bli_dgemv_m_zen4_int_40x8_st),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1319,12 +1319,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_40x8_int_mt_Ndiv)
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_int_40x8_mt_Ndiv)
INSTANTIATE_TEST_SUITE_P(
bli_dgemv_m_zen4_40x8_int_mt_Ndiv,
bli_dgemv_m_zen4_int_40x8_mt_Ndiv,
dgemvGenericTransa,
::testing::Combine(
::testing::Values(bli_dgemv_m_zen4_40x8_int_mt_Ndiv),
::testing::Values(bli_dgemv_m_zen4_int_40x8_mt_Ndiv),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1343,12 +1343,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv)
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_int_40x8_mt_Mdiv)
INSTANTIATE_TEST_SUITE_P(
bli_dgemv_m_zen4_40x8_int_mt_Mdiv,
bli_dgemv_m_zen4_int_40x8_mt_Mdiv,
dgemvGenericTransa,
::testing::Combine(
::testing::Values(bli_dgemv_m_zen4_40x8_int_mt_Mdiv),
::testing::Values(bli_dgemv_m_zen4_int_40x8_mt_Mdiv),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx
@@ -1367,12 +1367,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv)
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_int_40x8_mt_Mdiv_Ndiv)
INSTANTIATE_TEST_SUITE_P(
bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv,
bli_dgemv_m_zen4_int_40x8_mt_Mdiv_Ndiv,
dgemvGenericTransa,
::testing::Combine(
::testing::Values(bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv),
::testing::Values(bli_dgemv_m_zen4_int_40x8_mt_Mdiv_Ndiv),
::testing::Values( 'c' ), // storage format
::testing::Values( 'n' ), // transa
::testing::Values( 'n' ), // conjx

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -72,8 +72,8 @@ TEST_P( dnrm2Generic, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_dnorm2fv_unb_var1_avx2 kernel.
The code structure for bli_dnorm2fv_unb_var1_avx2( ... ) is as follows :
Unit testing for functionality of bli_dnorm2fv_zen_int_unb_var1 kernel.
The code structure for bli_dnorm2fv_zen_int_unb_var1( ... ) is as follows :
For unit strides :
Main loop : In blocks of 8 --> L8
Fringe loops : In blocks of 4 --> L4
@@ -82,12 +82,12 @@ TEST_P( dnrm2Generic, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_dnorm2fv_unb_var1_avx2
#ifdef K_bli_dnorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_dnorm2fv_unb_var1_avx2_unitStrides,
bli_dnorm2fv_zen_int_unb_var1_unitStrides,
dnrm2Generic,
::testing::Combine(
::testing::Values(bli_dnorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_dnorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(8), // size n, for L8
@@ -105,12 +105,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides.
#ifdef K_bli_dnorm2fv_unb_var1_avx2
#ifdef K_bli_dnorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_dnorm2fv_unb_var1_avx2_nonUnitStrides,
bli_dnorm2fv_zen_int_unb_var1_nonUnitStrides,
dnrm2Generic,
::testing::Combine(
::testing::Values(bli_dnorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_dnorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(25), // n, size of the vector
@@ -127,8 +127,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_dnorm2fv_unb_var1_avx512 kernel.
The code structure for bli_dnorm2fv_unb_var1_avx512( ... ) is as follows :
Unit testing for functionality of bli_dnorm2fv_zen4_int_unb_var1 kernel.
The code structure for bli_dnorm2fv_zen4_int_unb_var1( ... ) is as follows :
For unit strides :
Main loop : In blocks of 32 --> L32
Fringe loops : In blocks of 16 --> L16
@@ -138,12 +138,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_dnorm2fv_unb_var1_avx512
#ifdef K_bli_dnorm2fv_zen4_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_dnorm2fv_unb_var1_avx512_unitStrides,
bli_dnorm2fv_zen4_int_unb_var1_unitStrides,
dnrm2Generic,
::testing::Combine(
::testing::Values(bli_dnorm2fv_unb_var1_avx512), // ukr function
::testing::Values(K_bli_dnorm2fv_zen4_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(32), // size n, for L32
@@ -162,12 +162,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides.
#ifdef K_bli_dnorm2fv_unb_var1_avx512
#ifdef K_bli_dnorm2fv_zen4_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_dnorm2fv_unb_var1_avx512_nonUnitStrides,
bli_dnorm2fv_zen4_int_unb_var1_nonUnitStrides,
dnrm2Generic,
::testing::Combine(
::testing::Values(bli_dnorm2fv_unb_var1_avx512), // ukr function
::testing::Values(K_bli_dnorm2fv_zen4_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(25), // n, size of the vector

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -72,8 +72,8 @@ TEST_P( dznrm2Generic, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_dznorm2fv_unb_var1_avx2 kernel.
The code structure for bli_dznorm2fv_unb_var1_avx2( ... ) is as follows :
Unit testing for functionality of bli_dznorm2fv_zen_int_unb_var1 kernel.
The code structure for bli_dznorm2fv_zen_int_unb_var1( ... ) is as follows :
For unit strides :
Main loop : In blocks of 4 --> L4
Fringe loops : In blocks of 2 --> L2
@@ -82,12 +82,12 @@ TEST_P( dznrm2Generic, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_dznorm2fv_unb_var1_avx2
#ifdef K_bli_dznorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_dznorm2fv_unb_var1_avx2_unitStrides,
bli_dznorm2fv_zen_int_unb_var1_unitStrides,
dznrm2Generic,
::testing::Combine(
::testing::Values(bli_dznorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_dznorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(4), // size n, for L4
@@ -105,12 +105,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides.
#ifdef K_bli_dznorm2fv_unb_var1_avx2
#ifdef K_bli_dznorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_dznorm2fv_unb_var1_avx2_nonUnitStrides,
bli_dznorm2fv_zen_int_unb_var1_nonUnitStrides,
dznrm2Generic,
::testing::Combine(
::testing::Values(bli_dznorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_dznorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(25), // n, size of the vector

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -72,8 +72,8 @@ TEST_P( scnrm2Generic, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_scnorm2fv_unb_var1_avx2 kernel.
The code structure for bli_scnorm2fv_unb_var1_avx2( ... ) is as follows :
Unit testing for functionality of bli_scnorm2fv_zen_int_unb_var1 kernel.
The code structure for bli_scnorm2fv_zen_int_unb_var1( ... ) is as follows :
For unit strides :
Main loop : In blocks of 16 --> L16
Fringe loops : In blocks of 12 --> L12
@@ -85,12 +85,12 @@ TEST_P( scnrm2Generic, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_scnorm2fv_unb_var1_avx2
#ifdef K_bli_scnorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_scnorm2fv_unb_var1_avx2_unitStrides,
bli_scnorm2fv_zen_int_unb_var1_unitStrides,
scnrm2Generic,
::testing::Combine(
::testing::Values(bli_scnorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_scnorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(64), // size n, for L16
@@ -106,12 +106,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides.
#ifdef K_bli_scnorm2fv_unb_var1_avx2
#ifdef K_bli_scnorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_scnorm2fv_unb_var1_avx2_nonUnitStrides,
bli_scnorm2fv_zen_int_unb_var1_nonUnitStrides,
scnrm2Generic,
::testing::Combine(
::testing::Values(bli_scnorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_scnorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(25), // n, size of the vector

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -72,8 +72,8 @@ TEST_P( snrm2Generic, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
/*
Unit testing for functionality of bli_snorm2fv_unb_var1_avx2 kernel.
The code structure for bli_snorm2fv_unb_var1_avx2( ... ) is as follows :
Unit testing for functionality of bli_snorm2fv_zen_int_unb_var1 kernel.
The code structure for bli_snorm2fv_zen_int_unb_var1( ... ) is as follows :
For unit strides :
Main loop : In blocks of 32 --> L32
Fringe loops : In blocks of 24 --> L24
@@ -85,12 +85,12 @@ TEST_P( snrm2Generic, UKR )
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_snorm2fv_unb_var1_avx2
#ifdef K_bli_snorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_snorm2fv_unb_var1_avx2_unitStrides,
bli_snorm2fv_zen_int_unb_var1_unitStrides,
snrm2Generic,
::testing::Combine(
::testing::Values(bli_snorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_snorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(64), // size n, for L32
@@ -106,12 +106,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides.
#ifdef K_bli_snorm2fv_unb_var1_avx2
#ifdef K_bli_snorm2fv_zen_int_unb_var1
INSTANTIATE_TEST_SUITE_P(
bli_snorm2fv_unb_var1_avx2_nonUnitStrides,
bli_snorm2fv_zen_int_unb_var1_nonUnitStrides,
snrm2Generic,
::testing::Combine(
::testing::Values(bli_snorm2fv_unb_var1_avx2), // ukr function
::testing::Values(K_bli_snorm2fv_zen_int_unb_var1), // ukr function
// m size of vector
::testing::Values(// Testing the loops standalone
gtint_t(25), // n, size of the vector

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -163,8 +163,8 @@ INSTANTIATE_TEST_SUITE_P(
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_dscal2v_zen_int_avx512 kernel.
The code structure for bli_dscal2v_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_dscal2v_zen4_int kernel.
The code structure for bli_dscal2v_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 64 --> L64
Fringe loops : In blocks of 32 --> L32
@@ -175,12 +175,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
#ifdef K_bli_dscal2v_zen_int_avx512
#ifdef K_bli_dscal2v_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_dscal2v_zen_int_avx512_unitPositiveStride,
bli_dscal2v_zen4_int_unitPositiveStride,
dscal2vGeneric,
::testing::Combine(
::testing::Values(bli_dscal2v_zen_int_avx512),
::testing::Values(K_bli_dscal2v_zen4_int),
// conjx: uses n (no_conjugate) since it is real.
::testing::Values('n'),
::testing::Values(// Testing the loops standalone
@@ -201,12 +201,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dscal2v_zen_int_avx512
#ifdef K_bli_dscal2v_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_dscal2v_zen_int_avx512_nonUnitPositiveStrides,
bli_dscal2v_zen4_int_nonUnitPositiveStrides,
dscal2vGeneric,
::testing::Combine(
::testing::Values(bli_dscal2v_zen_int_avx512),
::testing::Values(K_bli_dscal2v_zen4_int),
// conjx: uses n (no_conjugate) since it is real.
::testing::Values('n'),
::testing::Values(// Testing the loops standalone

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -180,7 +180,7 @@ INSTANTIATE_TEST_SUITE_P(
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Tests for bli_cscalv_zen_int_avx512 (AVX512) kernel.
// Tests for bli_cscalv_zen4_int (AVX512) kernel.
/**
* Loops:
* L96 - Main loop, handles 96 scomplex elements
@@ -193,12 +193,12 @@ INSTANTIATE_TEST_SUITE_P(
*
* LScalar - handles non-unit increments
*/
#ifdef K_bli_cscalv_zen_int_avx512
#ifdef K_bli_cscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_cscalv_zen_int_avx512_unitPositiveStride,
bli_cscalv_zen4_int_unitPositiveStride,
cscalvGeneric,
::testing::Combine(
::testing::Values(bli_cscalv_zen_int_avx512),
::testing::Values(K_bli_cscalv_zen4_int),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
@@ -236,12 +236,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_cscalv_zen_int_avx512
#ifdef K_bli_cscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_cscalv_zen_int_avx512_nonUnitPositiveStrides,
bli_cscalv_zen4_int_nonUnitPositiveStrides,
cscalvGeneric,
::testing::Combine(
::testing::Values(bli_cscalv_zen_int_avx512),
::testing::Values(K_bli_cscalv_zen4_int),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -160,7 +160,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
// Tests for bli_dscalv_zen_int10 (AVX2) kernel.
// Tests for bli_dscalv_zen_int_10 (AVX2) kernel.
/**
* Cases and Loops:
* C0 L64 - Main loop, handles 64 elements
@@ -172,12 +172,12 @@ INSTANTIATE_TEST_SUITE_P(
*
* LNUnit - loop for non-unit increments
*/
#ifdef K_bli_dscalv_zen_int10
#ifdef K_bli_dscalv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_dscalv_zen_int10_unitPositiveStride,
bli_dscalv_zen_int_10_unitPositiveStride,
dscalvGeneric,
::testing::Combine(
::testing::Values(bli_dscalv_zen_int10),
::testing::Values(K_bli_dscalv_zen_int_10),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
@@ -223,12 +223,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dscalv_zen_int10
#ifdef K_bli_dscalv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_dscalv_zen_int10_nonUnitPositiveStrides,
bli_dscalv_zen_int_10_nonUnitPositiveStrides,
dscalvGeneric,
::testing::Combine(
::testing::Values(bli_dscalv_zen_int10),
::testing::Values(K_bli_dscalv_zen_int_10),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
@@ -260,7 +260,7 @@ INSTANTIATE_TEST_SUITE_P(
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Tests for bli_dscalv_zen_int_avx512 (AVX512) kernel.
// Tests for bli_dscalv_zen4_int (AVX512) kernel.
/**
* Loops:
* L64 - Main loop, handles 64 elements
@@ -271,12 +271,12 @@ INSTANTIATE_TEST_SUITE_P(
* L2 - handles 2 elements
* LScalar - leftover loop (also handles non-unit increments)
*/
#ifdef K_bli_dscalv_zen_int_avx512
#ifdef K_bli_dscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_dscalv_zen_int_avx512_unitPositiveStride,
bli_dscalv_zen4_int_unitPositiveStride,
dscalvGeneric,
::testing::Combine(
::testing::Values(bli_dscalv_zen_int_avx512),
::testing::Values(K_bli_dscalv_zen4_int),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
@@ -335,12 +335,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dscalv_zen_int_avx512
#ifdef K_bli_dscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_dscalv_zen_int_avx512_nonUnitPositiveStrides,
bli_dscalv_zen4_int_nonUnitPositiveStrides,
dscalvGeneric,
::testing::Combine(
::testing::Values(bli_dscalv_zen_int_avx512),
::testing::Values(K_bli_dscalv_zen4_int),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -159,7 +159,7 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
// Tests for bli_sscalv_zen_int10 (AVX2) kernel.
// Tests for bli_sscalv_zen_int_10 (AVX2) kernel.
/**
* Cases and Loops:
* C0 L128 - Main loop, handles 128 elements
@@ -174,12 +174,12 @@ INSTANTIATE_TEST_SUITE_P(
*
* LNUnit - loop for non-unit increments
*/
#ifdef K_bli_sscalv_zen_int10
#ifdef K_bli_sscalv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_sscalv_zen_int10_unitPositiveStride,
bli_sscalv_zen_int_10_unitPositiveStride,
sscalvGeneric,
::testing::Combine(
::testing::Values(bli_sscalv_zen_int10),
::testing::Values(K_bli_sscalv_zen_int_10),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
@@ -219,12 +219,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_sscalv_zen_int10
#ifdef K_bli_sscalv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_sscalv_zen_int10_nonUnitPositiveStrides,
bli_sscalv_zen_int_10_nonUnitPositiveStrides,
sscalvGeneric,
::testing::Combine(
::testing::Values(bli_sscalv_zen_int10),
::testing::Values(K_bli_sscalv_zen_int_10),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -93,7 +93,7 @@ TEST_P( zdscalvGeneric, UKR )
// ----- Begin ZEN1/2/3 (AVX2) Kernel Tests -----
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
// Tests for bli_zdscalv_zen_int10 (AVX2) kernel.
// Tests for bli_zdscalv_zen_int_10 (AVX2) kernel.
/**
* Loops:
* L30 - Main loop, handles 30 elements
@@ -104,12 +104,12 @@ TEST_P( zdscalvGeneric, UKR )
* L2 - handles 2 elements
* LScalar - leftover loop (also handles non-unit increments)
*/
#ifdef K_bli_zdscalv_zen_int10
#ifdef K_bli_zdscalv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_zdscalv_zen_int10_unitPositiveStride,
bli_zdscalv_zen_int_10_unitPositiveStride,
zdscalvGeneric,
::testing::Combine(
::testing::Values(bli_zdscalv_zen_int10),
::testing::Values(K_bli_zdscalv_zen_int_10),
// conj(alpha): specify if alpha needs to be conjugated.
::testing::Values(
'n',
@@ -147,12 +147,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_zdscalv_zen_int10
#ifdef K_bli_zdscalv_zen_int_10
INSTANTIATE_TEST_SUITE_P(
bli_zdscalv_zen_int10_nonUnitPositiveStride,
bli_zdscalv_zen_int_10_nonUnitPositiveStride,
zdscalvGeneric,
::testing::Combine(
::testing::Values(bli_zdscalv_zen_int10),
::testing::Values(K_bli_zdscalv_zen_int_10),
// conj(alpha): specify if alpha needs to be conjugated.
::testing::Values(
'n',
@@ -189,7 +189,7 @@ INSTANTIATE_TEST_SUITE_P(
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Tests for bli_zdscalv_zen_int_avx512 (AVX512) kernel.
// Tests for bli_zdscalv_zen4_int (AVX512) kernel.
/**
* Loops:
* L16 - Main loop, handles 16 elements
@@ -198,12 +198,12 @@ INSTANTIATE_TEST_SUITE_P(
* L2 - handles 2 elements
* LScalar - leftover loop (also handles non-unit increments)
*/
#ifdef K_bli_zdscalv_zen_int_avx512
#ifdef K_bli_zdscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zdscalv_zen_int_avx512_unitPositiveStride,
bli_zdscalv_zen4_int_unitPositiveStride,
zdscalvGeneric,
::testing::Combine(
::testing::Values(bli_zdscalv_zen_int_avx512),
::testing::Values(K_bli_zdscalv_zen4_int),
// conj(alpha): specify if alpha needs to be conjugated.
::testing::Values(
'n',
@@ -236,12 +236,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_zdscalv_zen_int_avx512
#ifdef K_bli_zdscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zdscalv_zen_int_avx512_nonUnitPositiveStrides,
bli_zdscalv_zen4_int_nonUnitPositiveStrides,
zdscalvGeneric,
::testing::Combine(
::testing::Values(bli_zdscalv_zen_int_avx512),
::testing::Values(K_bli_zdscalv_zen4_int),
// conj(alpha): specify if alpha needs to be conjugated.
::testing::Values(
'n',

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -178,7 +178,7 @@ INSTANTIATE_TEST_SUITE_P(
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Tests for bli_zscalv_zen_int_avx512 (AVX512) kernel.
// Tests for bli_zscalv_zen4_int (AVX512) kernel.
/**
* Loops:
* L48 - Main loop, handles 48 elements
@@ -189,12 +189,12 @@ INSTANTIATE_TEST_SUITE_P(
* L2 - handles 2 elements
* LScalar - leftover loop (also handles non-unit increments)
*/
#ifdef K_bli_zscalv_zen_int_avx512
#ifdef K_bli_zscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zscalv_zen_int_avx512_unitPositiveStride,
bli_zscalv_zen4_int_unitPositiveStride,
zscalvGeneric,
::testing::Combine(
::testing::Values(bli_zscalv_zen_int_avx512),
::testing::Values(K_bli_zscalv_zen4_int),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
@@ -230,12 +230,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_zscalv_zen_int_avx512
#ifdef K_bli_zscalv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zscalv_zen_int_avx512_nonUnitPositiveStrides,
bli_zscalv_zen4_int_nonUnitPositiveStrides,
zscalvGeneric,
::testing::Combine(
::testing::Values(bli_zscalv_zen_int_avx512),
::testing::Values(K_bli_zscalv_zen4_int),
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -144,8 +144,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_dsetv_zen_int_avx512 kernel.
The code structure for bli_dsetv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_dsetv_zen4_int kernel.
The code structure for bli_dsetv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 256 --> L256
Fringe loops : In blocks of 128 --> L128
@@ -159,12 +159,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with Unit Strides(US), across all loops.
#ifdef K_bli_dsetv_zen_int_avx512
#ifdef K_bli_dsetv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_dsetv_zen_int_avx512_unitStrides,
bli_dsetv_zen4_int_unitStrides,
dsetvGeneric,
::testing::Combine(
::testing::Values(bli_dsetv_zen_int_avx512),
::testing::Values(K_bli_dsetv_zen4_int),
::testing::Values('n', 'c'), // conjalpha
::testing::Values(double(2.2)), // alpha
::testing::Values(// Testing the loops standalone
@@ -201,12 +201,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides, across all loops.
#ifdef K_bli_dsetv_zen_int_avx512
#ifdef K_bli_dsetv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_dsetv_zen_int_avx512_nonUnitStrides,
bli_dsetv_zen4_int_nonUnitStrides,
dsetvGeneric,
::testing::Combine(
::testing::Values(bli_dsetv_zen_int_avx512),
::testing::Values(K_bli_dsetv_zen4_int),
::testing::Values('n', 'c'), // conjalpha
::testing::Values(double(2.2)), // alpha
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -144,8 +144,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_ssetv_zen_int_avx512 kernel.
The code structure for bli_ssetv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_ssetv_zen4_int kernel.
The code structure for bli_ssetv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 512 --> L512
Fringe loops : In blocks of 256 --> L256
@@ -158,12 +158,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_ssetv_zen_int_avx512
#ifdef K_bli_ssetv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_ssetv_zen_int_avx512_unitStrides,
bli_ssetv_zen4_int_unitStrides,
ssetvGeneric,
::testing::Combine(
::testing::Values(bli_ssetv_zen_int_avx512),
::testing::Values(K_bli_ssetv_zen4_int),
::testing::Values('n', 'c'), // conjalpha
::testing::Values(float(1.2)), // alpha
::testing::Values(// Testing the loops standalone
@@ -197,12 +197,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides, across all loops.
#ifdef K_bli_ssetv_zen_int_avx512
#ifdef K_bli_ssetv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_ssetv_zen_int_avx512_nonUnitStrides,
bli_ssetv_zen4_int_nonUnitStrides,
ssetvGeneric,
::testing::Combine(
::testing::Values(bli_ssetv_zen_int_avx512),
::testing::Values(K_bli_ssetv_zen4_int),
::testing::Values('n', 'c'), // conjalpha
::testing::Values(float(1.2)), // alpha
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -152,8 +152,8 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
/*
Unit testing for functionality of bli_zsetv_zen_int_avx512 kernel.
The code structure for bli_zsetv_zen_int_avx512( ... ) is as follows :
Unit testing for functionality of bli_zsetv_zen4_int kernel.
The code structure for bli_zsetv_zen4_int( ... ) is as follows :
For unit strides :
Main loop : In blocks of 128 --> L128
Fringe loops : In blocks of 64 --> L64
@@ -167,12 +167,12 @@ INSTANTIATE_TEST_SUITE_P(
For non-unit strides : A single loop, to process element wise.
*/
// Unit testing with unit strides, across all loops.
#ifdef K_bli_zsetv_zen_int_avx512
#ifdef K_bli_zsetv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zsetv_zen_int_avx512_unitStrides,
bli_zsetv_zen4_int_unitStrides,
zsetvGeneric,
::testing::Combine(
::testing::Values(bli_zsetv_zen_int_avx512),
::testing::Values(K_bli_zsetv_zen4_int),
::testing::Values('n' // conjx
#ifdef TEST_BLIS_TYPED
, 'c'
@@ -213,12 +213,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
// Unit testing with non-unit strides, across all loops.
#ifdef K_bli_zsetv_zen_int_avx512
#ifdef K_bli_zsetv_zen4_int
INSTANTIATE_TEST_SUITE_P(
bli_zsetv_zen_int_avx512_nonUnitStrides,
bli_zsetv_zen4_int_nonUnitStrides,
zsetvGeneric,
::testing::Combine(
::testing::Values(bli_zsetv_zen_int_avx512),
::testing::Values(K_bli_zsetv_zen4_int),
::testing::Values('n' // conjx
#ifdef TEST_BLIS_TYPED
, 'c'

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -75,16 +75,16 @@ TEST_P( dswapvGeneric, UKR )
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
// Tests for bli_dswapv_zen_int8 (AVX2) kernel.
// Tests for bli_dswapv_zen_int_8 (AVX2) kernel.
// For unit inc on x and y:
// Optimised code is available for n = 32, 16, 8, 4
#ifdef K_bli_dswapv_zen_int8
#ifdef K_bli_dswapv_zen_int_8
INSTANTIATE_TEST_SUITE_P(
UnitIncrements,
dswapvGeneric,
::testing::Combine(
::testing::Values(bli_dswapv_zen_int8),
::testing::Values(K_bli_dswapv_zen_int_8),
// n: size of vector.
::testing::Values(
gtint_t(1), gtint_t(2), gtint_t(4), gtint_t(8), gtint_t(16), gtint_t(32),
@@ -108,12 +108,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dswapv_zen_int8
#ifdef K_bli_dswapv_zen_int_8
INSTANTIATE_TEST_SUITE_P(
NonUnitIncrements,
dswapvGeneric,
::testing::Combine(
::testing::Values(bli_dswapv_zen_int8),
::testing::Values(K_bli_dswapv_zen_int_8),
// n: size of vector.
::testing::Values(
gtint_t(1),

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -75,16 +75,16 @@ TEST_P( sswapvGeneric, UKR )
// ----------------------------------------------
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
// Tests for bli_dswapv_zen_int8 (AVX2) kernel.
// Tests for bli_sswapv_zen_int_8 (AVX2) kernel.
// For unit inc on x and y:
// When n values are 64, 32, 16, 8, 4 it is avx2 optimised
#ifdef K_bli_sswapv_zen_int8
#ifdef K_bli_sswapv_zen_int_8
INSTANTIATE_TEST_SUITE_P(
UnitIncrements,
sswapvGeneric,
::testing::Combine(
::testing::Values(bli_sswapv_zen_int8),
::testing::Values(K_bli_sswapv_zen_int_8),
// n: size of vector.
::testing::Values(
gtint_t(1), gtint_t(2), gtint_t(8), gtint_t(16), gtint_t(32),
@@ -108,12 +108,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_sswapv_zen_int8
#ifdef K_bli_sswapv_zen_int_8
INSTANTIATE_TEST_SUITE_P(
NonUnitIncrements,
sswapvGeneric,
::testing::Combine(
::testing::Values(bli_sswapv_zen_int8),
::testing::Values(K_bli_sswapv_zen_int_8),
// n: size of vector.
::testing::Values(
gtint_t(1),

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -88,12 +88,12 @@ TEST_P( ctrsmGenericSmall, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
#ifdef K_bli_trsm_small
#ifdef K_bli_trsm_small_zen
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small,
bli_trsm_small_zen,
ctrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small), // ker_ptr
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
::testing::Values('l', 'r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -165,12 +165,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_trsm_small_AVX512
#ifdef K_bli_trsm_small_zen4
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small_AVX512,
bli_trsm_small_zen4,
dtrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small_AVX512), // ker_ptr
::testing::Values(K_bli_trsm_small_zen4), // ker_ptr
::testing::Values('l', 'r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga
@@ -235,12 +235,12 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
#ifdef K_bli_trsm_small
#ifdef K_bli_trsm_small_zen
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small,
bli_trsm_small_zen,
dtrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small), // ker_ptr
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
::testing::Values('l', 'r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -169,12 +169,12 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
#ifdef K_bli_trsm_small
#ifdef K_bli_trsm_small_zen
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small,
bli_trsm_small_zen,
strsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small), // ker_ptr
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
::testing::Values('l', 'r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga

View File

@@ -135,12 +135,12 @@ TEST_P( ztrsmGenericSmall, UKR )
#if defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
#ifdef K_bli_ztrsm_small_ZEN5
#ifdef K_bli_ztrsm_small_zen5
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small_ZEN5_r,
bli_trsm_small_zen5_r,
ztrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small_ZEN5), // ker_ptr
::testing::Values(K_bli_trsm_small_zen5), // ker_ptr
::testing::Values('r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga
@@ -157,10 +157,10 @@ INSTANTIATE_TEST_SUITE_P(
);
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small_ZEN5_l,
bli_trsm_small_zen5_l,
ztrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small_ZEN5), // ker_ptr
::testing::Values(K_bli_trsm_small_zen5), // ker_ptr
::testing::Values('l'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga
@@ -177,10 +177,10 @@ INSTANTIATE_TEST_SUITE_P(
);
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small_ZEN5_gemm,
bli_trsm_small_zen5_gemm,
ztrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small_ZEN5), // ker_ptr
::testing::Values(K_bli_trsm_small_zen5), // ker_ptr
::testing::Values('l', 'r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga
@@ -194,7 +194,7 @@ INSTANTIATE_TEST_SUITE_P(
),
(::trsmSmallUKRPrint<dcomplex, trsm_small_ker_ft>())
);
#endif // K_bli_ztrsm_small_ZEN5
#endif // K_bli_ztrsm_small_zen5
#endif // BLIS_ENABLE_SMALL_MATRIX_TRSM
#endif // defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
@@ -248,12 +248,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
#ifdef K_bli_trsm_small_AVX512
#ifdef K_bli_trsm_small_zen4
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small_AVX512,
bli_trsm_small_zen4,
ztrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small_AVX512), // ker_ptr
::testing::Values(K_bli_trsm_small_zen4), // ker_ptr
::testing::Values('l', 'r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga
@@ -324,12 +324,12 @@ INSTANTIATE_TEST_SUITE_P(
#endif
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
#ifdef K_bli_trsm_small
#ifdef K_bli_trsm_small_zen
INSTANTIATE_TEST_SUITE_P(
bli_trsm_small,
bli_trsm_small_zen,
ztrsmGenericSmall,
::testing::Combine(
::testing::Values(bli_trsm_small), // ker_ptr
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
::testing::Values('l', 'r'), // side
::testing::Values('l', 'u'), // uplo
::testing::Values('n', 'u'), // diaga

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -80,7 +80,7 @@ void bli_saxpbyv_zen_int
// When beta = !( 0 or 1 ) --> SSCALV
if ( bli_seq0( *alpha ) )
{
bli_sscalv_zen_int10
bli_sscalv_zen_int_10
(
BLIS_NO_CONJUGATE,
n,
@@ -337,7 +337,7 @@ void bli_daxpbyv_zen_int
// When beta = !( 0 or 1 ) --> DSCALV
if ( bli_deq0( *alpha ) )
{
bli_dscalv_zen_int10
bli_dscalv_zen_int_10
(
BLIS_NO_CONJUGATE,
n,
@@ -648,7 +648,7 @@ void bli_caxpbyv_zen_int
}
else
{
bli_caxpyv_zen_int5
bli_caxpyv_zen_int_5
(
conjx,
n,
@@ -1331,7 +1331,7 @@ void bli_zaxpbyv_zen_int
}
else
{
bli_zaxpyv_zen_int5
bli_zaxpyv_zen_int_5
(
conjx,
n,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -57,7 +57,7 @@ typedef union
* x & y are single precision vectors of length n.
* alpha & beta are scalars.
*/
void bli_saxpbyv_zen_int10
void bli_saxpbyv_zen_int_10
(
conj_t conjx,
dim_t n,
@@ -80,7 +80,7 @@ void bli_saxpbyv_zen_int10
// When beta = !( 0 or 1 ) --> SSCALV
if ( bli_seq0( *alpha ) )
{
bli_sscalv_zen_int10
bli_sscalv_zen_int_10
(
BLIS_NO_CONJUGATE,
n,
@@ -733,7 +733,7 @@ void bli_saxpbyv_zen_int10
* x & y are double precision vectors of length n.
* alpha & beta are scalars.
*/
void bli_daxpbyv_zen_int10
void bli_daxpbyv_zen_int_10
(
conj_t conjx,
dim_t n,
@@ -756,7 +756,7 @@ void bli_daxpbyv_zen_int10
// When beta = !( 0 or 1 ) --> DSCALV
if ( bli_deq0( *alpha ) )
{
bli_dscalv_zen_int10
bli_dscalv_zen_int_10
(
BLIS_NO_CONJUGATE,
n,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2016 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2020, The University of Texas at Austin. All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -55,7 +55,7 @@ typedef union
// -----------------------------------------------------------------------------
void bli_saxpyv_zen_int10
void bli_saxpyv_zen_int_10
(
conj_t conjx,
dim_t n,
@@ -340,7 +340,7 @@ void bli_saxpyv_zen_int10
// -----------------------------------------------------------------------------
BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
BLIS_EXPORT_BLIS void bli_daxpyv_zen_int_10
(
conj_t conjx,
dim_t n,
@@ -472,7 +472,7 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
// -----------------------------------------------------------------------------
void bli_caxpyv_zen_int5
void bli_caxpyv_zen_int_5
(
conj_t conjx,
dim_t n,
@@ -772,7 +772,7 @@ void bli_caxpyv_zen_int5
// -----------------------------------------------------------------------------
void bli_zaxpyv_zen_int5
void bli_zaxpyv_zen_int_5
(
conj_t conjx,
dim_t n,

View File

@@ -66,7 +66,7 @@ static int64_t mask_0[4] = {0, 0, 0, 0};
static int64_t *mask_ptr[] = {mask_0, mask_1, mask_2, mask_3};
// -----------------------------------------------------------------------------
void bli_sdotv_zen_int10
void bli_sdotv_zen_int_10
(
conj_t conjx,
conj_t conjy,
@@ -257,7 +257,7 @@ void bli_sdotv_zen_int10
// -----------------------------------------------------------------------------
void bli_ddotv_zen_int10
void bli_ddotv_zen_int_10
(
conj_t conjx,
conj_t conjy,
@@ -426,7 +426,7 @@ void bli_ddotv_zen_int10
// -----------------------------------------------------------------------------
void bli_cdotv_zen_int5
void bli_cdotv_zen_int_5
(
conj_t conjx,
conj_t conjy,
@@ -740,7 +740,7 @@ void bli_cdotv_zen_int5
// -----------------------------------------------------------------------------
void bli_zdotv_zen_int5
void bli_zdotv_zen_int_5
(
conj_t conjx,
conj_t conjy,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -97,7 +97,7 @@ float horizontal_add_sf(__m256 const a) {
}
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
void bli_snorm2fv_unb_var1_avx2
void bli_snorm2fv_zen_int_unb_var1
(
dim_t n,
float* x, inc_t incx,
@@ -834,7 +834,7 @@ void bli_snorm2fv_unb_var1_avx2
}
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
void bli_scnorm2fv_unb_var1_avx2
void bli_scnorm2fv_zen_int_unb_var1
(
dim_t n,
scomplex* x, inc_t incx,
@@ -1601,7 +1601,7 @@ void bli_scnorm2fv_unb_var1_avx2
}
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
void bli_dnorm2fv_unb_var1_avx2
void bli_dnorm2fv_zen_int_unb_var1
(
dim_t n,
double* x, inc_t incx,
@@ -1954,7 +1954,7 @@ void bli_dnorm2fv_unb_var1_avx2
}
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
void bli_dznorm2fv_unb_var1_avx2
void bli_dznorm2fv_zen_int_unb_var1
(
dim_t n,
dcomplex* x, inc_t incx,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2017 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -38,7 +38,7 @@
// -----------------------------------------------------------------------------
void bli_sscalv_zen_int10
void bli_sscalv_zen_int_10
(
conj_t conjalpha,
dim_t n,
@@ -309,7 +309,7 @@ void bli_sscalv_zen_int10
// -----------------------------------------------------------------------------
BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
BLIS_EXPORT_BLIS void bli_dscalv_zen_int_10
(
conj_t conjalpha,
dim_t n,
@@ -582,7 +582,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
}
}
void bli_zdscalv_zen_int10
void bli_zdscalv_zen_int_10
(
conj_t conjalpha,
dim_t n,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -134,7 +134,7 @@ void bli_ssetv_zen_int
}
}
void bli_dsetv_zen_int
void bli_dsetv_zen_int
(
conj_t conjalpha,
dim_t n,
@@ -230,7 +230,7 @@ void bli_dsetv_zen_int
}
}
void bli_csetv_zen_int
void bli_csetv_zen_int
(
conj_t conjalpha,
dim_t n,
@@ -332,7 +332,7 @@ void bli_csetv_zen_int
}
void bli_zsetv_zen_int
void bli_zsetv_zen_int
(
conj_t conjalpha,
dim_t n,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -54,7 +54,7 @@ typedef union
// -----------------------------------------------------------------------------
void bli_sswapv_zen_int8
void bli_sswapv_zen_int_8
(
dim_t n,
float* restrict x, inc_t incx,
@@ -202,7 +202,7 @@ void bli_sswapv_zen_int8
//--------------------------------------------------------------------------------
BLIS_EXPORT_BLIS void bli_dswapv_zen_int8
BLIS_EXPORT_BLIS void bli_dswapv_zen_int_8
(
dim_t n,
double* restrict x, inc_t incx,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -346,7 +346,7 @@ void bli_zaxpyf_zen_int_4
_mm_storeu_pd((double *)&alpha_chi1, temp[0]);
bli_zaxpyv_zen_int5
bli_zaxpyv_zen_int_5
(
conja,
m,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2017 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -87,7 +87,7 @@ void bli_sdotxf_zen_int_8
if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) )
{
bli_sscalv_zen_int10
bli_sscalv_zen_int_10
(
BLIS_NO_CONJUGATE,
b_n,
@@ -449,7 +449,7 @@ void bli_ddotxf_zen_int_8
// simplifies to updating y.
if (bli_zero_dim1(m) || PASTEMAC(d, eq0)(*alpha))
{
bli_dscalv_zen_int10(
bli_dscalv_zen_int_10(
BLIS_NO_CONJUGATE,
b_n,
beta,
@@ -902,7 +902,7 @@ void bli_ddotxf_zen_int_4
// simplifies to updating y.
if (bli_zero_dim1(m) || PASTEMAC(d, eq0)(*alpha))
{
bli_dscalv_zen_int10(
bli_dscalv_zen_int_10(
BLIS_NO_CONJUGATE,
b_n,
beta,
@@ -1297,7 +1297,7 @@ void bli_ddotxf_zen_int_2
// simplifies to updating y.
if (bli_zero_dim1(m) || PASTEMAC(d, eq0)(*alpha))
{
bli_dscalv_zen_int10(
bli_dscalv_zen_int_10(
BLIS_NO_CONJUGATE,
b_n,
beta,

View File

@@ -897,7 +897,7 @@ void bli_dgemv_t_zen_int
return;
}
void bli_dgemv_t_zen_int_16x7m
void bli_dgemv_t_zen_int_16x7m
(
conj_t conja,
conj_t conjx,
@@ -1290,7 +1290,7 @@ void bli_dgemv_t_zen_int_16x7m
}
}
void bli_dgemv_t_zen_int_16x6m
void bli_dgemv_t_zen_int_16x6m
(
conj_t conja,
conj_t conjx,
@@ -1653,7 +1653,7 @@ void bli_dgemv_t_zen_int_16x6m
}
}
void bli_dgemv_t_zen_int_16x5m
void bli_dgemv_t_zen_int_16x5m
(
conj_t conja,
conj_t conjx,
@@ -1955,7 +1955,7 @@ void bli_dgemv_t_zen_int_16x5m
}
}
void bli_dgemv_t_zen_int_16x4m
void bli_dgemv_t_zen_int_16x4m
(
conj_t conja,
conj_t conjx,
@@ -2221,7 +2221,7 @@ void bli_dgemv_t_zen_int_16x4m
}
void bli_dgemv_t_zen_int_16x3m
void bli_dgemv_t_zen_int_16x3m
(
conj_t conja,
conj_t conjx,
@@ -2469,7 +2469,7 @@ void bli_dgemv_t_zen_int_16x3m
}
}
void bli_dgemv_t_zen_int_16x2m
void bli_dgemv_t_zen_int_16x2m
(
conj_t conja,
conj_t conjx,
@@ -2696,7 +2696,7 @@ void bli_dgemv_t_zen_int_16x2m
}
}
void bli_dgemv_t_zen_int_16x1m
void bli_dgemv_t_zen_int_16x1m
(
conj_t conja,
conj_t conjx,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -35,11 +35,11 @@
#include "blis.h"
/**
* bli_dgemv_n_avx2(...) handles cases where op(A) = NO_TRANSPOSE for Zen/2/3
* bli_dgemv_n_zen(...) handles cases where op(A) = NO_TRANSPOSE for Zen/2/3
* architectures and is based on the previous approach of using the fused
* kernels, namely AXPYF, to perform the GEMV operation.
*/
void bli_dgemv_n_avx2
void bli_dgemv_n_zen
(
trans_t transa,
conj_t conjx,
@@ -88,7 +88,7 @@ void bli_dgemv_n_avx2
b_fuse = 8;
axpyf_kr_ptr = bli_daxpyf_zen_int_8; // DAXPYF
scal2v_kr_ptr = bli_dscal2v_zen_int; // DSCAL2V
scalv_kr_ptr = bli_dscalv_zen_int10; // DSCALV
scalv_kr_ptr = bli_dscalv_zen_int_10; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen_int; // DCOPYV
/*
@@ -119,7 +119,7 @@ void bli_dgemv_n_avx2
size_t buffer_size = m0 * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_n_avx2(): get mem pool block\n" );
printf( "bli_dgemv_n_zen(): get mem pool block\n" );
#endif
/* Acquire a Buffer(m0*size(double)) from the memory broker
@@ -218,7 +218,7 @@ void bli_dgemv_n_avx2
);
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_n_avx2(): releasing mem pool block\n" );
printf( "bli_dgemv_n_zen(): releasing mem pool block\n" );
#endif
// Return the buffer to pool
bli_pba_release( &rntm , &mem_bufY );

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -265,7 +265,7 @@ void bli_zgemv_zen_int_4x4
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
bli_zscals( *alpha, alpha_chi1 );
bli_zaxpyv_zen_int5
bli_zaxpyv_zen_int_5
(
conja,
m,
@@ -483,7 +483,7 @@ void bli_cgemv_zen_int_4x4
scomplex alpha_chi1;
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
bli_cscals( *alpha, alpha_chi1 );
bli_caxpyv_zen_int5
bli_caxpyv_zen_int_5
(
conja,
m,
@@ -529,7 +529,7 @@ void bli_multi_sgemv_4x2
if (bli_zero_dim1(m) || PASTEMAC(s, eq0)(*alpha))
{
bli_sscalv_zen_int10(
bli_sscalv_zen_int_10(
BLIS_NO_CONJUGATE,
b_n,
beta,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
#define D_MR 8
#define D_NR 6
err_t bli_dgemm_8x6_avx2_k1_nn
err_t bli_dgemm_zen_int_8x6_k1_nn
(
dim_t m,
dim_t n,

View File

@@ -33,7 +33,7 @@
*/
#include "blis.h"
static dgemmsup_ker_ft kern_fp[] =
static dgemmsup_ker_ft kern_fp_zen[] =
{
bli_dgemmsup_rv_haswell_asm_6x8m,
bli_dgemmsup_rd_haswell_asm_6x8m,
@@ -45,7 +45,7 @@ static dgemmsup_ker_ft kern_fp[] =
bli_dgemmsup_rv_haswell_asm_6x8n
};
err_t bli_dgemm_tiny_6x8
err_t bli_dgemm_tiny_zen_6x8
(
conj_t conja,
conj_t conjb,
@@ -202,7 +202,7 @@ err_t bli_dgemm_tiny_6x8
*/
inc_t ps_a_use = (MR_ * rs_a);
bli_auxinfo_set_ps_a( ps_a_use, &aux );
dgemmsup_ker_ft kern_ptr = kern_fp[stor_id];
dgemmsup_ker_ft kern_ptr = kern_fp_zen[stor_id];
/**
* JC Loop is eliminated as it iterates only once, So computation

View File

@@ -35,7 +35,7 @@
#include "blis.h"
// Defining separate static arrays to hold all the kernel info, based on the datatype
static gemmtiny_ukr_info_t cgemmtiny_ukr_avx2[] =
static gemmtiny_ukr_info_t cgemmtiny_ukr_zen[] =
{
{ (void *)bli_cgemmsup_rv_zen_asm_3x8m, (void *)bli_cpackm_haswell_asm_8xk, TRUE, FALSE, 3, 4 },
{ (void *)bli_cgemmsup_rv_zen_asm_3x8m, (void *)bli_cpackm_haswell_asm_8xk, TRUE, TRUE, 3, 4 },
@@ -47,7 +47,7 @@ static gemmtiny_ukr_info_t cgemmtiny_ukr_avx2[] =
{ (void *)bli_cgemmsup_rv_zen_asm_3x8m, (void *)bli_cpackm_haswell_asm_8xk, TRUE, FALSE, 3, 4 }
};
static gemmtiny_ukr_info_t zgemmtiny_ukr_avx2[] =
static gemmtiny_ukr_info_t zgemmtiny_ukr_zen[] =
{
{ (void *)bli_zgemmsup_rv_zen_asm_3x4m, (void *)bli_zpackm_haswell_asm_4xk, TRUE, FALSE, 3, 4 },
{ (void *)bli_zgemmsup_rd_zen_asm_3x4m, (void *)bli_zpackm_haswell_asm_4xk, TRUE, FALSE, 3, 4 },
@@ -59,7 +59,7 @@ static gemmtiny_ukr_info_t zgemmtiny_ukr_avx2[] =
{ (void *)bli_zgemmsup_rv_zen_asm_3x4m, (void *)bli_zpackm_haswell_asm_4xk, TRUE, FALSE, 3, 4 }
};
// Function macro that defines the bli_?gemmtiny_avx2_ukr_info functions
// Function macro that defines the bli_?gemmtiny_ukr_zen_info functions
// These are used to acquire the kernel info at framework level
#undef GENTFUNC
#define GENTFUNC( ftype, ch, tfuncname ) \
@@ -84,5 +84,5 @@ err_t PASTEMAC( ch, tfuncname ) \
return BLIS_SUCCESS; \
} \
GENTFUNC( scomplex, c, gemmtiny_avx2_ukr_info )
GENTFUNC( dcomplex, z, gemmtiny_avx2_ukr_info )
GENTFUNC( scomplex, c, gemmtiny_ukr_zen_info )
GENTFUNC( dcomplex, z, gemmtiny_ukr_zen_info )

View File

@@ -34,9 +34,9 @@
// Macro to access the appropriate static array (that contains the kernel list),
// based on the datatype
#define TINY_GEMM_AVX2(ch) ch ## gemmtiny_ukr_avx2
#define TINY_GEMM_AVX2(ch) ch ## gemmtiny_ukr_zen
// Macro prototypes for bli_?gemmtiny_avx2_ukr_info functions
// Macro prototypes for bli_?gemmtiny_ukr_zen_info functions
// These are used to acquire the kernel info at framework level
#undef GENTFUNC
#define GENTFUNC( ftype, ch, tfuncname ) \
@@ -46,13 +46,13 @@ err_t PASTEMAC( ch, tfuncname ) \
gemmtiny_ukr_info_t *fp_info \
); \
GENTFUNC( scomplex, c, gemmtiny_avx2_ukr_info )
GENTFUNC( dcomplex, z, gemmtiny_avx2_ukr_info )
GENTFUNC( scomplex, c, gemmtiny_ukr_zen_info )
GENTFUNC( dcomplex, z, gemmtiny_ukr_zen_info )
/* Enabling the query for AVX2 kernels, based on the library's configuration */
/* Minimum requirement is 'ZEN' */
#define LOOKUP_AVX2_UKR( ch, stor_id, ukr_support, gemmtiny_ukr_info ) \
{ \
/* Call the appropriate function to query the AVX2 object info */ \
ukr_support = PASTEMAC(ch, gemmtiny_avx2_ukr_info)( stor_id, &gemmtiny_ukr_info ); \
ukr_support = PASTEMAC(ch, gemmtiny_ukr_zen_info)( stor_id, &gemmtiny_ukr_info ); \
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -56,7 +56,7 @@ typedef err_t (*trsmsmall_ker_ft)
//A.'X = B; A is upper triangular;
//A has to be transposed; double precision
BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -71,7 +71,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
*/
//AX = B; A is lower triangular; transpose; double precision
BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -85,7 +85,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
// XA = B; A is lower-triangular; No transpose;
//double precision; non-unit diagonal
BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -99,7 +99,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
//XA = B; A is lower-triangular; A is transposed;
// double precision; non-unit-diagonal
BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -111,7 +111,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
/*
* ZTRSM kernel declaration
*/
BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -120,7 +120,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
cntl_t* cntl
);
BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -129,7 +129,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
cntl_t* cntl
);
BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -138,7 +138,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
cntl_t* cntl
);
BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -149,7 +149,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
/*
* CTRSM kernel declaration
*/
BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -158,7 +158,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
cntl_t* cntl
);
BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -167,7 +167,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
cntl_t* cntl
);
BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -176,7 +176,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
cntl_t* cntl
);
BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -187,7 +187,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
/*
* STRSM kernel declaration
*/
BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_strsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -196,7 +196,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
cntl_t* cntl
);
BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_strsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -205,7 +205,7 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
cntl_t* cntl
);
BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_strsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -214,7 +214,7 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
cntl_t* cntl
);
BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_strsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -5027,7 +5027,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref
Pack a block of 8xk or 6xk from input buffer into packed buffer
directly or after transpose based on input params
*/
BLIS_INLINE void bli_dtrsm_small_pack
BLIS_INLINE void bli_dtrsm_small_zen_int_pack
(
char side,
dim_t size,
@@ -5272,7 +5272,7 @@ BLIS_INLINE void bli_dtrsm_small_pack
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
BLIS_INLINE void dtrsm_small_pack_diag_element
BLIS_INLINE void dtrsm_small_zen_int_pack_diag_element
(
bool is_unitdiag,
double *a11,
@@ -5337,47 +5337,47 @@ BLIS_INLINE void dtrsm_small_pack_diag_element
/*
* Kernels Table
*/
trsmsmall_ker_ft ker_fps[4][8] =
trsmsmall_ker_ft ker_fps_zen[4][8] =
{
{bli_strsm_small_AutXB_AlXB,
bli_strsm_small_AltXB_AuXB,
bli_strsm_small_AltXB_AuXB,
bli_strsm_small_AutXB_AlXB,
bli_strsm_small_XAutB_XAlB,
bli_strsm_small_XAltB_XAuB,
bli_strsm_small_XAltB_XAuB,
bli_strsm_small_XAutB_XAlB },
{bli_strsm_small_zen_int_AutXB_AlXB,
bli_strsm_small_zen_int_AltXB_AuXB,
bli_strsm_small_zen_int_AltXB_AuXB,
bli_strsm_small_zen_int_AutXB_AlXB,
bli_strsm_small_zen_int_XAutB_XAlB,
bli_strsm_small_zen_int_XAltB_XAuB,
bli_strsm_small_zen_int_XAltB_XAuB,
bli_strsm_small_zen_int_XAutB_XAlB },
{bli_ctrsm_small_AutXB_AlXB,
bli_ctrsm_small_AltXB_AuXB,
bli_ctrsm_small_AltXB_AuXB,
bli_ctrsm_small_AutXB_AlXB,
bli_ctrsm_small_XAutB_XAlB,
bli_ctrsm_small_XAltB_XAuB,
bli_ctrsm_small_XAltB_XAuB,
bli_ctrsm_small_XAutB_XAlB },
{bli_ctrsm_small_zen_int_AutXB_AlXB,
bli_ctrsm_small_zen_int_AltXB_AuXB,
bli_ctrsm_small_zen_int_AltXB_AuXB,
bli_ctrsm_small_zen_int_AutXB_AlXB,
bli_ctrsm_small_zen_int_XAutB_XAlB,
bli_ctrsm_small_zen_int_XAltB_XAuB,
bli_ctrsm_small_zen_int_XAltB_XAuB,
bli_ctrsm_small_zen_int_XAutB_XAlB },
{bli_dtrsm_small_AutXB_AlXB,
bli_dtrsm_small_AltXB_AuXB,
bli_dtrsm_small_AltXB_AuXB,
bli_dtrsm_small_AutXB_AlXB,
bli_dtrsm_small_XAutB_XAlB,
bli_dtrsm_small_XAltB_XAuB,
bli_dtrsm_small_XAltB_XAuB,
bli_dtrsm_small_XAutB_XAlB },
{bli_dtrsm_small_zen_int_AutXB_AlXB,
bli_dtrsm_small_zen_int_AltXB_AuXB,
bli_dtrsm_small_zen_int_AltXB_AuXB,
bli_dtrsm_small_zen_int_AutXB_AlXB,
bli_dtrsm_small_zen_int_XAutB_XAlB,
bli_dtrsm_small_zen_int_XAltB_XAuB,
bli_dtrsm_small_zen_int_XAltB_XAuB,
bli_dtrsm_small_zen_int_XAutB_XAlB },
{bli_ztrsm_small_AutXB_AlXB,
bli_ztrsm_small_AltXB_AuXB,
bli_ztrsm_small_AltXB_AuXB,
bli_ztrsm_small_AutXB_AlXB,
bli_ztrsm_small_XAutB_XAlB,
bli_ztrsm_small_XAltB_XAuB,
bli_ztrsm_small_XAltB_XAuB,
bli_ztrsm_small_XAutB_XAlB },
{bli_ztrsm_small_zen_int_AutXB_AlXB,
bli_ztrsm_small_zen_int_AltXB_AuXB,
bli_ztrsm_small_zen_int_AltXB_AuXB,
bli_ztrsm_small_zen_int_AutXB_AlXB,
bli_ztrsm_small_zen_int_XAutB_XAlB,
bli_ztrsm_small_zen_int_XAltB_XAuB,
bli_ztrsm_small_zen_int_XAltB_XAuB,
bli_ztrsm_small_zen_int_XAutB_XAlB },
};
/*
* The bli_trsm_small implements a version of TRSM where A is packed and reused
* The bli_trsm_small_zen implements a version of TRSM where A is packed and reused
*
* Input: A: MxM (triangular matrix)
* B: MxN matrix
@@ -5387,7 +5387,7 @@ trsmsmall_ker_ft ker_fps[4][8] =
*
* Note: Currently only dtrsm is supported when A & B are column-major
*/
err_t bli_trsm_small
err_t bli_trsm_small_zen
(
side_t side,
obj_t* alpha,
@@ -5461,7 +5461,7 @@ err_t bli_trsm_small
( transa & 0x1) );
trsmsmall_ker_ft ker_fp = ker_fps[dt][ keridx ];
trsmsmall_ker_ft ker_fp = ker_fps_zen[dt][ keridx ];
/*Call the kernel*/
err = ker_fp
@@ -5481,7 +5481,7 @@ err_t bli_trsm_small
* Parallelized dtrsm_small across m-dimension or n-dimension based on side(Left/Right)
*/
err_t bli_trsm_small_mt
err_t bli_trsm_small_zen_mt
(
side_t side,
obj_t* alpha,
@@ -5545,7 +5545,7 @@ err_t bli_trsm_small_mt
{
if(tid == 0)
{
bli_trsm_small
bli_trsm_small_zen
(
side,
alpha,
@@ -5607,7 +5607,7 @@ err_t bli_trsm_small_mt
// all threads
err_t status_l = BLIS_SUCCESS;
status_l = bli_trsm_small
status_l = bli_trsm_small_zen
(
side,
alpha,
@@ -8319,7 +8319,7 @@ BLIS_INLINE err_t ztrsm_AuXB_ref
}\
}
BLIS_INLINE void bli_ztrsm_small_pack
BLIS_INLINE void bli_ztrsm_small_zen_int_pack
(
char side,
dim_t size,
@@ -8465,7 +8465,7 @@ BLIS_INLINE void bli_ztrsm_small_pack
}
BLIS_INLINE void ztrsm_small_pack_diag_element
BLIS_INLINE void ztrsm_small_zen_pack_diag_element
(
bool is_unitdiag,
dcomplex *a11,
@@ -8545,7 +8545,7 @@ b11 * * * * * **a01 * * a11
*/
BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -8661,7 +8661,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all m cols of B matrix
*/
bli_dtrsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
bli_dtrsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
/*
Pack 6 diagonal elements of A block into an array
@@ -8669,12 +8669,12 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
b. store ones when input is unit diagonal
*/
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
}
else
{
bli_dtrsm_small_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
bli_dtrsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
}
/*
@@ -10954,7 +10954,7 @@ b10 ***************** *************
***************** *******************
*/
BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -11070,19 +11070,19 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all m cols of B matrix
*/
bli_dtrsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
bli_dtrsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
/*
Pack 6 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
}
else
{
bli_dtrsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
bli_dtrsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
}
/*
@@ -13283,7 +13283,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
* A is lower-triangular, transpose, non-unit diagonal
* dimensions A: mxm X: mxn B: mxn
*/
BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -13402,19 +13402,19 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_dtrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
bli_dtrsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
/*
Pack 8 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_dtrsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_dtrsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
@@ -15296,7 +15296,7 @@ a10 ****** b11 *****************
**************** *****************
a11--->
*/
BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -15410,19 +15410,19 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_dtrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
bli_dtrsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
/*
Pack 8 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_dtrsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_dtrsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
@@ -17361,7 +17361,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
BLIS_INLINE void strsm_small_pack_diag_element
BLIS_INLINE void strsm_small_zen_int_pack_diag_element
(
char side,
bool is_unitdiag,
@@ -17477,7 +17477,7 @@ BLIS_INLINE void strsm_small_pack_diag_element
Pack a block of 16xk or 6xk from input buffer into packed buffer
directly or after transpose based on input params
*/
BLIS_INLINE void bli_strsm_small_pack
BLIS_INLINE void bli_strsm_small_zen_int_pack
(
char side,
dim_t size,
@@ -17903,7 +17903,7 @@ b10 ***************** *************
***************** *******************
*/
BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_strsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -18015,19 +18015,19 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all m cols of B matrix
*/
bli_strsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
bli_strsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
/*
Pack 6 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
strsm_small_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
}
else
{
bli_strsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
strsm_small_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
bli_strsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
}
/*
@@ -21572,7 +21572,7 @@ b11 * * * * * **a01 * * a11
*/
BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_strsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -21686,7 +21686,7 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all m cols of B matrix
*/
bli_strsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
bli_strsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
/*
Pack 6 diagonal elements of A block into an array
@@ -21694,12 +21694,12 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
b. store ones when input is unit diagonal
*/
strsm_small_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
}
else
{
bli_strsm_small_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
strsm_small_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
bli_strsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
}
/*
@@ -25419,7 +25419,7 @@ a10 ****** b11 *****************
**************** *****************
a11--->
*/
BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_strsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -25537,19 +25537,19 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_strsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
bli_strsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
/*
Pack 16 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_strsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
strsm_small_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_strsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
@@ -29799,7 +29799,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
* A is lower-triangular, transpose, non-unit diagonal
* dimensions A: mxm X: mxn B: mxn
*/
BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_strsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -29921,19 +29921,19 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_strsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
bli_strsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
/*
Pack 8 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_strsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
strsm_small_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_strsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
@@ -33952,7 +33952,7 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
return BLIS_SUCCESS;
}
BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -34071,19 +34071,19 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_ztrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
bli_ztrsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
/*
Pack 4 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_ztrsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_ztrsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
a. Perform GEMM using a10, b01.
@@ -35194,7 +35194,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
return BLIS_SUCCESS;
}
BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -35317,19 +35317,19 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_ztrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
bli_ztrsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
/*
Pack 8 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_ztrsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_ztrsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
@@ -36426,7 +36426,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
return BLIS_SUCCESS;
}
BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -36534,7 +36534,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
b. This packed buffer is reused to calculate all m cols of B
matrix
*/
bli_ztrsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack,
bli_ztrsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack,
p_lda,d_nr);
/*
@@ -36543,14 +36543,14 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
operation
b. store ones when input is unit diagonal
*/
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
d11_pack,d_nr);
}
else
{
bli_ztrsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack,
bli_ztrsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack,
p_lda,d_nr);
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
d11_pack,d_nr);
}
@@ -37327,12 +37327,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
{
if(transa)
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
d11_pack,n_remainder);
}
else
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
d11_pack,n_remainder);
}
}
@@ -37732,12 +37732,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
{
if(transa)
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
d11_pack,n_remainder);
}
else
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
d11_pack,n_remainder);
}
}
@@ -37893,7 +37893,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
return BLIS_SUCCESS;
}
BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -38000,7 +38000,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
b. This packed buffer is reused to calculate all m cols of
B matrix
*/
bli_ztrsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
bli_ztrsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
/*
Pack 3 diagonal elements of A block into an array
@@ -38008,14 +38008,14 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
operation
b. store ones when input is unit diagonal
*/
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
d11_pack,d_nr);
}
else
{
bli_ztrsm_small_pack('R', j, 0, a01, rs_a, D_A_pack,
bli_ztrsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack,
p_lda,d_nr);
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
d11_pack,d_nr);
}
@@ -38762,12 +38762,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
{
if(transa)
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
d11_pack,n_remainder);
}
else
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
d11_pack,n_remainder);
}
}
@@ -39164,12 +39164,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
{
if(transa)
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
d11_pack,n_remainder);
}
else
{
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
d11_pack,n_remainder);
}
}
@@ -39583,7 +39583,7 @@ BLIS_INLINE err_t ctrsm_AlXB_ref
return BLIS_SUCCESS;
}
BLIS_INLINE void bli_ctrsm_small_pack
BLIS_INLINE void bli_ctrsm_small_zen_int_pack
(
char side,
dim_t size,
@@ -39768,7 +39768,7 @@ BLIS_INLINE void bli_ctrsm_small_pack
}
}
BLIS_INLINE void ctrsm_small_pack_diag_element
BLIS_INLINE void ctrsm_small_zen_pack_diag_element
(
bool is_unitdiag,
scomplex *a11,
@@ -42491,7 +42491,7 @@ BLIS_INLINE void ctrsm_small_pack_diag_element
_mm256_storeu_ps((float *)(b11 + cs_b * 2 + 4), ymm2);\
}
BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AutXB_AlXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -42615,19 +42615,19 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_ctrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
bli_ctrsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
/*
Pack 4 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_ctrsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_ctrsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
a. Perform GEMM using a10, b01.
@@ -44131,11 +44131,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
{
if(transa)
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,m_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,m_rem);
}
else
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,m_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,m_rem);
}
}
@@ -45028,7 +45028,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
return BLIS_SUCCESS;
}
BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AltXB_AuXB
(
obj_t* AlphaObj,
obj_t* a,
@@ -45155,19 +45155,19 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_ctrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack, p_lda,d_mr);
bli_ctrsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack, p_lda,d_mr);
/*
Pack 4 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
}
else
{
bli_ctrsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack, p_lda,d_mr);
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
bli_ctrsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack, p_lda,d_mr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
}
/*
a. Perform GEMM using a10, b01.
@@ -46880,11 +46880,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
{
if(transa)
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,4);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,4);
}
else
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,4);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,4);
}
}
@@ -47815,7 +47815,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
return BLIS_SUCCESS;
}
BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAutB_XAlB
(
obj_t* AlphaObj,
obj_t* a,
@@ -47934,19 +47934,19 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_ctrsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
bli_ctrsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
/*
Pack 4 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
}
else
{
bli_ctrsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
bli_ctrsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
}
/*
a. Perform GEMM using a10, b01.
@@ -48684,11 +48684,11 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
{
if(transa)
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
}
else
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
}
}
@@ -49202,7 +49202,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
if(!is_unitdiag)
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
}
for(i = (m-d_mr); (i+1) > 0; i -= d_mr)
@@ -49438,7 +49438,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
return BLIS_SUCCESS;
}
BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAltB_XAuB
(
obj_t* AlphaObj,
obj_t* a,
@@ -49558,19 +49558,19 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
in A
b. This packed buffer is reused to calculate all n rows of B matrix
*/
bli_ctrsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
bli_ctrsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
/*
Pack 4 diagonal elements of A block into an array
a. This helps to utilize cache line efficiently in TRSM operation
b. store ones when input is unit diagonal
*/
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
}
else
{
bli_ctrsm_small_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
bli_ctrsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
}
/*
a. Perform GEMM using a10, b01.
@@ -50314,11 +50314,11 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
{
if(transa)
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
}
else
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
}
}
@@ -50843,7 +50843,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
if(!is_unitdiag)
{
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
}
for(i = 0; (i+d_mr-1) < m; i += d_mr)

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -90,7 +90,7 @@
with k == 1. It expects the inputs and output to support the column-major storage
scheme, without any requirement to conjugate/transpose any of the operands. */
err_t bli_zgemm_4x4_avx2_k1_nn
err_t bli_zgemm_zen_int_4x4_k1_nn
(
dim_t m,
dim_t n,

View File

@@ -34,7 +34,7 @@
*/
// Including the header for tiny gemm kernel signatures
#include "bli_gemm_tiny_avx2.h"
#include "bli_gemm_tiny_zen.h"
// -- level-1m --
// Removed - reference packm kernels are used
@@ -59,28 +59,28 @@ AXPBYV_KER_PROT( scomplex, c, axpbyv_zen_int )
AXPBYV_KER_PROT( dcomplex, z, axpbyv_zen_int )
// axpbyv (intrinsics, unrolled x10)
AXPBYV_KER_PROT( float, s, axpbyv_zen_int10 )
AXPBYV_KER_PROT( double, d, axpbyv_zen_int10 )
AXPBYV_KER_PROT( float, s, axpbyv_zen_int_10 )
AXPBYV_KER_PROT( double, d, axpbyv_zen_int_10 )
// axpyv (intrinsics)
AXPYV_KER_PROT( float, s, axpyv_zen_int )
AXPYV_KER_PROT( double, d, axpyv_zen_int )
// axpyv (intrinsics unrolled x10)
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
BLIS_EXPORT_BLIS AXPYV_KER_PROT( double, d, axpyv_zen_int10 )
AXPYV_KER_PROT( scomplex, c, axpyv_zen_int5 )
AXPYV_KER_PROT( dcomplex, z, axpyv_zen_int5 )
AXPYV_KER_PROT( float, s, axpyv_zen_int_10 )
BLIS_EXPORT_BLIS AXPYV_KER_PROT( double, d, axpyv_zen_int_10 )
AXPYV_KER_PROT( scomplex, c, axpyv_zen_int_5 )
AXPYV_KER_PROT( dcomplex, z, axpyv_zen_int_5 )
// dotv (intrinsics)
DOTV_KER_PROT( float, s, dotv_zen_int )
DOTV_KER_PROT( double, d, dotv_zen_int )
// dotv (intrinsics, unrolled x10)
DOTV_KER_PROT( float, s, dotv_zen_int10 )
DOTV_KER_PROT( double, d, dotv_zen_int10 )
DOTV_KER_PROT( scomplex, c, dotv_zen_int5 )
DOTV_KER_PROT( dcomplex, z, dotv_zen_int5 )
DOTV_KER_PROT( float, s, dotv_zen_int_10 )
DOTV_KER_PROT( double, d, dotv_zen_int_10 )
DOTV_KER_PROT( scomplex, c, dotv_zen_int_5 )
DOTV_KER_PROT( dcomplex, z, dotv_zen_int_5 )
// dotxv (intrinsics)
DOTXV_KER_PROT( float, s, dotxv_zen_int )
@@ -95,13 +95,13 @@ SCALV_KER_PROT( scomplex, c, scalv_zen_int )
SCALV_KER_PROT( dcomplex, z, scalv_zen_int )
// scalv (intrinsics unrolled x10)
SCALV_KER_PROT( float, s, scalv_zen_int10 )
BLIS_EXPORT_BLIS SCALV_KER_PROT( double, d, scalv_zen_int10 )
SCALV_KER_PROT( dcomplex, z, dscalv_zen_int10 )
SCALV_KER_PROT( float, s, scalv_zen_int_10 )
BLIS_EXPORT_BLIS SCALV_KER_PROT( double, d, scalv_zen_int_10 )
SCALV_KER_PROT( dcomplex, z, dscalv_zen_int_10 )
// swapv (intrinsics)
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
BLIS_EXPORT_BLIS SWAPV_KER_PROT(double, d, swapv_zen_int8 )
SWAPV_KER_PROT(float, s, swapv_zen_int_8 )
BLIS_EXPORT_BLIS SWAPV_KER_PROT(double, d, swapv_zen_int_8 )
// copyv (intrinsics)
COPYV_KER_PROT( float, s, copyv_zen_int )
@@ -328,7 +328,7 @@ err_t bli_dgemm_tiny
double* c, const inc_t rs_c0, const inc_t cs_c0
);
err_t bli_dgemm_tiny_6x8
err_t bli_dgemm_tiny_zen_6x8
(
conj_t conja,
conj_t conjb,
@@ -388,7 +388,7 @@ err_t bli_zgemm_small_At
cntl_t* cntl
);
err_t bli_dgemm_8x6_avx2_k1_nn
err_t bli_dgemm_zen_int_8x6_k1_nn
(
dim_t m,
dim_t n,
@@ -400,7 +400,7 @@ err_t bli_dgemm_8x6_avx2_k1_nn
double* c, const inc_t ldc
);
err_t bli_zgemm_4x4_avx2_k1_nn
err_t bli_zgemm_zen_int_4x4_k1_nn
(
dim_t m,
dim_t n,
@@ -412,7 +412,7 @@ err_t bli_zgemm_4x4_avx2_k1_nn
dcomplex* c, const inc_t ldc
);
err_t bli_trsm_small
err_t bli_trsm_small_zen
(
side_t side,
obj_t* alpha,
@@ -424,7 +424,7 @@ err_t bli_trsm_small
);
#ifdef BLIS_ENABLE_OPENMP
err_t bli_trsm_small_mt
err_t bli_trsm_small_zen_mt
(
side_t side,
obj_t* alpha,
@@ -480,7 +480,7 @@ bool bli_cntx_trsm_small_thresh_is_met_zen
dim_t n
);
void bli_snorm2fv_unb_var1_avx2
void bli_snorm2fv_zen_int_unb_var1
(
dim_t n,
float* x, inc_t incx,
@@ -488,7 +488,7 @@ void bli_snorm2fv_unb_var1_avx2
cntx_t* cntx
);
void bli_dnorm2fv_unb_var1_avx2
void bli_dnorm2fv_zen_int_unb_var1
(
dim_t n,
double* x, inc_t incx,
@@ -496,7 +496,7 @@ void bli_dnorm2fv_unb_var1_avx2
cntx_t* cntx
);
void bli_scnorm2fv_unb_var1_avx2
void bli_scnorm2fv_zen_int_unb_var1
(
dim_t n,
scomplex* x, inc_t incx,
@@ -504,7 +504,7 @@ void bli_scnorm2fv_unb_var1_avx2
cntx_t* cntx
);
void bli_dznorm2fv_unb_var1_avx2
void bli_dznorm2fv_zen_int_unb_var1
(
dim_t n,
dcomplex* x, inc_t incx,
@@ -543,7 +543,7 @@ void bli_sgemv_zen_ref
cntx_t* restrict cntx
);
void bli_dgemv_n_avx2
void bli_dgemv_n_zen
(
trans_t transa,
conj_t conjx,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -35,7 +35,7 @@
#include "immintrin.h"
#include "blis.h"
void bli_daddv_zen_int_avx512
void bli_daddv_zen4_int
(
conj_t conjx,
dim_t n,

View File

@@ -139,7 +139,7 @@ typedef union
);
// ----------------------------------------------------------------------------
void bli_samaxv_zen_int_avx512(
void bli_samaxv_zen4_int(
dim_t n,
float *restrict x, inc_t incx,
dim_t *restrict i_max,
@@ -443,7 +443,7 @@ void bli_samaxv_zen_int_avx512(
}
/*----------------------------------------------------------------------------------------------------*/
BLIS_EXPORT_BLIS void bli_damaxv_zen_int_avx512
BLIS_EXPORT_BLIS void bli_damaxv_zen4_int
(
dim_t n,
double *restrict x, inc_t incx,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -49,7 +49,7 @@ typedef union
* x & y are double precision vectors of length n.
* alpha & beta are scalars.
*/
void bli_daxpbyv_zen_int_avx512
void bli_daxpbyv_zen4_int
(
conj_t conjx,
dim_t n,
@@ -72,7 +72,7 @@ void bli_daxpbyv_zen_int_avx512
// When beta = !( 0 or 1 ) --> DSCALV
if ( bli_deq0( *alpha ) )
{
bli_dscalv_zen_int10
bli_dscalv_zen_int_10
(
BLIS_NO_CONJUGATE,
n,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2023 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -76,7 +76,7 @@
The expectation is that these are standard BLAS exceptions and should be handled in
a higher layer
*/
void bli_saxpyv_zen_int_avx512
void bli_saxpyv_zen4_int
(
conj_t conjx,
dim_t n,
@@ -282,7 +282,7 @@ void bli_saxpyv_zen_int_avx512
The expectation is that these are standard BLAS exceptions and should be handled in
a higher layer
*/
BLIS_EXPORT_BLIS void bli_daxpyv_zen_int_avx512
BLIS_EXPORT_BLIS void bli_daxpyv_zen4_int
(
conj_t conjx,
dim_t n,
@@ -487,7 +487,7 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int_avx512
The expectation is that these are standard BLAS exceptions and should be handled in
a higher layer
*/
void bli_zaxpyv_zen_int_avx512
void bli_zaxpyv_zen4_int
(
conj_t conjx,
dim_t n,

View File

@@ -77,7 +77,7 @@
a higher layer
*/
void bli_scopyv_zen4_asm_avx512
void bli_scopyv_zen4_asm
(
conj_t conjx,
dim_t n,
@@ -405,7 +405,7 @@ void bli_scopyv_zen4_asm_avx512
*/
// This function is used to copy the vector x to vector y using AVX512 instructions
void bli_dcopyv_zen4_asm_avx512
void bli_dcopyv_zen4_asm
(
conj_t conjx,
dim_t n,
@@ -691,7 +691,7 @@ void bli_dcopyv_zen4_asm_avx512
}
// This function is used to copy the vector x to vector y using AVX512 instructions in a two directional way
void bli_dcopyv_zen4_asm_avx512_biway
void bli_dcopyv_zen4_asm_biway
(
conj_t conjx,
dim_t n,
@@ -1043,7 +1043,7 @@ void bli_dcopyv_zen4_asm_avx512_biway
a higher layer
*/
void bli_zcopyv_zen4_asm_avx512
void bli_zcopyv_zen4_asm
(
conj_t conjx,
dim_t n,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -75,7 +75,7 @@
a higher layer
*/
void bli_scopyv_zen_int_avx512
void bli_scopyv_zen4_int
(
conj_t conjx,
dim_t n,
@@ -388,7 +388,7 @@ void bli_scopyv_zen_int_avx512
a higher layer
*/
void bli_dcopyv_zen_int_avx512
void bli_dcopyv_zen4_int
(
conj_t conjx,
dim_t n,
@@ -700,7 +700,7 @@ void bli_dcopyv_zen_int_avx512
a higher layer
*/
void bli_zcopyv_zen_int_avx512
void bli_zcopyv_zen4_int
(
conj_t conjx,
dim_t n,

Some files were not shown because too many files have changed in this diff Show More