mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Standardize Zen kernel names
The naming of Zen kernels and their associated files was inconsistent with the BLIS conventions used for other sub-configurations, and was also inconsistent between different Zen generations. Other anomalies existed as well, e.g. the column-preferred dgemmsup 24x kernels were named with _rv_ instead of _cv_. This patch renames the kernels and their files to address these issues. AMD-Internal: [CPUPL-6579]
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -118,8 +118,8 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
@@ -134,8 +134,8 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -89,8 +89,8 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
@@ -103,8 +103,8 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2023 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -82,8 +82,8 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
@@ -96,8 +96,8 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
@@ -133,23 +133,23 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
30,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -114,22 +114,22 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
|
||||
// axpbyv
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
|
||||
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_5,
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_5,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
@@ -138,14 +138,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int,
|
||||
|
||||
// swapv
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
|
||||
|
||||
// copyv
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -129,22 +129,22 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
|
||||
// axpbyv
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
|
||||
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_5,
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_10,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_5,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
@@ -153,14 +153,14 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int,
|
||||
|
||||
// swapv
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
|
||||
|
||||
// copyv
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -132,22 +132,22 @@ void bli_cntx_init_zen3( cntx_t* cntx )
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
|
||||
// axpbyv
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
|
||||
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_10,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_5,
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_10,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_5,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
@@ -156,14 +156,14 @@ void bli_cntx_init_zen3( cntx_t* cntx )
|
||||
BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_10,
|
||||
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int,
|
||||
|
||||
// swapv
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
|
||||
|
||||
// copyv
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
|
||||
@@ -80,14 +80,14 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
13,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_skx_asm_32x12_l2, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen4_asm_24x4, FALSE,
|
||||
/*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4, FALSE,
|
||||
|
||||
// Different GEMM kernels are used for TRSM for zen4 architecture
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_4x12, TRUE,
|
||||
|
||||
// gemmtrsm_l
|
||||
@@ -156,29 +156,29 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
(
|
||||
32,
|
||||
// addv
|
||||
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen_int_avx512,
|
||||
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen4_int,
|
||||
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen4_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen4_int,
|
||||
|
||||
// axpbyv
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_avx512,
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen4_int,
|
||||
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
|
||||
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_avx512,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_avx512,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_avx512,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen4_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen4_int,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen4_int,
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_avx512,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_avx512,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_avx512,
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen4_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen4_int,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen4_int,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
@@ -186,27 +186,27 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen4_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen4_int,
|
||||
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen4_int,
|
||||
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen4_int,
|
||||
|
||||
// swapv
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
|
||||
|
||||
// copyv
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm_avx512,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen4_asm_avx512,
|
||||
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm_avx512,
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen4_asm,
|
||||
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm,
|
||||
|
||||
// setv
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int_avx512,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int_avx512,
|
||||
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen_int_avx512,
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen4_int,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen4_int,
|
||||
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen4_int,
|
||||
|
||||
// scal2v
|
||||
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen_int_avx512,
|
||||
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen4_int,
|
||||
BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int,
|
||||
cntx
|
||||
);
|
||||
@@ -299,23 +299,23 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
32,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m_new, FALSE,
|
||||
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
|
||||
BLIS_RRC, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
|
||||
@@ -397,14 +397,14 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen4_asm_24x8m, FALSE,
|
||||
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
|
||||
@@ -82,14 +82,14 @@ void bli_cntx_init_zen5( cntx_t* cntx )
|
||||
13,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_skx_asm_32x12_l2, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen4_asm_24x4, FALSE,
|
||||
/*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4, FALSE,
|
||||
|
||||
// Different GEMM kernels are used for TRSM for zen4 architecture
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_avx512_asm_8x24, TRUE,
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DOUBLE, bli_dgemm_zen4_asm_8x24, TRUE,
|
||||
BLIS_GEMM_FOR_TRSM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen4_asm_4x12, TRUE,
|
||||
|
||||
// gemmtrsm_l
|
||||
@@ -158,29 +158,29 @@ void bli_cntx_init_zen5( cntx_t* cntx )
|
||||
(
|
||||
32,
|
||||
// addv
|
||||
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen_int_avx512,
|
||||
BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_zen4_int,
|
||||
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen4_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen4_int,
|
||||
|
||||
// axpbyv
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen_int_avx512,
|
||||
BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_zen_int_10,
|
||||
BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_zen4_int,
|
||||
BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_zen_int,
|
||||
BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int_avx512,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int_avx512,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int_avx512,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen4_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen4_int,
|
||||
BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int_5,
|
||||
BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen4_int,
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int_avx512,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int_avx512,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int_avx512,
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen4_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen4_int,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int_5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen4_int,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
@@ -188,27 +188,27 @@ void bli_cntx_init_zen5( cntx_t* cntx )
|
||||
BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen_int_avx512,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen4_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen4_int,
|
||||
BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_zen4_int,
|
||||
BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_zen4_int,
|
||||
|
||||
// swapv
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int_8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int_8,
|
||||
|
||||
// copyv
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm_avx512,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen5_asm_avx512,
|
||||
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm_avx512,
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen4_asm,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen5_asm,
|
||||
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen4_asm,
|
||||
|
||||
// setv
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int_avx512,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int_avx512,
|
||||
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen_int_avx512,
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen4_int,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen4_int,
|
||||
BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_zen4_int,
|
||||
|
||||
// scal2v
|
||||
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen_int_avx512,
|
||||
BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_zen4_int,
|
||||
BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_zen_int,
|
||||
cntx
|
||||
);
|
||||
@@ -301,23 +301,23 @@ void bli_cntx_init_zen5( cntx_t* cntx )
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
32,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64m, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen4_asm_6x64n, TRUE,
|
||||
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
|
||||
BLIS_RRC, BLIS_SCOMPLEX, bli_cgemmsup_cv_zen4_asm_24x4m, FALSE,
|
||||
@@ -398,14 +398,14 @@ void bli_cntx_init_zen5( cntx_t* cntx )
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_cv_zen5_asm_24x8m, FALSE,
|
||||
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
|
||||
@@ -307,8 +307,8 @@ void bli_dgemv_unf_var1
|
||||
case BLIS_ARCH_ZEN5:
|
||||
#if defined(BLIS_KERNELS_ZEN5)
|
||||
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
|
||||
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
|
||||
copyv_kr_ptr = bli_dcopyv_zen5_asm_avx512; // DCOPYV
|
||||
scalv_kr_ptr = bli_dscalv_zen4_int; // DSCALV
|
||||
copyv_kr_ptr = bli_dcopyv_zen5_asm; // DCOPYV
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 12000;
|
||||
#endif
|
||||
@@ -318,8 +318,8 @@ void bli_dgemv_unf_var1
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
|
||||
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
|
||||
copyv_kr_ptr = bli_dcopyv_zen4_asm_avx512; // DCOPYV
|
||||
scalv_kr_ptr = bli_dscalv_zen4_int; // DSCALV
|
||||
copyv_kr_ptr = bli_dcopyv_zen4_asm; // DCOPYV
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 11000;
|
||||
#endif
|
||||
@@ -925,7 +925,7 @@ void bli_zgemv_unf_var1
|
||||
factor of DOTXF kernel
|
||||
*/
|
||||
|
||||
dotxf_kr_ptr = bli_zdotxf_zen_int_8_avx512;
|
||||
dotxf_kr_ptr = bli_zdotxf_zen4_int_8;
|
||||
b_fuse = 8;
|
||||
|
||||
scal2v_kr_ptr = bli_zscal2v_zen_int;
|
||||
|
||||
@@ -330,7 +330,7 @@ void bli_dgemv_unf_var2 (
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
bli_dgemv_n_avx2(
|
||||
bli_dgemv_n_zen(
|
||||
transa,
|
||||
conjx,
|
||||
m,
|
||||
@@ -630,7 +630,7 @@ void bli_sgemv_unf_var2
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is handled by scalv internally */
|
||||
bli_sscalv_zen_int10
|
||||
bli_sscalv_zen_int_10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
@@ -736,7 +736,7 @@ void bli_zgemv_unf_var2
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
axpyf_kr_ptr = bli_zaxpyf_zen_int_8_avx512;
|
||||
axpyf_kr_ptr = bli_zaxpyf_zen4_int_8;
|
||||
b_fuse = 8;
|
||||
|
||||
scal2v_kr_ptr = bli_zscal2v_zen_int;
|
||||
@@ -745,7 +745,7 @@ void bli_zgemv_unf_var2
|
||||
|
||||
copyv_kr_ptr = bli_zcopyv_zen_int;
|
||||
|
||||
setv_kr_ptr = bli_zsetv_zen_int_avx512;
|
||||
setv_kr_ptr = bli_zsetv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -308,7 +308,7 @@ void bli_dtrsv_unf_var1
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
kfp_df = bli_ddotxf_zen_int_avx512;
|
||||
kfp_df = bli_ddotxf_zen4_int;
|
||||
b_fuse = 8;
|
||||
break;
|
||||
#endif
|
||||
|
||||
@@ -313,12 +313,12 @@ void bli_dtrsv_unf_var2
|
||||
{
|
||||
if ( m < 2500 )
|
||||
{
|
||||
kfp_af = bli_daxpyf_zen_int8_avx512;
|
||||
kfp_af = bli_daxpyf_zen4_int_8;
|
||||
b_fuse = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
kfp_af = bli_daxpyf_zen_int12_avx512;
|
||||
kfp_af = bli_daxpyf_zen4_int_12;
|
||||
b_fuse = 12;
|
||||
}
|
||||
#if defined(BLIS_ENABLE_OPENMP)
|
||||
@@ -331,7 +331,7 @@ void bli_dtrsv_unf_var2
|
||||
// If NT == 1, don't use MT kernel.
|
||||
if ( n_threads > 1 )
|
||||
{
|
||||
kfp_af = bli_daxpyf_zen_int32_avx512_mt;
|
||||
kfp_af = bli_daxpyf_zen4_int_32_mt;
|
||||
b_fuse = 32;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -189,7 +189,7 @@ void bli_gemm_ker_var2
|
||||
( bli_obj_is_real( b ) ) // check if B is real
|
||||
)
|
||||
{
|
||||
bli_dgemm_avx512_asm_8x24_macro_kernel
|
||||
bli_dgemm_zen4_asm_8x24_macro_kernel
|
||||
(
|
||||
n, m, k, buf_c, buf_a, buf_b, rs_c, buf_beta
|
||||
);
|
||||
|
||||
@@ -410,7 +410,7 @@ err_t bli_dgemm_tiny
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_FAMILY_ZEN5) || defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
|
||||
return bli_dgemm_tiny_24x8
|
||||
return bli_dgemm_tiny_zen4_24x8
|
||||
(
|
||||
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
|
||||
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
|
||||
@@ -431,7 +431,7 @@ err_t bli_dgemm_tiny
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
return bli_dgemm_tiny_6x8
|
||||
return bli_dgemm_tiny_zen_6x8
|
||||
(
|
||||
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
|
||||
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
|
||||
@@ -466,7 +466,7 @@ err_t bli_dgemm_tiny
|
||||
((m + k-n) < 1500) && ((n + k-m) < 1500) ) ||
|
||||
((n <= 100) && (k <=100)))))
|
||||
{
|
||||
return bli_dgemm_tiny_24x8
|
||||
return bli_dgemm_tiny_zen4_24x8
|
||||
(
|
||||
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
|
||||
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
|
||||
@@ -490,7 +490,7 @@ err_t bli_dgemm_tiny
|
||||
case BLIS_ARCH_ZEN3:
|
||||
if( ( (m <= 8) || ( (m <= 1000) && (n <= 24) && (k >= 4) ) ) && (k <= 1500) )
|
||||
{
|
||||
return bli_dgemm_tiny_6x8
|
||||
return bli_dgemm_tiny_zen_6x8
|
||||
(
|
||||
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
|
||||
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -81,17 +81,17 @@ typedef void (*gemmt_ker_ft)
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
gemmt_ker_ft ker_fpus_zen4[3] =
|
||||
{
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_upper_0,
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_upper_1,
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_upper_2
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_upper_0,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_upper_1,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_upper_2
|
||||
};
|
||||
|
||||
//Look-up table for Gemmt Lower Variant Kernels
|
||||
gemmt_ker_ft ker_fpls_zen4[3] =
|
||||
{
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_lower_0,
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_lower_1,
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_lower_2
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_lower_0,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_lower_1,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_lower_2
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -58,7 +58,7 @@ void bli_trsm_front
|
||||
|
||||
#if 0
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl );
|
||||
gint_t status = bli_trsm_small_zen( side, alpha, a, b, cntx, cntl );
|
||||
if ( status == BLIS_SUCCESS ) return;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -45,7 +45,7 @@ void bli_trsm_front
|
||||
);
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
err_t bli_trsm_small
|
||||
err_t bli_trsm_small_zen
|
||||
(
|
||||
side_t side,
|
||||
obj_t* alpha,
|
||||
|
||||
@@ -319,7 +319,7 @@ f77_int idamax_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
amaxv_fun_ptr = bli_damaxv_zen_int_avx512;
|
||||
amaxv_fun_ptr = bli_damaxv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
|
||||
@@ -215,14 +215,14 @@ void saxpy_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
axpyv_ker_ptr = bli_saxpyv_zen_int_avx512;
|
||||
axpyv_ker_ptr = bli_saxpyv_zen4_int;
|
||||
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
axpyv_ker_ptr = bli_saxpyv_zen_int10;
|
||||
axpyv_ker_ptr = bli_saxpyv_zen_int_10;
|
||||
|
||||
break;
|
||||
default:
|
||||
@@ -354,7 +354,7 @@ void daxpy_blis_impl
|
||||
{
|
||||
case BLIS_ARCH_ZEN5:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
axpyv_ker_ptr = bli_daxpyv_zen_int_avx512;
|
||||
axpyv_ker_ptr = bli_daxpyv_zen4_int;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 34000;
|
||||
#endif
|
||||
@@ -362,7 +362,7 @@ void daxpy_blis_impl
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
axpyv_ker_ptr = bli_daxpyv_zen_int_avx512;
|
||||
axpyv_ker_ptr = bli_daxpyv_zen4_int;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 11000;
|
||||
#endif
|
||||
@@ -373,7 +373,7 @@ void daxpy_blis_impl
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
axpyv_ker_ptr = bli_daxpyv_zen_int10;
|
||||
axpyv_ker_ptr = bli_daxpyv_zen_int_10;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 4000;
|
||||
#endif
|
||||
@@ -590,7 +590,7 @@ void caxpy_blis_impl
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
bli_caxpyv_zen_int5
|
||||
bli_caxpyv_zen_int_5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
@@ -722,7 +722,7 @@ void zaxpy_blis_impl
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
axpyv_ker_ptr = bli_zaxpyv_zen_int_avx512;
|
||||
axpyv_ker_ptr = bli_zaxpyv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
@@ -730,7 +730,7 @@ void zaxpy_blis_impl
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
axpyv_ker_ptr = bli_zaxpyv_zen_int5;
|
||||
axpyv_ker_ptr = bli_zaxpyv_zen_int_5;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
@@ -181,7 +181,7 @@ void scopy_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
copyv_ker_ptr = bli_scopyv_zen4_asm_avx512;
|
||||
copyv_ker_ptr = bli_scopyv_zen4_asm;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
@@ -311,7 +311,7 @@ void dcopy_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
#if defined(BLIS_KERNELS_ZEN5)
|
||||
// For Zen4 and Zen5, kernel implemented in AVX512 is used
|
||||
copyv_ker_ptr = bli_dcopyv_zen5_asm_avx512;
|
||||
copyv_ker_ptr = bli_dcopyv_zen5_asm;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 43000;
|
||||
#endif
|
||||
@@ -320,7 +320,7 @@ void dcopy_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// For Zen4 and Zen5, kernel implemented in AVX512 is used
|
||||
copyv_ker_ptr = bli_dcopyv_zen4_asm_avx512;
|
||||
copyv_ker_ptr = bli_dcopyv_zen4_asm;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 3300;
|
||||
#endif
|
||||
@@ -558,7 +558,7 @@ void zcopy_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// For Zen4 and Zen5 architecture, kernel implemented in AVX512 is used
|
||||
copyv_ker_ptr = bli_zcopyv_zen4_asm_avx512;
|
||||
copyv_ker_ptr = bli_zcopyv_zen4_asm;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
|
||||
@@ -206,7 +206,7 @@ float sdot_blis_impl
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
|
||||
// AVX-512 Kernel
|
||||
dotv_ker_ptr = bli_sdotv_zen_int_avx512;
|
||||
dotv_ker_ptr = bli_sdotv_zen4_int;
|
||||
|
||||
break;
|
||||
#endif
|
||||
@@ -215,7 +215,7 @@ float sdot_blis_impl
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX-2 Kernel
|
||||
dotv_ker_ptr = bli_sdotv_zen_int10;
|
||||
dotv_ker_ptr = bli_sdotv_zen_int_10;
|
||||
|
||||
break;
|
||||
default:
|
||||
@@ -347,7 +347,7 @@ double ddot_blis_impl
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN5)
|
||||
// AVX-512 Kernel
|
||||
dotv_ker_ptr = bli_ddotv_zen_int_avx512;
|
||||
dotv_ker_ptr = bli_ddotv_zen4_int;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 6600;
|
||||
#endif
|
||||
@@ -358,7 +358,7 @@ double ddot_blis_impl
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX-512 Kernel
|
||||
dotv_ker_ptr = bli_ddotv_zen_int_avx512;
|
||||
dotv_ker_ptr = bli_ddotv_zen4_int;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 5600;
|
||||
#endif
|
||||
@@ -370,7 +370,7 @@ double ddot_blis_impl
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
dotv_ker_ptr = bli_ddotv_zen_int10;
|
||||
dotv_ker_ptr = bli_ddotv_zen_int_10;
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 2500;
|
||||
#endif
|
||||
@@ -686,7 +686,7 @@ scomplex cdotu_blis_impl
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
bli_cdotv_zen_int_5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
@@ -807,14 +807,14 @@ dcomplex zdotu_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
zdotv_ker_ptr = bli_zdotv_zen_int_avx512;
|
||||
zdotv_ker_ptr = bli_zdotv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
|
||||
case BLIS_ARCH_ZEN3:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN:
|
||||
zdotv_ker_ptr = bli_zdotv_zen_int5;
|
||||
zdotv_ker_ptr = bli_zdotv_zen_int_5;
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -1097,7 +1097,7 @@ scomplex cdotc_blis_impl
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
bli_cdotv_zen_int_5
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
@@ -1220,15 +1220,15 @@ dcomplex zdotc_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// Currently only the AVX512 intrinsic kernel is enabled.
|
||||
zdotv_ker_ptr = bli_zdotv_zen_int_avx512;
|
||||
// zdotv_ker_ptr = bli_zdotv_zen4_asm_avx512;
|
||||
zdotv_ker_ptr = bli_zdotv_zen4_int;
|
||||
// zdotv_ker_ptr = bli_zdotv_zen4_asm;
|
||||
break;
|
||||
#endif
|
||||
|
||||
case BLIS_ARCH_ZEN3:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN:
|
||||
zdotv_ker_ptr = bli_zdotv_zen_int5;
|
||||
zdotv_ker_ptr = bli_zdotv_zen_int_5;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
@@ -689,7 +689,7 @@ void dgemm_blis_impl
|
||||
if ( arch_id == BLIS_ARCH_ZEN || arch_id == BLIS_ARCH_ZEN2 ||
|
||||
arch_id == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
k1_status = bli_dgemm_8x6_avx2_k1_nn
|
||||
k1_status = bli_dgemm_zen_int_8x6_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(double*)alpha,
|
||||
@@ -702,7 +702,7 @@ void dgemm_blis_impl
|
||||
#if defined(BLIS_FAMILY_ZEN5) || defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
|
||||
else if ( arch_id == BLIS_ARCH_ZEN5 || arch_id == BLIS_ARCH_ZEN4 )
|
||||
{
|
||||
k1_status = bli_dgemm_24x8_avx512_k1_nn
|
||||
k1_status = bli_dgemm_zen4_int_24x8_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(double*)alpha,
|
||||
@@ -1179,7 +1179,7 @@ void zgemm_blis_impl
|
||||
if ( arch_id == BLIS_ARCH_ZEN || arch_id == BLIS_ARCH_ZEN2 ||
|
||||
arch_id == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
k1_status = bli_zgemm_4x4_avx2_k1_nn
|
||||
k1_status = bli_zgemm_zen_int_4x4_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(dcomplex*)alpha,
|
||||
@@ -1197,7 +1197,7 @@ void zgemm_blis_impl
|
||||
// This holds true irrespective of the broadcast direction( n0 )
|
||||
if( m0 < 30 )
|
||||
{
|
||||
k1_status = bli_zgemm_4x4_avx2_k1_nn
|
||||
k1_status = bli_zgemm_zen_int_4x4_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(dcomplex*)alpha,
|
||||
@@ -1209,7 +1209,7 @@ void zgemm_blis_impl
|
||||
}
|
||||
else
|
||||
{
|
||||
k1_status = bli_zgemm_16x4_avx512_k1_nn
|
||||
k1_status = bli_zgemm_zen4_int_16x4_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(dcomplex*)alpha,
|
||||
@@ -1226,7 +1226,7 @@ void zgemm_blis_impl
|
||||
// ( i.e, small or tiny sizes ), or if the load directon( m0 ) < 10
|
||||
if( ( m0 < 30 && n0 < 30 ) || m0 < 10 )
|
||||
{
|
||||
k1_status = bli_zgemm_4x4_avx2_k1_nn
|
||||
k1_status = bli_zgemm_zen_int_4x4_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(dcomplex*)alpha,
|
||||
@@ -1238,7 +1238,7 @@ void zgemm_blis_impl
|
||||
}
|
||||
else
|
||||
{
|
||||
k1_status = bli_zgemm_16x4_avx512_k1_nn
|
||||
k1_status = bli_zgemm_zen4_int_16x4_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(dcomplex*)alpha,
|
||||
@@ -1740,7 +1740,7 @@ void cgemm_blis_impl
|
||||
|
||||
if ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) )
|
||||
{
|
||||
bli_cgemm_32x4_avx512_k1_nn
|
||||
bli_cgemm_zen4_int_32x4_k1_nn
|
||||
(
|
||||
m0, n0, k0,
|
||||
(scomplex*)alpha,
|
||||
|
||||
@@ -739,7 +739,7 @@ void cgemv_blis_impl
|
||||
scomplex rho;
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
bli_cdotv_zen_int5
|
||||
bli_cdotv_zen_int_5
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
@@ -987,7 +987,7 @@ void zgemv_blis_impl
|
||||
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
bli_zdotv_zen_int5
|
||||
bli_zdotv_zen_int_5
|
||||
(
|
||||
conja,
|
||||
BLIS_NO_CONJUGATE,
|
||||
|
||||
@@ -165,13 +165,13 @@ void sscal_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
scalv_ker_ptr = bli_sscalv_zen_int_avx512;
|
||||
scalv_ker_ptr = bli_sscalv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
scalv_ker_ptr = bli_sscalv_zen_int10;
|
||||
scalv_ker_ptr = bli_sscalv_zen_int_10;
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -257,7 +257,7 @@ void dscal_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
#if defined(BLIS_KERNELS_ZEN5)
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
|
||||
scalv_ker_ptr = bli_dscalv_zen4_int;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
ST_THRESH = 63894;
|
||||
#endif
|
||||
@@ -266,7 +266,7 @@ void dscal_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
|
||||
scalv_ker_ptr = bli_dscalv_zen4_int;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
ST_THRESH = 27500;
|
||||
#endif
|
||||
@@ -277,7 +277,7 @@ void dscal_blis_impl
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
scalv_ker_ptr = bli_dscalv_zen_int10;
|
||||
scalv_ker_ptr = bli_dscalv_zen_int_10;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
ST_THRESH = 30000;
|
||||
#endif
|
||||
@@ -459,7 +459,7 @@ void zdscal_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_zdscalv_zen_int_avx512;
|
||||
scalv_ker_ptr = bli_zdscalv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
@@ -467,7 +467,7 @@ void zdscal_blis_impl
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
scalv_ker_ptr = bli_zdscalv_zen_int10;
|
||||
scalv_ker_ptr = bli_zdscalv_zen_int_10;
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -627,7 +627,7 @@ void cscal_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_cscalv_zen_int_avx512;
|
||||
scalv_ker_ptr = bli_cscalv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
@@ -719,7 +719,7 @@ void zscal_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_zscalv_zen_int_avx512;
|
||||
scalv_ker_ptr = bli_zscalv_zen4_int;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
|
||||
@@ -163,7 +163,7 @@ void sswap_blis_impl
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
/* Call BLIS kernel */
|
||||
bli_sswapv_zen_int8
|
||||
bli_sswapv_zen_int_8
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
@@ -264,7 +264,7 @@ void dswap_blis_impl
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
bli_dswapv_zen_int8
|
||||
bli_dswapv_zen_int_8
|
||||
(
|
||||
n0,
|
||||
x0, incx0,
|
||||
|
||||
@@ -776,7 +776,7 @@ void strsm_blis_impl
|
||||
(is_parallel && (m0+n0)<320))
|
||||
{
|
||||
err_t small_status;
|
||||
small_status = bli_trsm_small
|
||||
small_status = bli_trsm_small_zen
|
||||
(
|
||||
blis_side,
|
||||
&alphao,
|
||||
@@ -1181,22 +1181,22 @@ void dtrsm_blis_impl
|
||||
{
|
||||
if ( m0 <= 120 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_AVX512;
|
||||
ker_ft = bli_trsm_small_zen4;
|
||||
}
|
||||
else if ( (log10(n0) + (0.65*log10(m0)) ) < 4.4 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_ZEN5;
|
||||
ker_ft = bli_trsm_small_zen5;
|
||||
}
|
||||
}
|
||||
else //if ( blis_side == BLIS_RIGHT )
|
||||
{
|
||||
if ( (log10(m0) + (3.2*log10(n0)) ) < 7 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_AVX512;
|
||||
ker_ft = bli_trsm_small_zen4;
|
||||
}
|
||||
else if ( (log10(m0) + (0.85*log10(n0)) ) < 5 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_ZEN5;
|
||||
ker_ft = bli_trsm_small_zen5;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -1210,11 +1210,11 @@ void dtrsm_blis_impl
|
||||
except for sizes where n is multiple of 8.*/
|
||||
if (((n0 % 8 == 0) && (n0 < 50)) || ((m0 > 50) && (n0 > 50)))
|
||||
{
|
||||
ker_ft = bli_trsm_small_AVX512;
|
||||
ker_ft = bli_trsm_small_zen4;
|
||||
}
|
||||
else
|
||||
{
|
||||
ker_ft = bli_trsm_small;
|
||||
ker_ft = bli_trsm_small_zen;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -1226,7 +1226,7 @@ void dtrsm_blis_impl
|
||||
if ((!is_parallel && ((dim_a < 1500) && (size_b < 5e6)) ) ||
|
||||
(is_parallel && (m0+n0)<200))
|
||||
{
|
||||
ker_ft = bli_trsm_small;
|
||||
ker_ft = bli_trsm_small_zen;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1242,7 +1242,7 @@ void dtrsm_blis_impl
|
||||
{
|
||||
if ( n0 < 4300 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt_ZEN5;
|
||||
ker_ft = bli_trsm_small_zen5_mt;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1253,7 +1253,7 @@ void dtrsm_blis_impl
|
||||
{
|
||||
if ( (n0 < 1812 || m0 < 3220) && (m0 < 14000) )
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt_ZEN5;
|
||||
ker_ft = bli_trsm_small_zen5_mt;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1268,7 +1268,7 @@ void dtrsm_blis_impl
|
||||
if( (ker_ft == NULL) && (is_parallel) &&
|
||||
((dim_a < 2500) && (size_b < 5e6)) )
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt_AVX512;
|
||||
ker_ft = bli_trsm_small_zen4_mt;
|
||||
}
|
||||
break;
|
||||
#endif// BLIS_KERNELS_ZEN4
|
||||
@@ -1279,7 +1279,7 @@ void dtrsm_blis_impl
|
||||
if( (ker_ft == NULL) && (is_parallel) &&
|
||||
((dim_a < 2500) && (size_b < 5e6)) )
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt;
|
||||
ker_ft = bli_trsm_small_zen_mt;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1723,7 +1723,7 @@ void ztrsm_blis_impl
|
||||
{
|
||||
if (!bli_obj_has_conj(&ao)) // if transa == 'C', go to native code path
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt_ZEN5; // 12x4 non fused kernel for ZEN5
|
||||
ker_ft = bli_trsm_small_zen5_mt; // 12x4 non fused kernel for ZEN5
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -1735,7 +1735,7 @@ void ztrsm_blis_impl
|
||||
{
|
||||
if (!bli_obj_has_conj(&ao))
|
||||
{
|
||||
ker_ft = bli_trsm_small_mt_AVX512; // 4x4 fused kernel for ZEN4
|
||||
ker_ft = bli_trsm_small_zen4_mt; // 4x4 fused kernel for ZEN4
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1744,7 +1744,7 @@ void ztrsm_blis_impl
|
||||
// better accuracy in large sizes
|
||||
if (dim_a <= 500)
|
||||
#endif
|
||||
ker_ft = bli_trsm_small_mt;
|
||||
ker_ft = bli_trsm_small_zen_mt;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -1772,22 +1772,22 @@ void ztrsm_blis_impl
|
||||
{
|
||||
if ( m0 <= 88 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_AVX512;
|
||||
ker_ft = bli_trsm_small_zen4;
|
||||
}
|
||||
else if ( (log10(n0) + (0.15*log10(m0)) ) < 2.924 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_ZEN5;
|
||||
ker_ft = bli_trsm_small_zen5;
|
||||
}
|
||||
}
|
||||
else //if ( blis_side == BLIS_RIGHT )
|
||||
{
|
||||
if ( (log10(m0) + (2.8*log10(n0)) ) < 6 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_AVX512;
|
||||
ker_ft = bli_trsm_small_zen4;
|
||||
}
|
||||
else if ( (log10(m0) + (1.058*log10(n0)) ) < 5.373 )
|
||||
{
|
||||
ker_ft = bli_trsm_small_ZEN5;
|
||||
ker_ft = bli_trsm_small_zen5;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -1800,7 +1800,7 @@ void ztrsm_blis_impl
|
||||
// conjugate
|
||||
if (!bli_obj_has_conj(&ao))
|
||||
{
|
||||
ker_ft = bli_trsm_small_AVX512;
|
||||
ker_ft = bli_trsm_small_zen4;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1809,7 +1809,7 @@ void ztrsm_blis_impl
|
||||
// better accuracy in large sizes
|
||||
if (dim_a <= 500)
|
||||
#endif
|
||||
ker_ft = bli_trsm_small;
|
||||
ker_ft = bli_trsm_small_zen;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -1823,7 +1823,7 @@ void ztrsm_blis_impl
|
||||
// better accuracy in large sizes
|
||||
if (dim_a <= 500)
|
||||
#endif
|
||||
ker_ft = bli_trsm_small;
|
||||
ker_ft = bli_trsm_small_zen;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -2229,7 +2229,7 @@ void ctrsm_blis_impl
|
||||
(is_parallel && (m0+n0)<320))
|
||||
{
|
||||
err_t small_status;
|
||||
small_status = bli_trsm_small
|
||||
small_status = bli_trsm_small_zen
|
||||
(
|
||||
blis_side,
|
||||
&alphao,
|
||||
|
||||
@@ -192,7 +192,7 @@ void saxpby_blis_impl
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
axpbyv_ker_ptr = bli_saxpbyv_zen_int10;
|
||||
axpbyv_ker_ptr = bli_saxpbyv_zen_int_10;
|
||||
|
||||
break;
|
||||
default:
|
||||
@@ -324,14 +324,14 @@ void daxpby_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
axpbyv_ker_ptr = bli_daxpbyv_zen_int_avx512;
|
||||
axpbyv_ker_ptr = bli_daxpbyv_zen4_int;
|
||||
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
axpbyv_ker_ptr = bli_daxpbyv_zen_int10;
|
||||
axpbyv_ker_ptr = bli_daxpbyv_zen_int_10;
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -353,7 +353,7 @@ void bli_cnormfv_unb_var1
|
||||
size_t buffer_size = n * sizeof( scomplex );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" );
|
||||
printf( "bli_scnorm2fv_zen_int_unb_var1(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
// Acquire a Buffer(n*size(scomplex)) from the memory broker
|
||||
@@ -378,12 +378,12 @@ void bli_cnormfv_unb_var1
|
||||
incx_buf = 1;
|
||||
}
|
||||
|
||||
bli_scnorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
|
||||
bli_scnorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
|
||||
|
||||
if ( bli_mem_is_alloc( &mem_buf_X ) )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
|
||||
printf( "bli_scnorm2fv_zen_int_unb_var1(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool.
|
||||
bli_pba_release( &rntm_l , &mem_buf_X );
|
||||
@@ -392,7 +392,7 @@ void bli_cnormfv_unb_var1
|
||||
else
|
||||
{
|
||||
// Call the kernel with the unit-strided vector x
|
||||
bli_scnorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
|
||||
bli_scnorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -470,8 +470,8 @@ void bli_znormfv_unb_var1
|
||||
case BLIS_ARCH_ZEN:
|
||||
#ifdef BLIS_KERNELS_ZEN
|
||||
|
||||
norm_fp = bli_dznorm2fv_unb_var1_avx2;
|
||||
reduce_fp = bli_dnorm2fv_unb_var1_avx2;
|
||||
norm_fp = bli_dznorm2fv_zen_int_unb_var1;
|
||||
reduce_fp = bli_dnorm2fv_zen_int_unb_var1;
|
||||
fast_path_thresh = 2000;
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
@@ -947,7 +947,7 @@ void bli_snormfv_unb_var1
|
||||
size_t buffer_size = n * sizeof( float );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" );
|
||||
printf( "bli_snorm2fv_zen_int_unb_var1(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
// Acquire a Buffer(n*size(float)) from the memory broker
|
||||
@@ -972,12 +972,12 @@ void bli_snormfv_unb_var1
|
||||
incx_buf = 1;
|
||||
}
|
||||
|
||||
bli_snorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
|
||||
bli_snorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
|
||||
|
||||
if ( bli_mem_is_alloc( &mem_buf_X ) )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
|
||||
printf( "bli_snorm2fv_zen_int_unb_var1(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool.
|
||||
bli_pba_release( &rntm_l , &mem_buf_X );
|
||||
@@ -986,7 +986,7 @@ void bli_snormfv_unb_var1
|
||||
else
|
||||
{
|
||||
// Call the kernel with the unit-strided vector x
|
||||
bli_snorm2fv_unb_var1_avx2( n, x_buf, incx_buf, norm, cntx );
|
||||
bli_snorm2fv_zen_int_unb_var1( n, x_buf, incx_buf, norm, cntx );
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -1065,9 +1065,9 @@ void bli_dnormfv_unb_var1
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
|
||||
if( n <= 30 )
|
||||
norm_fp = bli_dnorm2fv_unb_var1_avx2;
|
||||
norm_fp = bli_dnorm2fv_zen_int_unb_var1;
|
||||
else
|
||||
norm_fp = bli_dnorm2fv_unb_var1_avx512;
|
||||
norm_fp = bli_dnorm2fv_zen4_int_unb_var1;
|
||||
|
||||
#ifdef __clang__
|
||||
fast_path_thresh = 6000;
|
||||
@@ -1085,9 +1085,9 @@ void bli_dnormfv_unb_var1
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
|
||||
if( n <= 250 )
|
||||
norm_fp = bli_dnorm2fv_unb_var1_avx2;
|
||||
norm_fp = bli_dnorm2fv_zen_int_unb_var1;
|
||||
else
|
||||
norm_fp = bli_dnorm2fv_unb_var1_avx512;
|
||||
norm_fp = bli_dnorm2fv_zen4_int_unb_var1;
|
||||
|
||||
fast_path_thresh = 4000;
|
||||
|
||||
@@ -1102,7 +1102,7 @@ void bli_dnormfv_unb_var1
|
||||
case BLIS_ARCH_ZEN:
|
||||
#ifdef BLIS_KERNELS_ZEN
|
||||
|
||||
norm_fp = bli_dnorm2fv_unb_var1_avx2;
|
||||
norm_fp = bli_dnorm2fv_zen_int_unb_var1;
|
||||
fast_path_thresh = 4000;
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
@@ -328,14 +328,14 @@ set(CAN_TEST_INFO_VALUE ON)
|
||||
# The following part will be used to set up macros that relate to the version
|
||||
# of BLIS library being tested.
|
||||
if(ENABLE_THREADING STREQUAL "openmp")
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/get_version.cpp
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/get_version.cpp
|
||||
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
|
||||
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} OpenMP::OpenMP_CXX ${ASAN_FLAGS} ${COVERAGE_FLAGS}
|
||||
RUN_OUTPUT_VARIABLE TEST_BLIS_VERSION
|
||||
COMPILE_OUTPUT_VARIABLE COMP_VAR_VERSION
|
||||
)
|
||||
else()
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/get_version.cpp
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/get_version.cpp
|
||||
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
|
||||
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} ${ASAN_FLAGS} ${COVERAGE_FLAGS}
|
||||
RUN_OUTPUT_VARIABLE TEST_BLIS_VERSION
|
||||
@@ -387,14 +387,14 @@ endif()
|
||||
# This way, kernel tests won't be compiled/run for shared versions of BLIS.
|
||||
if(BLIS_LINKING_TYPE STREQUAL "static")
|
||||
if(ENABLE_THREADING STREQUAL "openmp")
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/config_ukr_tests.cpp
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/config_ukr_tests.cpp
|
||||
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
|
||||
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} OpenMP::OpenMP_CXX ${ASAN_FLAGS} ${COVERAGE_FLAGS}
|
||||
RUN_OUTPUT_VARIABLE UKR_CONFIG
|
||||
COMPILE_OUTPUT_VARIABLE COMP_VAR
|
||||
)
|
||||
else()
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${PROJECT_SOURCE_DIR}/cmake/config_ukr_tests.cpp
|
||||
try_run(RUNRESULT COMPILERESULT "${CMAKE_BINARY_DIR}/temp" SOURCES ${CMAKE_SOURCE_DIR}/cmake/config_ukr_tests.cpp
|
||||
COMPILE_DEFINITIONS -I${BLIS_PATH}/include/ -I${BLIS_PATH}/include/blis
|
||||
LINK_LIBRARIES ${BLIS_LIBRARY} ${COMMON_LIBS} ${ASAN_FLAGS} ${COVERAGE_FLAGS}
|
||||
RUN_OUTPUT_VARIABLE UKR_CONFIG
|
||||
|
||||
@@ -39,41 +39,150 @@
|
||||
*/
|
||||
|
||||
#ifdef AOCL_DEV
|
||||
#define K_zen4_int_40x2_mt zen4_int_40x2_mt
|
||||
#define K_zen4_int_40x8_mt zen4_int_40x8_mt
|
||||
#define K_zen4_int_40x2_st zen4_int_40x2_st
|
||||
#define K_zen4_int_40x8_st zen4_int_40x8_st
|
||||
#define K_bli_zdotv_zen_int_5 bli_zdotv_zen_int_5
|
||||
#define K_bli_cdotv_zen_int_5 bli_cdotv_zen_int_5
|
||||
#define K_bli_zaxpyv_zen_int_5 bli_zaxpyv_zen_int_5
|
||||
#define K_bli_caxpyv_zen_int_5 bli_caxpyv_zen_int_5
|
||||
#define K_bli_sswapv_zen_int_8 bli_sswapv_zen_int_8
|
||||
#define K_bli_dswapv_zen_int_8 bli_dswapv_zen_int_8
|
||||
#define K_bli_zdscalv_zen_int_10 bli_zdscalv_zen_int_10
|
||||
#define K_bli_sscalv_zen_int_10 bli_sscalv_zen_int_10
|
||||
#define K_bli_dscalv_zen_int_10 bli_dscalv_zen_int_10
|
||||
#define K_bli_sdotv_zen_int_10 bli_sdotv_zen_int_10
|
||||
#define K_bli_ddotv_zen_int_10 bli_ddotv_zen_int_10
|
||||
#define K_bli_saxpyv_zen_int_10 bli_saxpyv_zen_int_10
|
||||
#define K_bli_daxpyv_zen_int_10 bli_daxpyv_zen_int_10
|
||||
#define K_bli_saxpbyv_zen_int_10 bli_saxpbyv_zen_int_10
|
||||
#define K_bli_daxpbyv_zen_int_10 bli_daxpbyv_zen_int_10
|
||||
#define K_bli_dgemmsup_cv_zen5_asm_24x8m bli_dgemmsup_cv_zen5_asm_24x8m
|
||||
#define K_bli_dgemmsup_cv_zen4_asm_24x8m bli_dgemmsup_cv_zen4_asm_24x8m
|
||||
#define K_bli_dgemmsup_cv_zen4_asm_24x8m_new bli_dgemmsup_cv_zen4_asm_24x8m_new
|
||||
#define K_bli_dgemm_tiny_zen4_24x8 bli_dgemm_tiny_zen4_24x8
|
||||
#define K_bli_dgemm_tiny_zen_6x8 bli_dgemm_tiny_zen_6x8
|
||||
#define K_bli_zaxpyf_zen4_int_8 bli_zaxpyf_zen4_int_8
|
||||
#define K_bli_daxpyf_zen4_int bli_daxpyf_zen4_int
|
||||
#define K_bli_ddotxf_zen4_int bli_ddotxf_zen4_int
|
||||
#define K_bli_dgemm_zen4_asm_8x24 bli_dgemm_zen4_asm_8x24
|
||||
#define K_bli_ztrsm_small_zen_int_pack bli_ztrsm_small_zen_int_pack
|
||||
#define K_bli_ctrsm_small_zen_int_pack bli_ctrsm_small_zen_int_pack
|
||||
#define K_bli_strsm_small_zen_int_pack bli_strsm_small_zen_int_pack
|
||||
#define K_bli_dtrsm_small_zen_int_pack bli_dtrsm_small_zen_int_pack
|
||||
#define K_bli_ztrsm_small_zen5 bli_ztrsm_small_zen5
|
||||
#define K_bli_dtrsm_small_zen4_int_pack bli_dtrsm_small_zen4_int_pack
|
||||
#define K_bli_trsm_small_ref bli_trsm_small_ref
|
||||
#define K_bli_trsm_small_zen bli_trsm_small_zen
|
||||
#define K_bli_trsm_small_zen bli_trsm_small_zen
|
||||
#define K_bli_trsm_small_zen5_mt bli_trsm_small_zen5_mt
|
||||
#define K_bli_trsm_small_zen5 bli_trsm_small_zen5
|
||||
#define K_bli_trsm_small_zen4_mt bli_trsm_small_zen4_mt
|
||||
#define K_bli_trsm_small_zen4 bli_trsm_small_zen4
|
||||
#define K_bli_zsetv_zen4_int bli_zsetv_zen4_int
|
||||
#define K_bli_dsetv_zen4_int bli_dsetv_zen4_int
|
||||
#define K_bli_ssetv_zen4_int bli_ssetv_zen4_int
|
||||
#define K_bli_dgemv_n_zen4_int_32x8_st bli_dgemv_n_zen4_int_32x8_st
|
||||
#define K_scalv_zen4_int scalv_zen4_int
|
||||
#define K_scalv_zen4_int scalv_zen4_int
|
||||
#define K_bli_zscalv_zen4_int bli_zscalv_zen4_int
|
||||
#define K_bli_cscalv_zen4_int bli_cscalv_zen4_int
|
||||
#define K_bli_zdscalv_zen4_int bli_zdscalv_zen4_int
|
||||
#define K_bli_dscalv_zen4_int bli_dscalv_zen4_int
|
||||
#define K_bli_sscalv_zen4_int bli_sscalv_zen4_int
|
||||
#define K_bli_dscal2v_zen4_int bli_dscal2v_zen4_int
|
||||
#define K_bli_zdotxv_zen4_int bli_zdotxv_zen4_int
|
||||
#define K_bli_zdotv_zen4_asm bli_zdotv_zen4_asm
|
||||
#define K_bli_zdotv_zen4_int bli_zdotv_zen4_int
|
||||
#define K_bli_ddotv_zen4_int bli_ddotv_zen4_int
|
||||
#define K_bli_sdotv_zen4_int bli_sdotv_zen4_int
|
||||
#define K_bli_dcopyv_zen5_asm bli_dcopyv_zen5_asm
|
||||
#define K_bli_zcopyv_zen4_int bli_zcopyv_zen4_int
|
||||
#define K_bli_dcopyv_zen4_int bli_dcopyv_zen4_int
|
||||
#define K_bli_scopyv_zen4_int bli_scopyv_zen4_int
|
||||
#define K_bli_zcopyv_zen4_asm bli_zcopyv_zen4_asm
|
||||
#define K_bli_dcopyv_zen4_asm bli_dcopyv_zen4_asm
|
||||
#define K_bli_scopyv_zen4_asm bli_scopyv_zen4_asm
|
||||
#define K_bli_zaxpyv_zen4_int bli_zaxpyv_zen4_int
|
||||
#define K_bli_daxpyv_zen4_int bli_daxpyv_zen4_int
|
||||
#define K_bli_saxpyv_zen4_int bli_saxpyv_zen4_int
|
||||
#define K_bli_daxpbyv_zen4_int bli_daxpbyv_zen4_int
|
||||
#define K_bli_damaxv_zen4_int bli_damaxv_zen4_int
|
||||
#define K_bli_samaxv_zen4_int bli_samaxv_zen4_int
|
||||
#define K_bli_daddv_zen4_int bli_daddv_zen4_int
|
||||
#define K_bli_dnorm2fv_zen4_int_unb_var1 bli_dnorm2fv_zen4_int_unb_var1
|
||||
#define K_bli_snorm2fv_zen_int_unb_var1 bli_snorm2fv_zen_int_unb_var1
|
||||
#define K_bli_scnorm2fv_zen_int_unb_var1 bli_scnorm2fv_zen_int_unb_var1
|
||||
#define K_bli_dznorm2fv_zen_int_unb_var1 bli_dznorm2fv_zen_int_unb_var1
|
||||
#define K_bli_dnorm2fv_zen_int_unb_var1 bli_dnorm2fv_zen_int_unb_var1
|
||||
#define K_bli_sgemmsup_rd_zen4_asm_6x64n bli_sgemmsup_rd_zen4_asm_6x64n
|
||||
#define K_bli_sgemmsup_rd_zen4_asm_6x64m bli_sgemmsup_rd_zen4_asm_6x64m
|
||||
#define K_bli_sgemmsup_rv_zen4_asm_6x64n bli_sgemmsup_rv_zen4_asm_6x64n
|
||||
#define K_bli_sgemmsup_rv_zen4_asm_6x64m bli_sgemmsup_rv_zen4_asm_6x64m
|
||||
#define K_bli_sgemmsup_rv_zen4_asm_6x64n bli_sgemmsup_rv_zen4_asm_6x64n
|
||||
#define K_bli_sgemmsup_rv_zen4_asm_6x64m bli_sgemmsup_rv_zen4_asm_6x64m
|
||||
#define K_bli_dgemmtrsm_u_zen4_asm_16x14 bli_dgemmtrsm_u_zen4_asm_16x14
|
||||
#define K_bli_dgemmtrsm_l_zen4_asm_16x14 bli_dgemmtrsm_l_zen4_asm_16x14
|
||||
#define K_bli_dgemv_n_zen bli_dgemv_n_zen
|
||||
#define K_bli_dgemv_t_zen_int_16x1m bli_dgemv_t_zen_int_16x1m
|
||||
#define K_bli_dgemv_t_zen_int_16x2m bli_dgemv_t_zen_int_16x2m
|
||||
#define K_bli_dgemv_t_zen_int_16x3m bli_dgemv_t_zen_int_16x3m
|
||||
#define K_bli_dgemv_t_zen_int_16x4m bli_dgemv_t_zen_int_16x4m
|
||||
#define K_bli_dgemv_t_zen_int_16x5m bli_dgemv_t_zen_int_16x5m
|
||||
#define K_bli_dgemv_t_zen_int_16x6m bli_dgemv_t_zen_int_16x6m
|
||||
#define K_bli_dgemv_t_zen_int_16x7m bli_dgemv_t_zen_int_16x7m
|
||||
#define K_bli_dgemv_t_zen_int bli_dgemv_t_zen_int
|
||||
#define K_bli_dgemv_t_zen4_int_32x1m bli_dgemv_t_zen4_int_32x1m
|
||||
#define K_bli_dgemv_t_zen4_int_32x2m bli_dgemv_t_zen4_int_32x2m
|
||||
#define K_bli_dgemv_t_zen4_int_32x3m bli_dgemv_t_zen4_int_32x3m
|
||||
#define K_bli_dgemv_t_zen4_int_32x4m bli_dgemv_t_zen4_int_32x4m
|
||||
#define K_bli_dgemv_t_zen4_int_32x5m bli_dgemv_t_zen4_int_32x5m
|
||||
#define K_bli_dgemv_t_zen4_int_32x6m bli_dgemv_t_zen4_int_32x6m
|
||||
#define K_bli_dgemv_t_zen4_int_32x7m bli_dgemv_t_zen4_int_32x7m
|
||||
#define K_bli_dgemv_t_zen4_int bli_dgemv_t_zen4_int
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx1n bli_dgemv_n_zen4_int_m_leftx1n
|
||||
#define K_bli_dgemv_n_zen4_int_8x1n bli_dgemv_n_zen4_int_8x1n
|
||||
#define K_bli_dgemv_n_zen4_int_16x1n bli_dgemv_n_zen4_int_16x1n
|
||||
#define K_bli_dgemv_n_zen4_int_32x1n bli_dgemv_n_zen4_int_32x1n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx2n bli_dgemv_n_zen4_int_m_leftx2n
|
||||
#define K_bli_dgemv_n_zen4_int_8x2n bli_dgemv_n_zen4_int_8x2n
|
||||
#define K_bli_dgemv_n_zen4_int_16x2n bli_dgemv_n_zen4_int_16x2n
|
||||
#define K_bli_dgemv_n_zen4_int_32x2n bli_dgemv_n_zen4_int_32x2n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx3n bli_dgemv_n_zen4_int_m_leftx3n
|
||||
#define K_bli_dgemv_n_zen4_int_8x3n bli_dgemv_n_zen4_int_8x3n
|
||||
#define K_bli_dgemv_n_zen4_int_16x3n bli_dgemv_n_zen4_int_16x3n
|
||||
#define K_bli_dgemv_n_zen4_int_32x3n bli_dgemv_n_zen4_int_32x3n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx4n bli_dgemv_n_zen4_int_m_leftx4n
|
||||
#define K_bli_dgemv_n_zen4_int_8x4n bli_dgemv_n_zen4_int_8x4n
|
||||
#define K_bli_dgemv_n_zen4_int_16x4n bli_dgemv_n_zen4_int_16x4n
|
||||
#define K_bli_dgemv_n_zen4_int_32x4n bli_dgemv_n_zen4_int_32x4n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx8n bli_dgemv_n_zen4_int_m_leftx8n
|
||||
#define K_bli_dgemv_n_zen4_int_8x8n bli_dgemv_n_zen4_int_8x8n
|
||||
#define K_bli_dgemv_n_zen4_int_16x8n bli_dgemv_n_zen4_int_16x8n
|
||||
#define K_bli_dgemv_n_zen4_int_32x8n bli_dgemv_n_zen4_int_32x8n
|
||||
#define K_bli_dgemv_n_zen4_int_16mx1 bli_dgemv_n_zen4_int_16mx1
|
||||
#define K_bli_dgemv_n_zen4_int_16mx2 bli_dgemv_n_zen4_int_16mx2
|
||||
#define K_bli_dgemv_n_zen4_int_16mx3 bli_dgemv_n_zen4_int_16mx3
|
||||
#define K_bli_dgemv_n_zen4_int_16mx4 bli_dgemv_n_zen4_int_16mx4
|
||||
#define K_bli_dgemv_n_zen4_int_16mx5 bli_dgemv_n_zen4_int_16mx5
|
||||
#define K_bli_dgemv_n_zen4_int_16mx6 bli_dgemv_n_zen4_int_16mx6
|
||||
#define K_bli_dgemv_n_zen4_int_16mx7 bli_dgemv_n_zen4_int_16mx7
|
||||
#define K_bli_dgemv_n_zen4_int_16mx8 bli_dgemv_n_zen4_int_16mx8
|
||||
#define K_bli_dgemv_n_zen4_int bli_dgemv_n_zen4_int
|
||||
#define K_bli_cgemm_zen4_int_32x4_k1_nn bli_cgemm_zen4_int_32x4_k1_nn
|
||||
#define K_bli_zgemm_zen4_int_16x4_k1_nn bli_zgemm_zen4_int_16x4_k1_nn
|
||||
#define K_bli_dgemm_zen4_int_24x8_k1_nn bli_dgemm_zen4_int_24x8_k1_nn
|
||||
#define K_bli_zgemm_zen_int_4x4_k1_nn bli_zgemm_zen_int_4x4_k1_nn
|
||||
#define K_bli_dgemm_zen_int_8x6_k1_nn bli_dgemm_zen_int_8x6_k1_nn
|
||||
|
||||
#define AOCL_51
|
||||
#define K_bli_dgemv_n_zen4_int 1
|
||||
#define K_bli_dgemv_n_zen4_40x2_int_st 1
|
||||
#define K_bli_dgemv_n_zen4_40x2_int_mt 1
|
||||
#define K_bli_dgemv_m_zen4_40x8_int_st 1
|
||||
#define K_bli_dgemv_m_zen4_40x8_int_mt_Ndiv 1
|
||||
#define K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv 1
|
||||
#define K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv 1
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef AOCL_51
|
||||
#define K_bli_zgemmsup_cv_zen4_asm_fx1 1
|
||||
#define K_bli_zgemmsup_cv_zen4_asm_fx2 1
|
||||
#define K_bli_zgemmsup_cv_zen4_asm_fx3 1
|
||||
#define K_bli_zgemmsup_cv_zen4_asm_fx4 1
|
||||
#define K_bli_cgemm_32x4_avx512_k1_nn 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_24x4m 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_24x3m 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_24x2m 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_24x1m 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_16x4 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_16x3 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_16x2 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_16x1 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_8x4 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_8x3 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_8x2 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_8x1 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_fx4 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_fx3 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_fx2 1
|
||||
#define K_bli_cgemmsup_cv_zen4_asm_fx1 1
|
||||
#define K_bli_cgemm_zen4_asm_24x4 1
|
||||
#define K_bli_cgemm_zen4_asm_4x24 1
|
||||
|
||||
#define K_bli_zgemmsup_cd_zen4_asm_12x2m 1
|
||||
#define K_bli_zgemmsup_cd_zen4_asm_12x4m 1
|
||||
#define K_bli_zgemmsup_cd_zen4_asm_2x2 1
|
||||
@@ -82,54 +191,117 @@
|
||||
#define K_bli_zgemmsup_cd_zen4_asm_4x4 1
|
||||
#define K_bli_zgemmsup_cd_zen4_asm_8x2 1
|
||||
#define K_bli_zgemmsup_cd_zen4_asm_8x4 1
|
||||
#define K_bli_dgemmsup_rv_zen4_asm_24x8m_new 1
|
||||
#define K_bli_dgemv_t_zen_int 1
|
||||
#define K_bli_dgemv_t_zen_int_16x7m 1
|
||||
#define K_bli_dgemv_t_zen_int_16x6m 1
|
||||
#define K_bli_dgemv_t_zen_int_16x5m 1
|
||||
#define K_bli_dgemv_t_zen_int_16x4m 1
|
||||
#define K_bli_dgemv_t_zen_int_16x3m 1
|
||||
#define K_bli_dgemv_t_zen_int_16x2m 1
|
||||
#define K_bli_dgemv_t_zen_int_16x1m 1
|
||||
#define K_bli_dgemv_t_zen4_int 1
|
||||
#define K_bli_dgemv_t_zen4_int_32x7m 1
|
||||
#define K_bli_dgemv_t_zen4_int_32x6m 1
|
||||
#define K_bli_dgemv_t_zen4_int_32x5m 1
|
||||
#define K_bli_dgemv_t_zen4_int_32x4m 1
|
||||
#define K_bli_dgemv_t_zen4_int_32x3m 1
|
||||
#define K_bli_dgemv_t_zen4_int_32x2m 1
|
||||
#define K_bli_dgemv_t_zen4_int_32x1m 1
|
||||
#define K_bli_ztrsm_small_ZEN5 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx8_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx7_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx6_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx5_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx4_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx3_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx2_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16mx1_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_32x8n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16x8n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_8x8n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_m_leftx8n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_32x4n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16x4n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_8x4n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_m_leftx4n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_32x3n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16x3n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_8x3n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_m_leftx3n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_32x2n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16x2n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_8x2n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_m_leftx2n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_32x1n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_16x1n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_8x1n_avx512 1
|
||||
#define K_bli_dgemv_n_zen_int_m_leftx1n_avx512 1
|
||||
#define K_bli_dcopyv_zen4_asm_avx512_biway 1
|
||||
#define K_bli_dcopyv_zen5_asm_avx512 1
|
||||
#ifndef K_bli_dgemmsup_cv_zen4_asm_24x8m_new
|
||||
#define K_bli_dgemmsup_cv_zen4_asm_24x8m_new bli_dgemmsup_rv_zen4_asm_24x8m_new
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_t_zen_int
|
||||
#define K_bli_dgemv_t_zen_int bli_dgemv_t_zen_int_avx2
|
||||
#endif
|
||||
#define K_bli_dgemv_t_zen_int_mx7_avx2 1
|
||||
#define K_bli_dgemv_t_zen_int_mx6_avx2 1
|
||||
#define K_bli_dgemv_t_zen_int_mx5_avx2 1
|
||||
#define K_bli_dgemv_t_zen_int_mx4_avx2 1
|
||||
#define K_bli_dgemv_t_zen_int_mx3_avx2 1
|
||||
#define K_bli_dgemv_t_zen_int_mx2_avx2 1
|
||||
#define K_bli_dgemv_t_zen_int_mx1_avx2 1
|
||||
#ifndef K_bli_dgemv_t_zen4_int
|
||||
#define K_bli_dgemv_t_zen4_int bli_dgemv_t_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_dgemv_t_zen_int_mx7_avx512 1
|
||||
#define K_bli_dgemv_t_zen_int_mx6_avx512 1
|
||||
#define K_bli_dgemv_t_zen_int_mx5_avx512 1
|
||||
#define K_bli_dgemv_t_zen_int_mx4_avx512 1
|
||||
#define K_bli_dgemv_t_zen_int_mx3_avx512 1
|
||||
#define K_bli_dgemv_t_zen_int_mx2_avx512 1
|
||||
#define K_bli_dgemv_t_zen_int_mx1_avx512 1
|
||||
#ifndef K_bli_ztrsm_small_zen5
|
||||
#define K_bli_ztrsm_small_zen5 bli_ztrsm_small_ZEN5
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx8
|
||||
#define K_bli_dgemv_n_zen4_int_16mx8 bli_dgemv_n_zen_int_16mx8_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx7
|
||||
#define K_bli_dgemv_n_zen4_int_16mx7 bli_dgemv_n_zen_int_16mx7_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx6
|
||||
#define K_bli_dgemv_n_zen4_int_16mx6 bli_dgemv_n_zen_int_16mx6_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx5
|
||||
#define K_bli_dgemv_n_zen4_int_16mx5 bli_dgemv_n_zen_int_16mx5_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx4
|
||||
#define K_bli_dgemv_n_zen4_int_16mx4 bli_dgemv_n_zen_int_16mx4_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx3
|
||||
#define K_bli_dgemv_n_zen4_int_16mx3 bli_dgemv_n_zen_int_16mx3_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx2
|
||||
#define K_bli_dgemv_n_zen4_int_16mx2 bli_dgemv_n_zen_int_16mx2_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16mx1
|
||||
#define K_bli_dgemv_n_zen4_int_16mx1 bli_dgemv_n_zen_int_16mx1_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_32x8n
|
||||
#define K_bli_dgemv_n_zen4_int_32x8n bli_dgemv_n_zen_int_32x8n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16x8n
|
||||
#define K_bli_dgemv_n_zen4_int_16x8n bli_dgemv_n_zen_int_16x8n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_8x8n
|
||||
#define K_bli_dgemv_n_zen4_int_8x8n bli_dgemv_n_zen_int_8x8n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_m_leftx8n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx8n bli_dgemv_n_zen_int_m_leftx8n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_32x4n
|
||||
#define K_bli_dgemv_n_zen4_int_32x4n bli_dgemv_n_zen_int_32x4n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16x4n
|
||||
#define K_bli_dgemv_n_zen4_int_16x4n bli_dgemv_n_zen_int_16x4n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_8x4n
|
||||
#define K_bli_dgemv_n_zen4_int_8x4n bli_dgemv_n_zen_int_8x4n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_m_leftx4n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx4n bli_dgemv_n_zen_int_m_leftx4n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_32x3n
|
||||
#define K_bli_dgemv_n_zen4_int_32x3n bli_dgemv_n_zen_int_32x3n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16x3n
|
||||
#define K_bli_dgemv_n_zen4_int_16x3n bli_dgemv_n_zen_int_16x3n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_8x3n
|
||||
#define K_bli_dgemv_n_zen4_int_8x3n bli_dgemv_n_zen_int_8x3n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_m_leftx3n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx3n bli_dgemv_n_zen_int_m_leftx3n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_32x2n
|
||||
#define K_bli_dgemv_n_zen4_int_32x2n bli_dgemv_n_zen_int_32x2n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16x2n
|
||||
#define K_bli_dgemv_n_zen4_int_16x2n bli_dgemv_n_zen_int_16x2n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_8x2n
|
||||
#define K_bli_dgemv_n_zen4_int_8x2n bli_dgemv_n_zen_int_8x2n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_m_leftx2n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx2n bli_dgemv_n_zen_int_m_leftx2n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_32x1n
|
||||
#define K_bli_dgemv_n_zen4_int_32x1n bli_dgemv_n_zen_int_32x1n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_16x1n
|
||||
#define K_bli_dgemv_n_zen4_int_16x1n bli_dgemv_n_zen_int_16x1n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_8x1n
|
||||
#define K_bli_dgemv_n_zen4_int_8x1n bli_dgemv_n_zen_int_8x1n_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemv_n_zen4_int_m_leftx1n
|
||||
#define K_bli_dgemv_n_zen4_int_m_leftx1n bli_dgemv_n_zen_int_m_leftx1n_avx512
|
||||
#endif
|
||||
|
||||
#define AOCL_50
|
||||
|
||||
#endif
|
||||
@@ -141,35 +313,75 @@
|
||||
#define K_bli_ccopyv_zen_int 1
|
||||
#define K_bli_cscal2v_zen_int 1
|
||||
#define K_bli_cscalv_zen_int 1
|
||||
#define K_bli_cscalv_zen_int_avx512 1
|
||||
#ifndef K_bli_cscalv_zen4_int
|
||||
#define K_bli_cscalv_zen4_int bli_cscalv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_csetv_zen_int 1
|
||||
#define K_bli_daddv_zen_int 1
|
||||
#define K_bli_daddv_zen_int_avx512 1
|
||||
#define K_bli_daxpbyv_zen_int_avx512 1
|
||||
#define K_bli_daxpyf_zen_int_avx512 1
|
||||
#define K_bli_dcopyv_zen4_asm_avx512 1
|
||||
#define K_bli_dgemm_avx512_asm_8x24 1
|
||||
#define K_bli_dnorm2fv_unb_var1_avx512 1
|
||||
#ifndef K_bli_daddv_zen4_int
|
||||
#define K_bli_daddv_zen4_int bli_daddv_zen_int_avx512
|
||||
#endif
|
||||
#ifndef K_bli_daxpbyv_zen4_int
|
||||
#define K_bli_daxpbyv_zen4_int bli_daxpbyv_zen_int_avx512
|
||||
#endif
|
||||
#ifndef K_bli_daxpyf_zen4_int
|
||||
#define K_bli_daxpyf_zen4_int bli_daxpyf_zen_int_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dcopyv_zen4_asm
|
||||
#define K_bli_dcopyv_zen4_asm bli_dcopyv_zen4_asm_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dgemm_zen4_asm_8x24
|
||||
#define K_bli_dgemm_zen4_asm_8x24 bli_dgemm_avx512_asm_8x24
|
||||
#endif
|
||||
#ifndef K_bli_dnorm2fv_zen4_int_unb_var1
|
||||
#define K_bli_dnorm2fv_zen4_int_unb_var1 bli_dnorm2fv_unb_var1_avx512
|
||||
#endif
|
||||
#define K_bli_dscal2v_zen_int 1
|
||||
#define K_bli_dscal2v_zen_int_avx512 1
|
||||
#define K_bli_dsetv_zen_int_avx512 1
|
||||
#ifndef K_bli_dscal2v_zen4_int
|
||||
#define K_bli_dscal2v_zen4_int bli_dscal2v_zen_int_avx512
|
||||
#endif
|
||||
#ifndef K_bli_dsetv_zen4_int
|
||||
#define K_bli_dsetv_zen4_int bli_dsetv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_saddv_zen_int 1
|
||||
#define K_bli_scopyv_zen4_asm_avx512 1
|
||||
#ifndef K_bli_scopyv_zen4_asm
|
||||
#define K_bli_scopyv_zen4_asm bli_scopyv_zen4_asm_avx512
|
||||
#endif
|
||||
#define K_bli_sscal2v_zen_int 1
|
||||
#define K_bli_ssetv_zen_int_avx512 1
|
||||
#ifndef K_bli_ssetv_zen4_int
|
||||
#define K_bli_ssetv_zen4_int bli_ssetv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_zaddv_zen_int 1
|
||||
#define K_bli_zaxpyf_zen_int_8_avx512 1
|
||||
#define K_bli_zaxpyv_zen_int_avx512 1
|
||||
#define K_bli_zcopyv_zen4_asm_avx512 1
|
||||
#define K_bli_zdotv_zen4_asm_avx512 1
|
||||
#define K_bli_zdotv_zen_int_avx512 1
|
||||
#define K_bli_zgemm_16x4_avx512_k1_nn 1
|
||||
#define K_bli_zscalv_zen_int_avx512 1
|
||||
#ifndef K_bli_zaxpyf_zen4_int_8
|
||||
#define K_bli_zaxpyf_zen4_int_8 bli_zaxpyf_zen_int_8_avx512
|
||||
#endif
|
||||
#ifndef K_bli_zaxpyv_zen4_int
|
||||
#define K_bli_zaxpyv_zen4_int bli_zaxpyv_zen_int_avx512
|
||||
#endif
|
||||
#ifndef K_bli_zcopyv_zen4_asm
|
||||
#define K_bli_zcopyv_zen4_asm bli_zcopyv_zen4_asm_avx512
|
||||
#endif
|
||||
#ifndef K_bli_zdotv_zen4_asm
|
||||
#define K_bli_zdotv_zen4_asm bli_zdotv_zen4_asm_avx512
|
||||
#endif
|
||||
#ifndef K_bli_zdotv_zen4_int
|
||||
#define K_bli_zdotv_zen4_int bli_zdotv_zen_int_avx512
|
||||
#endif
|
||||
#ifndef K_bli_zgemm_zen4_int_16x4_k1_nn
|
||||
#define K_bli_zgemm_zen4_int_16x4_k1_nn bli_zgemm_16x4_avx512_k1_nn
|
||||
#endif
|
||||
#ifndef K_bli_zscalv_zen4_int
|
||||
#define K_bli_zscalv_zen4_int bli_zscalv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_zsetv_zen_int 1
|
||||
#define K_bli_zsetv_zen_int_avx512 1
|
||||
#ifndef K_bli_zsetv_zen4_int
|
||||
#define K_bli_zsetv_zen4_int bli_zsetv_zen_int_avx512
|
||||
#endif
|
||||
|
||||
// In AOCL 4.2 but interface changed at 5.0
|
||||
#define K_bli_zgemm_4x4_avx2_k1_nn 1
|
||||
#ifndef K_bli_zgemm_zen_int_4x4_k1_nn
|
||||
#define K_bli_zgemm_zen_int_4x4_k1_nn bli_zgemm_4x4_avx2_k1_nn
|
||||
#endif
|
||||
|
||||
#define AOCL_42
|
||||
|
||||
@@ -180,13 +392,19 @@
|
||||
|
||||
#define E_GEMM_COMPUTE
|
||||
|
||||
#define K_bli_dgemm_24x8_avx512_k1_nn 1
|
||||
#define K_bli_zdscalv_zen_int_avx512 1
|
||||
#ifndef K_bli_dgemm_zen4_int_24x8_k1_nn
|
||||
#define K_bli_dgemm_zen4_int_24x8_k1_nn bli_dgemm_24x8_avx512_k1_nn
|
||||
#endif
|
||||
#ifndef K_bli_zdscalv_zen4_int
|
||||
#define K_bli_zdscalv_zen4_int bli_zdscalv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_zgemm_zen4_asm_4x12 1
|
||||
#define K_bli_zgemm_zen_asm_2x6 1
|
||||
|
||||
// In AOCL 4.1 but interface changed at 4.2
|
||||
#define K_bli_dgemm_8x6_avx2_k1_nn 1
|
||||
#ifndef K_bli_dgemm_zen_int_8x6_k1_nn
|
||||
#define K_bli_dgemm_zen_int_8x6_k1_nn bli_dgemm_8x6_avx2_k1_nn
|
||||
#endif
|
||||
|
||||
#define AOCL_41
|
||||
|
||||
@@ -196,7 +414,9 @@
|
||||
#ifdef AOCL_41
|
||||
|
||||
#define K_bli_caxpbyv_zen_int 1
|
||||
#define K_bli_caxpyv_zen_int5 1
|
||||
#ifndef K_bli_caxpyv_zen_int_5
|
||||
#define K_bli_caxpyv_zen_int_5 bli_caxpyv_zen_int5
|
||||
#endif
|
||||
#define K_bli_cgemm_haswell_asm_3x8 1
|
||||
#define K_bli_cgemmsup_rv_zen_asm_1x2 1
|
||||
#define K_bli_cgemmsup_rv_zen_asm_1x4 1
|
||||
@@ -213,68 +433,128 @@
|
||||
#define K_bli_cgemmsup_rv_zen_asm_3x8m 1
|
||||
#define K_bli_cgemmsup_rv_zen_asm_3x8n 1
|
||||
#define K_bli_damaxv_zen_int 1
|
||||
#define K_bli_damaxv_zen_int_avx512 1
|
||||
#ifndef K_bli_damaxv_zen4_int
|
||||
#define K_bli_damaxv_zen4_int bli_damaxv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_daxpbyv_zen_int 1
|
||||
#define K_bli_daxpbyv_zen_int10 1
|
||||
#ifndef K_bli_daxpbyv_zen_int_10
|
||||
#define K_bli_daxpbyv_zen_int_10 bli_daxpbyv_zen_int10
|
||||
#endif
|
||||
#define K_bli_daxpyv_zen_int 1
|
||||
#define K_bli_daxpyv_zen_int10 1
|
||||
#define K_bli_daxpyv_zen_int_avx512 1
|
||||
#ifndef K_bli_daxpyv_zen_int_10
|
||||
#define K_bli_daxpyv_zen_int_10 bli_daxpyv_zen_int10
|
||||
#endif
|
||||
#ifndef K_bli_daxpyv_zen4_int
|
||||
#define K_bli_daxpyv_zen4_int bli_daxpyv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_dcopyv_zen_int 1
|
||||
#define K_bli_ddotv_zen_int 1
|
||||
#define K_bli_ddotv_zen_int10 1
|
||||
#define K_bli_ddotv_zen_int_avx512 1
|
||||
#ifndef K_bli_ddotv_zen_int_10
|
||||
#define K_bli_ddotv_zen_int_10 bli_ddotv_zen_int10
|
||||
#endif
|
||||
#ifndef K_bli_ddotv_zen4_int
|
||||
#define K_bli_ddotv_zen4_int bli_ddotv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_dgemm_haswell_asm_6x8 1
|
||||
#define K_bli_dgemm_zen4_asm_32x6 1
|
||||
#define K_bli_dgemm_zen4_asm_8x24 1
|
||||
#ifndef K_bli_dgemm_zen4_asm_8x24
|
||||
#define K_bli_dgemm_zen4_asm_8x24 bli_dgemm_zen4_asm_8x24
|
||||
#endif
|
||||
#define K_bli_dgemmsup_rd_haswell_asm_6x8m 1
|
||||
#define K_bli_dgemmsup_rd_haswell_asm_6x8n 1
|
||||
#define K_bli_dgemmsup_rv_haswell_asm_6x8m 1
|
||||
#define K_bli_dgemmsup_rv_haswell_asm_6x8n 1
|
||||
#define K_bli_dgemmsup_rv_zen4_asm_24x8m 1
|
||||
#define K_bli_dgemmsup_rv_zen5_asm_24x8m 1
|
||||
#ifndef K_bli_dgemmsup_cv_zen4_asm_24x8m
|
||||
#define K_bli_dgemmsup_cv_zen4_asm_24x8m bli_dgemmsup_rv_zen4_asm_24x8m
|
||||
#endif
|
||||
#ifndef K_bli_dgemmsup_cv_zen5_asm_24x8m
|
||||
#define K_bli_dgemmsup_cv_zen5_asm_24x8m bli_dgemmsup_rv_zen5_asm_24x8m
|
||||
#endif
|
||||
#define K_bli_dgemmtrsm_l_haswell_asm_6x8 1
|
||||
#define K_bli_dgemmtrsm_l_zen4_asm_8x24 1
|
||||
#define K_bli_dgemmtrsm_u_haswell_asm_6x8 1
|
||||
#define K_bli_dgemmtrsm_u_zen4_asm_8x24 1
|
||||
#define K_bli_dnorm2fv_unb_var1_avx2 1
|
||||
#ifndef K_bli_dnorm2fv_zen_int_unb_var1
|
||||
#define K_bli_dnorm2fv_zen_int_unb_var1 bli_dnorm2fv_unb_var1_avx2
|
||||
#endif
|
||||
#define K_bli_dscalv_zen_int 1
|
||||
#define K_bli_dscalv_zen_int10 1
|
||||
#define K_bli_dscalv_zen_int_avx512 1
|
||||
#ifndef K_bli_dscalv_zen_int_10
|
||||
#define K_bli_dscalv_zen_int_10 bli_dscalv_zen_int10
|
||||
#endif
|
||||
#ifndef K_bli_dscalv_zen4_int
|
||||
#define K_bli_dscalv_zen4_int bli_dscalv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_dsetv_zen_int 1
|
||||
#define K_bli_dswapv_zen_int8 1
|
||||
#define K_bli_dznorm2fv_unb_var1_avx2 1
|
||||
#ifndef K_bli_dswapv_zen_int_8
|
||||
#define K_bli_dswapv_zen_int_8 bli_dswapv_zen_int8
|
||||
#endif
|
||||
#ifndef K_bli_dznorm2fv_zen_int_unb_var1
|
||||
#define K_bli_dznorm2fv_zen_int_unb_var1 bli_dznorm2fv_unb_var1_avx2
|
||||
#endif
|
||||
#define K_bli_samaxv_zen_int 1
|
||||
#define K_bli_samaxv_zen_int_avx512 1
|
||||
#ifndef K_bli_samaxv_zen4_int
|
||||
#define K_bli_samaxv_zen4_int bli_samaxv_zen_int_avx512
|
||||
#endif
|
||||
#define K_bli_saxpbyv_zen_int 1
|
||||
#define K_bli_saxpbyv_zen_int10 1
|
||||
#ifndef K_bli_saxpbyv_zen_int_10
|
||||
#define K_bli_saxpbyv_zen_int_10 bli_saxpbyv_zen_int10
|
||||
#endif
|
||||
#define K_bli_saxpyv_zen_int 1
|
||||
#define K_bli_saxpyv_zen_int10 1
|
||||
#define K_bli_saxpyv_zen_int_avx512 1
|
||||
#define K_bli_scnorm2fv_unb_var1_avx2 1
|
||||
#ifndef K_bli_saxpyv_zen_int_10
|
||||
#define K_bli_saxpyv_zen_int_10 bli_saxpyv_zen_int10
|
||||
#endif
|
||||
#ifndef K_bli_saxpyv_zen4_int
|
||||
#define K_bli_saxpyv_zen4_int bli_saxpyv_zen_int_avx512
|
||||
#endif
|
||||
#ifndef K_bli_scnorm2fv_zen_int_unb_var1
|
||||
#define K_bli_scnorm2fv_zen_int_unb_var1 bli_scnorm2fv_unb_var1_avx2
|
||||
#endif
|
||||
#define K_bli_scopyv_zen_int 1
|
||||
#define K_bli_sgemm_haswell_asm_6x16 1
|
||||
#define K_bli_sgemm_skx_asm_32x12_l2 1
|
||||
#define K_bli_sgemmsup_rd_zen_asm_6x16m 1
|
||||
#define K_bli_sgemmsup_rd_zen_asm_6x16n 1
|
||||
#define K_bli_sgemmsup_rd_zen_asm_6x64m_avx512 1
|
||||
#define K_bli_sgemmsup_rd_zen_asm_6x64n_avx512 1
|
||||
#ifndef K_bli_sgemmsup_rd_zen4_asm_6x64m
|
||||
#define K_bli_sgemmsup_rd_zen4_asm_6x64m bli_sgemmsup_rd_zen_asm_6x64m_avx512
|
||||
#endif
|
||||
#ifndef K_bli_sgemmsup_rd_zen4_asm_6x64n
|
||||
#define K_bli_sgemmsup_rd_zen4_asm_6x64n bli_sgemmsup_rd_zen_asm_6x64n_avx512
|
||||
#endif
|
||||
#define K_bli_sgemmsup_rv_zen_asm_6x16m 1
|
||||
#define K_bli_sgemmsup_rv_zen_asm_6x16n 1
|
||||
#define K_bli_sgemmsup_rv_zen_asm_6x64m_avx512 1
|
||||
#define K_bli_sgemmsup_rv_zen_asm_6x64n_avx512 1
|
||||
#ifndef K_bli_sgemmsup_rv_zen4_asm_6x64m
|
||||
#define K_bli_sgemmsup_rv_zen4_asm_6x64m bli_sgemmsup_rv_zen_asm_6x64m_avx512
|
||||
#endif
|
||||
#ifndef K_bli_sgemmsup_rv_zen4_asm_6x64n
|
||||
#define K_bli_sgemmsup_rv_zen4_asm_6x64n bli_sgemmsup_rv_zen_asm_6x64n_avx512
|
||||
#endif
|
||||
#define K_bli_sgemmtrsm_l_haswell_asm_6x16 1
|
||||
#define K_bli_sgemmtrsm_u_haswell_asm_6x16 1
|
||||
#define K_bli_snorm2fv_unb_var1_avx2 1
|
||||
#ifndef K_bli_snorm2fv_zen_int_unb_var1
|
||||
#define K_bli_snorm2fv_zen_int_unb_var1 bli_snorm2fv_unb_var1_avx2
|
||||
#endif
|
||||
#define K_bli_sscalv_zen_int 1
|
||||
#define K_bli_sscalv_zen_int10 1
|
||||
#ifndef K_bli_sscalv_zen_int_10
|
||||
#define K_bli_sscalv_zen_int_10 bli_sscalv_zen_int10
|
||||
#endif
|
||||
#define K_bli_ssetv_zen_int 1
|
||||
#define K_bli_sswapv_zen_int8 1
|
||||
#define K_bli_trsm_small 1
|
||||
#define K_bli_trsm_small_AVX512 1
|
||||
#ifndef K_bli_sswapv_zen_int_8
|
||||
#define K_bli_sswapv_zen_int_8 bli_sswapv_zen_int8
|
||||
#endif
|
||||
#ifndef K_bli_trsm_small_zen
|
||||
#define K_bli_trsm_small_zen bli_trsm_small
|
||||
#endif
|
||||
#ifndef K_bli_trsm_small_zen4
|
||||
#define K_bli_trsm_small_zen4 bli_trsm_small_AVX512
|
||||
#endif
|
||||
#define K_bli_zaxpbyv_zen_int 1
|
||||
#define K_bli_zaxpyv_zen_int5 1
|
||||
#ifndef K_bli_zaxpyv_zen_int_5
|
||||
#define K_bli_zaxpyv_zen_int_5 bli_zaxpyv_zen_int5
|
||||
#endif
|
||||
#define K_bli_zcopyv_zen_int 1
|
||||
#define K_bli_zdscalv_zen_int10 1
|
||||
#ifndef K_bli_zdscalv_zen_int_10
|
||||
#define K_bli_zdscalv_zen_int_10 bli_zdscalv_zen_int10
|
||||
#endif
|
||||
#define K_bli_zgemm_haswell_asm_3x4 1
|
||||
#define K_bli_zgemm_zen4_asm_12x4 1
|
||||
#define K_bli_zgemmsup_cv_zen4_asm_12x1m 1
|
||||
@@ -322,6 +602,6 @@
|
||||
|
||||
// If kernels have been removed, we need to undefine them here.
|
||||
|
||||
#ifdef AOCL_51
|
||||
#undef K_bli_dgemm_zen4_asm_8x24
|
||||
#endif
|
||||
//#ifdef AOCL_51
|
||||
// #undef K_bli_dgemm_zen4_asm_8x24
|
||||
//#endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Portions of this file consist of AI-generated content.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -146,8 +146,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_daddv_zen_int_avx512 kernel.
|
||||
The code structure for bli_daddv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_daddv_zen4_int kernel.
|
||||
The code structure for bli_daddv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 64 --> L64
|
||||
Fringe loops : In blocks of 32 --> L32
|
||||
@@ -157,12 +157,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
#ifdef K_bli_daddv_zen_int_avx512
|
||||
#ifdef K_bli_daddv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daddv_zen_int_avx512_unitStrides,
|
||||
bli_daddv_zen4_int_unitStrides,
|
||||
daddvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daddv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daddv_zen4_int), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(64), // size n, for L64
|
||||
@@ -179,12 +179,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_daddv_zen_int_avx512
|
||||
#ifdef K_bli_daddv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daddv_zen_int_avx512_nonUnitStrides,
|
||||
bli_daddv_zen4_int_nonUnitStrides,
|
||||
daddvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daddv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daddv_zen4_int), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(7), // size n, for LScalar
|
||||
|
||||
@@ -141,8 +141,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_damaxv_zen_int_avx512 kernel.
|
||||
The code structure for bli_damaxv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_damaxv_zen4_int kernel.
|
||||
The code structure for bli_damaxv_zen4_int( ... ) is as follows :
|
||||
|
||||
For unit strides :
|
||||
Main loop : In blocks of 64 --> L64
|
||||
@@ -154,12 +154,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_damaxv_zen_int_avx512
|
||||
#ifdef K_bli_damaxv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_damaxv_zen_int_avx512_unitStrides,
|
||||
bli_damaxv_zen4_int_unitStrides,
|
||||
damaxvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_damaxv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_damaxv_zen4_int), // kernel address
|
||||
::testing::Values(gtint_t(64), // for size n, L64
|
||||
gtint_t(32), // L32
|
||||
gtint_t(16), // L16
|
||||
@@ -174,13 +174,13 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_damaxv_zen_int_avx512
|
||||
#ifdef K_bli_damaxv_zen4_int
|
||||
// Unit testing with non-unit strides.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_damaxv_zen_int_avx512_nonUnitStrides,
|
||||
bli_damaxv_zen4_int_nonUnitStrides,
|
||||
damaxvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_damaxv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_damaxv_zen4_int), // kernel address
|
||||
::testing::Values(gtint_t(10), // n, size of the vector
|
||||
gtint_t(25)),
|
||||
::testing::Values(gtint_t(5)), // incx
|
||||
|
||||
@@ -126,8 +126,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_samaxv_zen_int_avx512 kernel.
|
||||
The code structure for bli_samaxv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_samaxv_zen4_int kernel.
|
||||
The code structure for bli_samaxv_zen4_int( ... ) is as follows :
|
||||
|
||||
For unit strides :
|
||||
Main loop : In blocks of 128 --> L128
|
||||
@@ -139,12 +139,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_samaxv_zen_int_avx512
|
||||
#ifdef K_bli_samaxv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_samaxv_zen_int_avx512_unitStrides,
|
||||
bli_samaxv_zen4_int_unitStrides,
|
||||
samaxvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_samaxv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_samaxv_zen4_int), // kernel address
|
||||
::testing::Values(gtint_t(128), // for size n, L128
|
||||
gtint_t(64), // L64
|
||||
gtint_t(32), // L32
|
||||
@@ -160,12 +160,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides.
|
||||
#ifdef K_bli_samaxv_zen_int_avx512
|
||||
#ifdef K_bli_samaxv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_samaxv_zen_int_avx512_nonUnitStrides,
|
||||
bli_samaxv_zen4_int_nonUnitStrides,
|
||||
samaxvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_samaxv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_samaxv_zen4_int), // kernel address
|
||||
::testing::Values(gtint_t(10), // n, size of the vector
|
||||
gtint_t(25)),
|
||||
::testing::Values(gtint_t(5)), // incx
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -113,8 +113,8 @@ TEST_P( daxpbyvGeneric, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_daxpbyv_zen_int10 kernel.
|
||||
The code structure for bli_daxpbyv_zen_int10( ... ) is as follows :
|
||||
Unit testing for functionality of bli_daxpbyv_zen_int_10 kernel.
|
||||
The code structure for bli_daxpbyv_zen_int_10( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 40 --> L40
|
||||
Fringe loops : In blocks of 20 --> L20
|
||||
@@ -126,12 +126,12 @@ TEST_P( daxpbyvGeneric, UKR )
|
||||
*/
|
||||
|
||||
// Unit testing with unit stride, across all loops.
|
||||
#ifdef K_bli_daxpbyv_zen_int10
|
||||
#ifdef K_bli_daxpbyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpbyv_zen_int10_unitStrides,
|
||||
bli_daxpbyv_zen_int_10_unitStrides,
|
||||
daxpbyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpbyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_daxpbyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(40), // size n, for L40
|
||||
@@ -160,12 +160,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_daxpbyv_zen_int10
|
||||
#ifdef K_bli_daxpbyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpbyv_zen_int10_nonUnitStrides,
|
||||
bli_daxpbyv_zen_int_10_nonUnitStrides,
|
||||
daxpbyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpbyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_daxpbyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(gtint_t(10), // n, size of the vector
|
||||
gtint_t(25)),
|
||||
@@ -244,8 +244,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_daxpbyv_zen_int_avx512 kernel.
|
||||
The code structure for bli_daxpbyv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_daxpbyv_zen4_int kernel.
|
||||
The code structure for bli_daxpbyv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 64 --> L64
|
||||
Fringe loops : In blocks of 32 --> L32
|
||||
@@ -257,12 +257,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*/
|
||||
|
||||
// Unit testing with unit stride, across all loops.
|
||||
#ifdef K_bli_daxpbyv_zen_int_avx512
|
||||
#ifdef K_bli_daxpbyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpbyv_zen_int_avx512_unitStrides,
|
||||
bli_daxpbyv_zen4_int_unitStrides,
|
||||
daxpbyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpbyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daxpbyv_zen4_int), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(64), // size n, for L64
|
||||
@@ -286,12 +286,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_daxpbyv_zen_int_avx512
|
||||
#ifdef K_bli_daxpbyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpbyv_zen_int_avx512_nonUnitStrides,
|
||||
bli_daxpbyv_zen4_int_nonUnitStrides,
|
||||
daxpbyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpbyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daxpbyv_zen4_int), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(gtint_t(10), // n, size of the vector
|
||||
gtint_t(25)),
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -114,8 +114,8 @@ TEST_P( saxpbyvGeneric, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_saxpbyv_zen_int10 kernel.
|
||||
The code structure for bli_saxpbyv_zen_int10( ... ) is as follows :
|
||||
Unit testing for functionality of bli_saxpbyv_zen_int_10 kernel.
|
||||
The code structure for bli_saxpbyv_zen_int_10( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 80 --> L80
|
||||
Fringe loops : In blocks of 40 --> L40
|
||||
@@ -128,12 +128,12 @@ TEST_P( saxpbyvGeneric, UKR )
|
||||
*/
|
||||
|
||||
// Unit testing with unit stride, across all loops.
|
||||
#ifdef K_bli_saxpbyv_zen_int10
|
||||
#ifdef K_bli_saxpbyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_saxpbyv_zen_int10_unitStride,
|
||||
bli_saxpbyv_zen_int_10_unitStride,
|
||||
saxpbyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_saxpbyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_saxpbyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(80), // size n, for L80
|
||||
@@ -161,12 +161,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_saxpbyv_zen_int10
|
||||
#ifdef K_bli_saxpbyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_saxpbyv_zen_int_unitStride,
|
||||
saxpbyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_saxpbyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_saxpbyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(gtint_t(10), // n, size of the vector
|
||||
gtint_t(25)),
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Portions of this file consist of AI-generated content.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -125,15 +125,15 @@ TEST_P( daxpyfGeneric, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_daxpyf_zen_int_avx512 kernel.
|
||||
Unit testing for functionality of bli_daxpyf_zen4_int kernel.
|
||||
*/
|
||||
// Unit testing with unit strides, across all fuse-factors.
|
||||
#ifdef K_bli_daxpyf_zen_int_avx512
|
||||
#ifdef K_bli_daxpyf_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpyf_zen_int_avx512_unitStrides,
|
||||
bli_daxpyf_zen4_int_unitStrides,
|
||||
daxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpyf_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daxpyf_zen4_int), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(gtint_t(1),
|
||||
@@ -167,12 +167,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides, across all fuse-factors.
|
||||
#ifdef K_bli_daxpyf_zen_int_avx512
|
||||
#ifdef K_bli_daxpyf_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpyf_zen_int_avx512_nonUnitStrides,
|
||||
bli_daxpyf_zen4_int_nonUnitStrides,
|
||||
daxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpyf_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daxpyf_zen4_int), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(gtint_t(15), gtint_t(27)), // for size n
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Portions of this file consist of AI-generated content.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -121,12 +121,12 @@ TEST_P( zaxpyfGeneric, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyf_zen_int_8_avx512
|
||||
#ifdef K_bli_zaxpyf_zen4_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyf_zen_int_2_avx512_unitStrides,
|
||||
zaxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
|
||||
::testing::Values('n'
|
||||
#if defined(TEST_BLIS_TYPED)
|
||||
,'c'
|
||||
@@ -156,12 +156,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyf_zen_int_8_avx512
|
||||
#ifdef K_bli_zaxpyf_zen4_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyf_zen_int_2_avx512_nonUnitStrides,
|
||||
zaxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
|
||||
::testing::Values('n'
|
||||
#if defined(TEST_BLIS_TYPED)
|
||||
,'c'
|
||||
@@ -195,12 +195,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyf_zen_int_8_avx512
|
||||
#ifdef K_bli_zaxpyf_zen4_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyf_zen_int_4_avx512_unitStrides,
|
||||
zaxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
|
||||
::testing::Values('n'
|
||||
#if defined(TEST_BLIS_TYPED)
|
||||
,'c'
|
||||
@@ -230,12 +230,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyf_zen_int_8_avx512
|
||||
#ifdef K_bli_zaxpyf_zen4_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyf_zen_int_4_avx512_nonUnitStrides,
|
||||
zaxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
|
||||
::testing::Values('n'
|
||||
#if defined(TEST_BLIS_TYPED)
|
||||
,'c'
|
||||
@@ -259,8 +259,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
/*
|
||||
Unit testing for functionality of bli_zaxpyf_zen_int_8_avx512 kernel.
|
||||
The code structure for bli_zaxpyf_zen_int_8_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_zaxpyf_zen4_int_8 kernel.
|
||||
The code structure for bli_zaxpyf_zen4_int_8( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 8 --> L8
|
||||
Fringe loops : In blocks of 4 --> L4
|
||||
@@ -269,12 +269,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyf_zen_int_8_avx512
|
||||
#ifdef K_bli_zaxpyf_zen4_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyf_zen_int_8_avx512_unitStrides,
|
||||
bli_zaxpyf_zen4_int_8_unitStrides,
|
||||
zaxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
|
||||
::testing::Values('n'
|
||||
#if defined(TEST_BLIS_TYPED)
|
||||
,'c'
|
||||
@@ -304,12 +304,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyf_zen_int_8_avx512
|
||||
#ifdef K_bli_zaxpyf_zen4_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyf_zen_int_8_avx512_nonUnitStrides,
|
||||
bli_zaxpyf_zen4_int_8_nonUnitStrides,
|
||||
zaxpyfGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyf_zen_int_8_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyf_zen4_int_8), // kernel address
|
||||
::testing::Values('n'
|
||||
#if defined(TEST_BLIS_TYPED)
|
||||
,'c'
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Portions of this file consist of AI-generated content.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -94,8 +94,8 @@ TEST_P( caxpyvGeneric, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_caxpyv_zen_int5 kernel.
|
||||
The code structure for bli_caxpyv_zen_int5( ... ) is as follows :
|
||||
Unit testing for functionality of bli_caxpyv_zen_int_5 kernel.
|
||||
The code structure for bli_caxpyv_zen_int_5( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 20 --> L20
|
||||
Fringe loops : In blocks of 8 --> L8
|
||||
@@ -105,12 +105,12 @@ TEST_P( caxpyvGeneric, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_caxpyv_zen_int5
|
||||
#ifdef K_bli_caxpyv_zen_int_5
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_caxpyv_zen_int5_unitStrides,
|
||||
bli_caxpyv_zen_int_5_unitStrides,
|
||||
caxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_caxpyv_zen_int5), // kernel address
|
||||
::testing::Values(K_bli_caxpyv_zen_int_5), // kernel address
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // conjx
|
||||
@@ -138,12 +138,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_caxpyv_zen_int5
|
||||
#ifdef K_bli_caxpyv_zen_int_5
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_caxpyv_zen_int5_nonUnitStrides,
|
||||
bli_caxpyv_zen_int_5_nonUnitStrides,
|
||||
caxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_caxpyv_zen_int5), // kernel address
|
||||
::testing::Values(K_bli_caxpyv_zen_int_5), // kernel address
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // conjx
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -92,8 +92,8 @@ TEST_P( daxpyvGeneric, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_daxpyv_zen_int10 kernel.
|
||||
The code structure for bli_daxpyv_zen_int10( ... ) is as follows :
|
||||
Unit testing for functionality of bli_daxpyv_zen_int_10 kernel.
|
||||
The code structure for bli_daxpyv_zen_int_10( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 52 --> L52
|
||||
Fringe loops : In blocks of 40 --> L40
|
||||
@@ -106,12 +106,12 @@ TEST_P( daxpyvGeneric, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_daxpyv_zen_int10
|
||||
#ifdef K_bli_daxpyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpyv_zen_int10_unitStrides,
|
||||
bli_daxpyv_zen_int_10_unitStrides,
|
||||
daxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_daxpyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(52), // size n, for L52
|
||||
@@ -141,12 +141,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_daxpyv_zen_int10
|
||||
#ifdef K_bli_daxpyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpyv_zen_int10_nonUnitStrides,
|
||||
bli_daxpyv_zen_int_10_nonUnitStrides,
|
||||
daxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_daxpyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(gtint_t(10), // n, size of the vector
|
||||
gtint_t(25)),
|
||||
@@ -163,7 +163,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
/*
|
||||
Unit testing for functionality of bli_daxpyv_zen_int kernel.
|
||||
The code structure for bli_daxpyv_zen_int10( ... ) is as follows :
|
||||
The code structure for bli_daxpyv_zen_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 16 --> L16
|
||||
Element wise loop post all these loops.
|
||||
@@ -216,8 +216,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_daxpyv_zen_int_avx512 kernel.
|
||||
The code structure for bli_daxpyv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_daxpyv_zen4_int kernel.
|
||||
The code structure for bli_daxpyv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 64 --> L64
|
||||
Fringe loops : In blocks of 32 --> L32
|
||||
@@ -229,12 +229,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_daxpyv_zen_int_avx512
|
||||
#ifdef K_bli_daxpyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpyv_zen_int_avx512_unitStrides,
|
||||
bli_daxpyv_zen4_int_unitStrides,
|
||||
daxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daxpyv_zen4_int),// kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(64), // size n, for L64
|
||||
@@ -262,12 +262,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_daxpyv_zen_int_avx512
|
||||
#ifdef K_bli_daxpyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_daxpyv_zen_int_avx512_nonUnitStrides,
|
||||
bli_daxpyv_zen4_int_nonUnitStrides,
|
||||
daxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_daxpyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_daxpyv_zen4_int),// kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(gtint_t(10), // n, size of the vector
|
||||
gtint_t(25)),
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Portions of this file consist of AI-generated content.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -84,8 +84,8 @@ TEST_P( saxpyvGeneric, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_saxpyv_zen_int10 kernel.
|
||||
The code structure for bli_saxpyv_zen_int10( ... ) is as follows :
|
||||
Unit testing for functionality of bli_saxpyv_zen_int_10 kernel.
|
||||
The code structure for bli_saxpyv_zen_int_10( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 120 --> L120
|
||||
Fringe loops : In blocks of 80 --> L80
|
||||
@@ -98,12 +98,12 @@ TEST_P( saxpyvGeneric, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
|
||||
#ifdef K_bli_saxpyv_zen_int10
|
||||
#ifdef K_bli_saxpyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_saxpyv_zen_int10_unitStrides,
|
||||
bli_saxpyv_zen_int_10_unitStrides,
|
||||
saxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_saxpyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_saxpyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(120), // size n, for L120
|
||||
@@ -128,12 +128,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_saxpyv_zen_int10
|
||||
#ifdef K_bli_saxpyv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_saxpyv_zen_int10_nonUnitStrides,
|
||||
bli_saxpyv_zen_int_10_nonUnitStrides,
|
||||
saxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_saxpyv_zen_int10), // kernel address
|
||||
::testing::Values(K_bli_saxpyv_zen_int_10), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(7), // size n, for LScalar
|
||||
@@ -205,8 +205,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_saxpyv_zen_int_avx512 kernel.
|
||||
The code structure for bli_saxpyv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_saxpyv_zen4_int kernel.
|
||||
The code structure for bli_saxpyv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 128 --> L128
|
||||
Fringe loops : In blocks of 64 --> L64
|
||||
@@ -218,12 +218,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
|
||||
#ifdef K_bli_saxpyv_zen_int_avx512
|
||||
#ifdef K_bli_saxpyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_saxpyv_zen_int_avx512_unitStrides,
|
||||
bli_saxpyv_zen4_int_unitStrides,
|
||||
saxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_saxpyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_saxpyv_zen4_int), // kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(128), // size n, for L128
|
||||
@@ -244,12 +244,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_saxpyv_zen_int_avx512
|
||||
#ifdef K_bli_saxpyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_saxpyv_zen_int_avx512_nonUnitStrides,
|
||||
bli_saxpyv_zen4_int_nonUnitStrides,
|
||||
saxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_saxpyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_saxpyv_zen4_int),// kernel address
|
||||
::testing::Values('n'), // use x, not conj(x) (since it is real)
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(7), // size n, for LScalar
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Portions of this file consist of AI-generated content.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -94,7 +94,7 @@ TEST_P( zaxpyvGeneric, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_zaxpyv_zen_int5 kernel.
|
||||
Unit testing for functionality of bli_zaxpyv_zen_int_5 kernel.
|
||||
The code structure for bli_zaxpyv_zen_int_5( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 14 --> L14
|
||||
@@ -107,12 +107,12 @@ TEST_P( zaxpyvGeneric, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyv_zen_int5
|
||||
#ifdef K_bli_zaxpyv_zen_int_5
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyv_zen_int5_unitStrides,
|
||||
bli_zaxpyv_zen_int_5_unitStrides,
|
||||
zaxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyv_zen_int5), // kernel address
|
||||
::testing::Values(K_bli_zaxpyv_zen_int_5), // kernel address
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // conjx
|
||||
@@ -144,12 +144,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_zaxpyv_zen_int5
|
||||
#ifdef K_bli_zaxpyv_zen_int_5
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyv_zen_int5_nonUnitStrides,
|
||||
bli_zaxpyv_zen_int_5_nonUnitStrides,
|
||||
zaxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyv_zen_int5), // kernel address
|
||||
::testing::Values(K_bli_zaxpyv_zen_int_5), // kernel address
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // conjx
|
||||
@@ -171,8 +171,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_zaxpyv_zen_int_avx512 kernel.
|
||||
The code structure for bli_zaxpyv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_zaxpyv_zen4_int kernel.
|
||||
The code structure for bli_zaxpyv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 32 --> L32
|
||||
Fringe loops : In blocks of 16 --> L16
|
||||
@@ -183,12 +183,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_zaxpyv_zen_int_avx512
|
||||
#ifdef K_bli_zaxpyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyv_zen_int_avx512_unitStrides,
|
||||
bli_zaxpyv_zen4_int_unitStrides,
|
||||
zaxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyv_zen4_int), // kernel address
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // conjx
|
||||
@@ -219,12 +219,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing for non unit strides
|
||||
#ifdef K_bli_zaxpyv_zen_int_avx512
|
||||
#ifdef K_bli_zaxpyv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zaxpyv_zen_int_avx512_nonUnitStrides,
|
||||
bli_zaxpyv_zen4_int_nonUnitStrides,
|
||||
zaxpyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zaxpyv_zen_int_avx512), // kernel address
|
||||
::testing::Values(K_bli_zaxpyv_zen4_int), // kernel address
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // conjx
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -136,8 +136,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_dcopyv_zen4_asm_avx512 kernel.
|
||||
The code structure for bli_dcopyv_zen4_asm_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dcopyv_zen4_asm kernel.
|
||||
The code structure for bli_dcopyv_zen4_asm( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 128 --> L128
|
||||
Fringe loops : In blocks of 64 --> L64
|
||||
@@ -149,12 +149,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dcopyv_zen4_asm_avx512
|
||||
#ifdef K_bli_dcopyv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dcopyv_zen4_asm_avx512_unitStrides,
|
||||
bli_dcopyv_zen4_asm_unitStrides,
|
||||
dcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dcopyv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_dcopyv_zen4_asm),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(128), // size n, for L128
|
||||
@@ -179,12 +179,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with Non-Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dcopyv_zen4_asm_avx512
|
||||
#ifdef K_bli_dcopyv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dcopyv_zen4_asm_avx512_nonUnitStrides,
|
||||
bli_dcopyv_zen4_asm_nonUnitStrides,
|
||||
dcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dcopyv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_dcopyv_zen4_asm),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
|
||||
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
|
||||
::testing::Values(gtint_t(5)), // stride size for x
|
||||
@@ -196,8 +196,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
/*
|
||||
Unit testing for functionality of bli_dcopyv_zen4_asm_avx512_biway kernel.
|
||||
The code structure for bli_dcopyv_zen4_asm_avx512_biway( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dcopyv_zen4_asm_biway kernel.
|
||||
The code structure for bli_dcopyv_zen4_asm_biway( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 128 --> L128
|
||||
Fringe loops : In blocks of 64 --> L64
|
||||
@@ -209,12 +209,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dcopyv_zen4_asm_avx512_biway
|
||||
#ifdef K_bli_dcopyv_zen4_asm_biway
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dcopyv_zen4_asm_avx512_biway_unitStrides,
|
||||
bli_dcopyv_zen4_asm_biway_unitStrides,
|
||||
dcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dcopyv_zen4_asm_avx512_biway),
|
||||
::testing::Values(bli_dcopyv_zen4_asm_biway),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(128), // size n, for L128
|
||||
@@ -239,12 +239,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with Non-Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dcopyv_zen4_asm_avx512_biway
|
||||
#ifdef K_bli_dcopyv_zen4_asm_biway
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dcopyv_zen4_asm_avx512_biway_nonUnitStrides,
|
||||
bli_dcopyv_zen4_asm_biway_nonUnitStrides,
|
||||
dcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dcopyv_zen4_asm_avx512_biway),
|
||||
::testing::Values(bli_dcopyv_zen4_asm_biway),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
|
||||
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
|
||||
::testing::Values(gtint_t(5)), // stride size for x
|
||||
@@ -258,8 +258,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_dcopyv_zen4_asm_avx512 kernel.
|
||||
The code structure for bli_dcopyv_zen5_asm_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dcopyv_zen5_asm kernel.
|
||||
The code structure for bli_dcopyv_zen5_asm( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 128 --> L128
|
||||
Fringe loops : In blocks of 64 --> L64
|
||||
@@ -271,12 +271,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dcopyv_zen5_asm_avx512
|
||||
#ifdef K_bli_dcopyv_zen5_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dcopyv_zen5_asm_avx512_unitStrides,
|
||||
bli_dcopyv_zen5_asm_unitStrides,
|
||||
dcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dcopyv_zen5_asm_avx512),
|
||||
::testing::Values(K_bli_dcopyv_zen5_asm),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(128), // size n, for L128
|
||||
@@ -301,12 +301,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with Non-Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dcopyv_zen5_asm_avx512
|
||||
#ifdef K_bli_dcopyv_zen5_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dcopyv_zen5_asm_avx512_nonUnitStrides,
|
||||
bli_dcopyv_zen5_asm_nonUnitStrides,
|
||||
dcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dcopyv_zen5_asm_avx512),
|
||||
::testing::Values(K_bli_dcopyv_zen5_asm),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for dcopyv
|
||||
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
|
||||
::testing::Values(gtint_t(5)), // stride size for x
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -136,8 +136,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_scopyv_zen4_asm_avx512 kernel.
|
||||
The code structure for bli_scopyv_zen4_asm_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_scopyv_zen4_asm kernel.
|
||||
The code structure for bli_scopyv_zen4_asm( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 512 --> L512
|
||||
Fringe loops : In blocks of 256 --> L256
|
||||
@@ -150,12 +150,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_scopyv_zen4_asm_avx512
|
||||
#ifdef K_bli_scopyv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_scopyv_zen4_asm_avx512_unitStrides,
|
||||
bli_scopyv_zen4_asm_unitStrides,
|
||||
scopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_scopyv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_scopyv_zen4_asm),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for scopyv
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(512), // size n, for L512
|
||||
@@ -182,12 +182,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with Non-Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_scopyv_zen4_asm_avx512
|
||||
#ifdef K_bli_scopyv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_scopyv_zen4_asm_avx512_nonUnitStrides,
|
||||
bli_scopyv_zen4_asm_nonUnitStrides,
|
||||
scopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_scopyv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_scopyv_zen4_asm),
|
||||
::testing::Values('n'), // conjugate parameter, 'n' for scopyv
|
||||
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
|
||||
::testing::Values(gtint_t(5)), // stride size for x
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -141,8 +141,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_zcopyv_zen4_asm_avx512 kernel.
|
||||
The code structure for bli_zcopyv_zen4_asm_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_zcopyv_zen4_asm kernel.
|
||||
The code structure for bli_zcopyv_zen4_asm( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 128 --> L128
|
||||
Fringe loops : In blocks of 64 --> L64
|
||||
@@ -155,12 +155,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_zcopyv_zen4_asm_avx512
|
||||
#ifdef K_bli_zcopyv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zcopyv_zen4_asm_avx512_unitStrides,
|
||||
bli_zcopyv_zen4_asm_unitStrides,
|
||||
zcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zcopyv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_zcopyv_zen4_asm),
|
||||
::testing::Values('n' // n: use x, c: use conj(x)
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
@@ -191,12 +191,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with Non-Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_zcopyv_zen4_asm_avx512
|
||||
#ifdef K_bli_zcopyv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zcopyv_zen4_asm_avx512_nonUnitStrides,
|
||||
bli_zcopyv_zen4_asm_nonUnitStrides,
|
||||
zcopyvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zcopyv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_zcopyv_zen4_asm),
|
||||
::testing::Values('n' // n: use x, c: use conj(x)
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
|
||||
@@ -163,7 +163,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
// Tests for bli_ddotv_zen_int10 (AVX2) kernel.
|
||||
// Tests for bli_ddotv_zen_int_10 (AVX2) kernel.
|
||||
/**
|
||||
* Loops:
|
||||
* L20 - Main loop, handles 20 elements
|
||||
@@ -174,12 +174,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*
|
||||
* LNUnit - loop for non-unit increments
|
||||
*/
|
||||
#ifdef K_bli_ddotv_zen_int10
|
||||
#ifdef K_bli_ddotv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_ddotv_zen_int10_unitStride,
|
||||
bli_ddotv_zen_int_10_unitStride,
|
||||
ddotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_ddotv_zen_int10),
|
||||
::testing::Values(K_bli_ddotv_zen_int_10),
|
||||
// conj(x): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// conj(y): uses n (no_conjugate) since it is real.
|
||||
@@ -220,12 +220,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_ddotv_zen_int10
|
||||
#ifdef K_bli_ddotv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_ddotv_zen_int10_nonUnitPositiveStrides,
|
||||
bli_ddotv_zen_int_10_nonUnitPositiveStrides,
|
||||
ddotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_ddotv_zen_int10),
|
||||
::testing::Values(K_bli_ddotv_zen_int_10),
|
||||
// conj(x): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// conj(y): uses n (no_conjugate) since it is real.
|
||||
@@ -258,7 +258,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
// Tests for bli_ddotv_zen_int_avx512 (AVX512) kernel.
|
||||
// Tests for bli_ddotv_zen4_int (AVX512) kernel.
|
||||
/**
|
||||
* Loops & If conditions:
|
||||
* L40 - Main loop, handles 40 elements
|
||||
@@ -268,12 +268,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*
|
||||
* LNUnit - loop for non-unit increments
|
||||
*/
|
||||
#ifdef K_bli_ddotv_zen_int_avx512
|
||||
#ifdef K_bli_ddotv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_ddotv_zen_int_avx512_unitStride,
|
||||
bli_ddotv_zen4_int_unitStride,
|
||||
ddotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_ddotv_zen_int_avx512),
|
||||
::testing::Values(K_bli_ddotv_zen4_int),
|
||||
// conj(x): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// conj(y): uses n (no_conjugate) since it is real.
|
||||
@@ -313,12 +313,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_ddotv_zen_int_avx512
|
||||
#ifdef K_bli_ddotv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_ddotv_zen_int_avx512_nonUnitPositiveStrides,
|
||||
bli_ddotv_zen4_int_nonUnitPositiveStrides,
|
||||
ddotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_ddotv_zen_int_avx512),
|
||||
::testing::Values(K_bli_ddotv_zen4_int),
|
||||
// conj(x): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// conj(y): uses n (no_conjugate) since it is real.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -90,7 +90,7 @@ TEST_P( zdotvGeneric, UKR )
|
||||
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
// Tests for bli_zdotv_zen_int_avx512 (AVX512) kernel.
|
||||
// Tests for bli_zdotv_zen4_int (AVX512) kernel.
|
||||
/**
|
||||
* Loops & If conditions:
|
||||
* L32 - Main loop, handles 32 elements
|
||||
@@ -102,12 +102,12 @@ TEST_P( zdotvGeneric, UKR )
|
||||
*
|
||||
* LNUnit - loop for non-unit increments
|
||||
*/
|
||||
#ifdef K_bli_zdotv_zen_int_avx512
|
||||
#ifdef K_bli_zdotv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zdotv_zen_int_avx512_unitStride,
|
||||
bli_zdotv_zen4_int_unitStride,
|
||||
zdotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdotv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zdotv_zen4_int),
|
||||
// conj(x): use n (no_conjugate) or c (conjugate).
|
||||
::testing::Values('n', 'c'),
|
||||
// conj(y): use n (no_conjugate) or c (conjugate).
|
||||
@@ -147,12 +147,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_zdotv_zen_int_avx512
|
||||
#ifdef K_bli_zdotv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zdotv_zen_int_avx512_nonUnitPositiveStrides,
|
||||
bli_zdotv_zen4_int_nonUnitPositiveStrides,
|
||||
zdotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdotv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zdotv_zen4_int),
|
||||
// conj(x): uses n (no_conjugate) only in this suite.
|
||||
::testing::Values('n'),
|
||||
// conj(y): uses n (no_conjugate).
|
||||
@@ -176,7 +176,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
// Tests for bli_zdotv_zen_int_avx512 (AVX512) kernel.
|
||||
// Tests for bli_zdotv_zen4_int (AVX512) kernel.
|
||||
/**
|
||||
* Loops & If conditions:
|
||||
* L32 - Main loop, handles 32 elements
|
||||
@@ -188,12 +188,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*
|
||||
* LNUnit - loop for non-unit increments
|
||||
*/
|
||||
#ifdef K_bli_zdotv_zen4_asm_avx512
|
||||
#ifdef K_bli_zdotv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
DISABLED_bli_zdotv_zen4_asm_avx512_unitStride,
|
||||
DISABLED_bli_zdotv_zen4_asm_unitStride,
|
||||
zdotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdotv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_zdotv_zen4_asm),
|
||||
// conj(x): use n (no_conjugate) or c (conjugate).
|
||||
::testing::Values('n', 'c'),
|
||||
// conj(y): use n (no_conjugate) or c (conjugate).
|
||||
@@ -233,12 +233,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_zdotv_zen4_asm_avx512
|
||||
#ifdef K_bli_zdotv_zen4_asm
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zdotv_zen4_asm_avx512_nonUnitPositiveStrides,
|
||||
bli_zdotv_zen4_asm_nonUnitPositiveStrides,
|
||||
zdotvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdotv_zen4_asm_avx512),
|
||||
::testing::Values(K_bli_zdotv_zen4_asm),
|
||||
// conj(x): uses n (no_conjugate) only in this suite.
|
||||
::testing::Values('n'),
|
||||
// conj(y): uses n (no_conjugate).
|
||||
|
||||
@@ -1643,9 +1643,9 @@ public:
|
||||
};
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
#ifdef K_bli_cgemm_32x4_avx512_k1_nn
|
||||
#ifdef K_bli_cgemm_zen4_int_32x4_k1_nn
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_cgemm_32x4_avx512_k1_nn,
|
||||
bli_cgemm_zen4_int_32x4_k1_nn,
|
||||
cgemmUkrk1,
|
||||
::testing::Combine(
|
||||
|
||||
@@ -1656,7 +1656,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('c'), // storage
|
||||
::testing::Range(gtint_t(1), gtint_t(65), 1), // values of m
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
|
||||
::testing::Values(bli_cgemm_32x4_avx512_k1_nn),
|
||||
::testing::Values(K_bli_cgemm_zen4_int_32x4_k1_nn),
|
||||
::testing::Values(true, false) // memory test
|
||||
),
|
||||
::cgemmUkrk1Print()
|
||||
|
||||
@@ -271,9 +271,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
|
||||
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m
|
||||
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_col_stored_c,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_col_stored_c,
|
||||
dgemmGenericSUP,
|
||||
::testing::Combine(
|
||||
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
|
||||
@@ -282,7 +282,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('c'), // storage of c
|
||||
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(gtint_t(8)), // Micro kernel block MR
|
||||
::testing::Values('n'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -293,9 +293,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m
|
||||
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_row_stored_c,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_row_stored_c,
|
||||
dgemmGenericSUP,
|
||||
::testing::Combine(
|
||||
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
|
||||
@@ -304,7 +304,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r'), // storage of c
|
||||
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(gtint_t(8)), // Micro kernel block MR
|
||||
::testing::Values('t'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -315,9 +315,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m_new
|
||||
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m_new
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_new_col_stored_c,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_new_col_stored_c,
|
||||
dgemmGenericSUP,
|
||||
::testing::Combine(
|
||||
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
|
||||
@@ -326,7 +326,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('c'), // storage of c
|
||||
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m_new), // dgemm_sup kernel
|
||||
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m_new), // dgemm_sup kernel
|
||||
::testing::Values(gtint_t(8)), // Micro kernel block MR
|
||||
::testing::Values('n'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -337,9 +337,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemmsup_rv_zen4_asm_24x8m_new
|
||||
#ifdef K_bli_dgemmsup_cv_zen4_asm_24x8m_new
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemmsup_rv_zen4_asm_24x8m_new_row_stored_c,
|
||||
bli_dgemmsup_cv_zen4_asm_24x8m_new_row_stored_c,
|
||||
dgemmGenericSUP,
|
||||
::testing::Combine(
|
||||
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
|
||||
@@ -348,7 +348,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r'), // storage of c
|
||||
::testing::Values(bli_dgemmsup_rv_zen4_asm_24x8m_new), // dgemm_sup kernel
|
||||
::testing::Values(K_bli_dgemmsup_cv_zen4_asm_24x8m_new), // dgemm_sup kernel
|
||||
::testing::Values(gtint_t(8)), // Micro kernel block MR
|
||||
::testing::Values('t'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -363,9 +363,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
|
||||
|
||||
#ifdef K_bli_dgemmsup_rv_zen5_asm_24x8m
|
||||
#ifdef K_bli_dgemmsup_cv_zen5_asm_24x8m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemmsup_rv_zen5_asm_24x8m_col_stored_c,
|
||||
bli_dgemmsup_cv_zen5_asm_24x8m_col_stored_c,
|
||||
dgemmGenericSUP,
|
||||
::testing::Combine(
|
||||
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
|
||||
@@ -374,7 +374,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('c'), // storage of c
|
||||
::testing::Values(bli_dgemmsup_rv_zen5_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(K_bli_dgemmsup_cv_zen5_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(gtint_t(8)), // Micro kernel block MR
|
||||
::testing::Values('n'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -385,9 +385,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemmsup_rv_zen5_asm_24x8m
|
||||
#ifdef K_bli_dgemmsup_cv_zen5_asm_24x8m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemmsup_rv_zen5_asm_24x8m_row_stored_c,
|
||||
bli_dgemmsup_cv_zen5_asm_24x8m_row_stored_c,
|
||||
dgemmGenericSUP,
|
||||
::testing::Combine(
|
||||
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
|
||||
@@ -396,7 +396,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r'), // storage of c
|
||||
::testing::Values(bli_dgemmsup_rv_zen5_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(K_bli_dgemmsup_cv_zen5_asm_24x8m), // dgemm_sup kernel
|
||||
::testing::Values(gtint_t(8)), // Micro kernel block MR
|
||||
::testing::Values('t'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -510,26 +510,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemm_avx512_asm_8x24
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemm_avx512_asm_8x24,
|
||||
dgemmGenericNat,
|
||||
::testing::Combine(
|
||||
::testing::Range(gtint_t(0), gtint_t(17), 1), // values of k
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r', 'c'), // storage
|
||||
::testing::Values(8), // values of m
|
||||
::testing::Values(24), // values of n
|
||||
::testing::Values(bli_dgemm_avx512_asm_8x24),
|
||||
::testing::Values(true, false) // memory test
|
||||
),
|
||||
::dgemmGenericNatPrint()
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemm_zen4_asm_8x24
|
||||
// Old version of bli_dgemm_avx512_asm_8x24 kernel, removed in 5.1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemm_zen4_asm_8x24,
|
||||
dgemmGenericNat,
|
||||
@@ -540,7 +521,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('r', 'c'), // storage
|
||||
::testing::Values(8), // values of m
|
||||
::testing::Values(24), // values of n
|
||||
::testing::Values(bli_dgemm_zen4_asm_8x24),
|
||||
::testing::Values(K_bli_dgemm_zen4_asm_8x24),
|
||||
::testing::Values(true, false) // memory test
|
||||
),
|
||||
::dgemmGenericNatPrint()
|
||||
@@ -665,9 +646,9 @@ public:
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
|
||||
#ifdef K_bli_dgemm_24x8_avx512_k1_nn
|
||||
#ifdef K_bli_dgemm_zen4_int_24x8_k1_nn
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemm_24x8_avx512_k1_nn,
|
||||
bli_dgemm_zen4_int_24x8_k1_nn,
|
||||
dgemmGenericK1,
|
||||
::testing::Combine(
|
||||
|
||||
@@ -676,7 +657,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('c'), // storage
|
||||
::testing::Range(gtint_t(1), gtint_t(25), 1), // values of m
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
|
||||
::testing::Values(bli_dgemm_24x8_avx512_k1_nn),
|
||||
::testing::Values(K_bli_dgemm_zen4_int_24x8_k1_nn),
|
||||
::testing::Values(true, false) // memory test
|
||||
),
|
||||
::dgemmGenericK1Print()
|
||||
@@ -688,9 +669,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
|
||||
#ifdef K_bli_dgemm_8x6_avx2_k1_nn
|
||||
#ifdef K_bli_dgemm_zen_int_8x6_k1_nn
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemm_8x6_avx2_k1_nn,
|
||||
bli_dgemm_zen_int_8x6_k1_nn,
|
||||
dgemmGenericK1,
|
||||
::testing::Combine(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
@@ -698,7 +679,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('c'), // storage
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of m
|
||||
::testing::Range(gtint_t(1), gtint_t(7), 1), // values of n
|
||||
::testing::Values(bli_dgemm_8x6_avx2_k1_nn),
|
||||
::testing::Values(K_bli_dgemm_zen_int_8x6_k1_nn),
|
||||
::testing::Values(true, false) // memory test
|
||||
),
|
||||
::dgemmGenericK1Print()
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -270,7 +270,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
|
||||
#ifdef K_bli_sgemmsup_rv_zen_asm_6x64m_avx512
|
||||
#ifdef K_bli_sgemmsup_rv_zen4_asm_6x64m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sgemmsup_rv_zen_asm_6x64m_row_stored_c,
|
||||
sgemmGenericSUP,
|
||||
@@ -281,7 +281,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r'), // storage of c
|
||||
::testing::Values(bli_sgemmsup_rv_zen_asm_6x64m_avx512), // sgemm_sup kernel
|
||||
::testing::Values(K_bli_sgemmsup_rv_zen4_asm_6x64m), // sgemm_sup kernel
|
||||
::testing::Values(gtint_t(6)), // Micro kernel block MR
|
||||
::testing::Values('t'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -292,7 +292,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_sgemmsup_rv_zen_asm_6x64m_avx512
|
||||
#ifdef K_bli_sgemmsup_rv_zen4_asm_6x64m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sgemmsup_rv_zen_asm_6x64m_col_stored_c,
|
||||
sgemmGenericSUP,
|
||||
@@ -303,7 +303,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('c'), // storage of c
|
||||
::testing::Values(bli_sgemmsup_rv_zen_asm_6x64m_avx512), // sgemm_sup_kernel
|
||||
::testing::Values(K_bli_sgemmsup_rv_zen4_asm_6x64m), // sgemm_sup_kernel
|
||||
::testing::Values(gtint_t(6)), // Micro kernel block MR
|
||||
::testing::Values('n'), // transa
|
||||
::testing::Values('t'), // transb
|
||||
@@ -315,7 +315,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
/*
|
||||
The bli_sgemmsup_rd_zen_asm_6x64m_avx512(standalone), accepts inputs with the
|
||||
The bli_sgemmsup_rd_zen4_asm_6x64m(standalone), accepts inputs with the
|
||||
following contingency for n.
|
||||
n <= NR, where NR is 64
|
||||
The code structure for the sgemm_sup rd kernels (m-var) is as follows:
|
||||
@@ -336,7 +336,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*/
|
||||
|
||||
// Checking with row storage of C
|
||||
#ifdef K_bli_sgemmsup_rd_zen_asm_6x64m_avx512
|
||||
#ifdef K_bli_sgemmsup_rd_zen4_asm_6x64m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sgemmsup_rd_zen_asm_6x64m_row_stored_c,
|
||||
sgemmGenericSUP,
|
||||
@@ -357,7 +357,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r'), // storage of c
|
||||
::testing::Values(bli_sgemmsup_rd_zen_asm_6x64m_avx512), // sgemm_sup_kernel
|
||||
::testing::Values(K_bli_sgemmsup_rd_zen4_asm_6x64m), // sgemm_sup_kernel
|
||||
::testing::Values(gtint_t(6)), // Micro kernel block MR
|
||||
::testing::Values('n'), // transa, has to be N for row storage
|
||||
::testing::Values('t'), // transb, has to be T for row storage
|
||||
@@ -371,7 +371,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// Checking with col storage of C
|
||||
// NOTE : Since we are inducing transpose at operation level, for code coverage, we
|
||||
// have to interchange m and n instantiations
|
||||
#ifdef K_bli_sgemmsup_rd_zen_asm_6x64m_avx512
|
||||
#ifdef K_bli_sgemmsup_rd_zen4_asm_6x64m
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sgemmsup_rd_zen_asm_6x64m_col_stored_c,
|
||||
sgemmGenericSUP,
|
||||
@@ -392,7 +392,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('c'), // storage of c
|
||||
::testing::Values(bli_sgemmsup_rd_zen_asm_6x64m_avx512), // sgemm_sup_kernel
|
||||
::testing::Values(K_bli_sgemmsup_rd_zen4_asm_6x64m), // sgemm_sup_kernel
|
||||
::testing::Values(gtint_t(6)), // Micro kernel block MR
|
||||
::testing::Values('t'), // transa, has to be T for row storage
|
||||
::testing::Values('n'), // transb, has to be N for row storage
|
||||
@@ -403,7 +403,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_sgemmsup_rv_zen_asm_6x64n_avx512
|
||||
#ifdef K_bli_sgemmsup_rv_zen4_asm_6x64n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sgemmsup_rv_zen_asm_6x64n_row_stored_c,
|
||||
sgemmGenericSUP,
|
||||
@@ -414,7 +414,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r'), // storage of c
|
||||
::testing::Values(bli_sgemmsup_rv_zen_asm_6x64n_avx512), // sgemm_sup_kernel
|
||||
::testing::Values(K_bli_sgemmsup_rv_zen4_asm_6x64n), // sgemm_sup_kernel
|
||||
::testing::Values(gtint_t(6)), // Micro kernel block MR
|
||||
::testing::Values('t'), // transa
|
||||
::testing::Values('n'), // transb
|
||||
@@ -425,7 +425,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_sgemmsup_rd_zen_asm_6x64n_avx512
|
||||
#ifdef K_bli_sgemmsup_rd_zen4_asm_6x64n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sgemmsup_rd_zen_asm_6x64n_row_stored_c,
|
||||
sgemmGenericSUP,
|
||||
@@ -436,7 +436,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(2.0, 1.0, -1.0), // alpha value
|
||||
::testing::Values(1.0, 0.0, -1.0, 2.3), // beta value
|
||||
::testing::Values('r'), // storage of c
|
||||
::testing::Values(bli_sgemmsup_rd_zen_asm_6x64n_avx512), // sgemm_sup_kernel
|
||||
::testing::Values(K_bli_sgemmsup_rd_zen4_asm_6x64n), // sgemm_sup_kernel
|
||||
::testing::Values(gtint_t(6)), // Micro kernel block MR
|
||||
::testing::Values('n'), // transa
|
||||
::testing::Values('t'), // transb
|
||||
|
||||
@@ -1816,9 +1816,9 @@ public:
|
||||
};
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
#ifdef K_bli_zgemm_16x4_avx512_k1_nn
|
||||
#ifdef K_bli_zgemm_zen4_int_16x4_k1_nn
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zgemm_16x4_avx512_k1_nn,
|
||||
bli_zgemm_zen4_int_16x4_k1_nn,
|
||||
zgemmUkrk1,
|
||||
::testing::Combine(
|
||||
|
||||
@@ -1829,7 +1829,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('c'), // storage
|
||||
::testing::Range(gtint_t(1), gtint_t(33), 1), // values of m
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
|
||||
::testing::Values(bli_zgemm_16x4_avx512_k1_nn),
|
||||
::testing::Values(K_bli_zgemm_zen4_int_16x4_k1_nn),
|
||||
::testing::Values(true, false) // memory test
|
||||
),
|
||||
::zgemmUkrk1Print()
|
||||
@@ -1838,9 +1838,9 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
#ifdef K_bli_zgemm_4x4_avx2_k1_nn
|
||||
#ifdef K_bli_zgemm_zen_int_4x4_k1_nn
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zgemm_4x4_avx2_k1_nn,
|
||||
bli_zgemm_zen_int_4x4_k1_nn,
|
||||
zgemmUkrk1,
|
||||
::testing::Combine(
|
||||
::testing::Values(dcomplex{1.0, 0.0}, dcomplex{-1.0, 0.0},
|
||||
@@ -1850,7 +1850,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('c'), // storage
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of m
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1), // values of n
|
||||
::testing::Values(bli_zgemm_4x4_avx2_k1_nn),
|
||||
::testing::Values(K_bli_zgemm_zen_int_4x4_k1_nn),
|
||||
::testing::Values(true, false) // memory test
|
||||
),
|
||||
::zgemmUkrk1Print()
|
||||
|
||||
@@ -687,14 +687,14 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// -------------------------------
|
||||
static dgemv_ker_ft_conja m_ker_fp[8] =
|
||||
{
|
||||
bli_dgemv_n_zen_int_16mx1_avx512, // n = 1
|
||||
bli_dgemv_n_zen_int_16mx2_avx512, // n = 2
|
||||
bli_dgemv_n_zen_int_16mx3_avx512, // n = 3
|
||||
bli_dgemv_n_zen_int_16mx4_avx512, // n = 4
|
||||
bli_dgemv_n_zen_int_16mx5_avx512, // n = 5
|
||||
bli_dgemv_n_zen_int_16mx6_avx512, // n = 6
|
||||
bli_dgemv_n_zen_int_16mx7_avx512, // n = 7
|
||||
bli_dgemv_n_zen_int_16mx8_avx512, // n = 8; base kernel
|
||||
bli_dgemv_n_zen4_int_16mx1, // n = 1
|
||||
bli_dgemv_n_zen4_int_16mx2, // n = 2
|
||||
bli_dgemv_n_zen4_int_16mx3, // n = 3
|
||||
bli_dgemv_n_zen4_int_16mx4, // n = 4
|
||||
bli_dgemv_n_zen4_int_16mx5, // n = 5
|
||||
bli_dgemv_n_zen4_int_16mx6, // n = 6
|
||||
bli_dgemv_n_zen4_int_16mx7, // n = 7
|
||||
bli_dgemv_n_zen4_int_16mx8, // n = 8; base kernel
|
||||
};
|
||||
|
||||
#define DGEMV_TEST_M(N) \
|
||||
@@ -721,45 +721,45 @@ static dgemv_ker_ft_conja m_ker_fp[8] =
|
||||
), \
|
||||
(::gemvUKRPrint<double, dgemv_ker_ft_conja>()) \
|
||||
);
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx8_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx8
|
||||
DGEMV_TEST_M(8)
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx7_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx7
|
||||
DGEMV_TEST_M(7)
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx6_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx6
|
||||
DGEMV_TEST_M(6)
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx5_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx5
|
||||
DGEMV_TEST_M(5)
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx4_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx4
|
||||
DGEMV_TEST_M(4)
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx3_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx3
|
||||
DGEMV_TEST_M(3)
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx2_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx2
|
||||
DGEMV_TEST_M(2)
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen_int_16mx1_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16mx1
|
||||
DGEMV_TEST_M(1)
|
||||
#endif
|
||||
|
||||
// 32x8n kernel will handle case where m >= 32.
|
||||
#ifdef K_bli_dgemv_n_zen_int_32x8n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_32x8n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_32x8n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_32x8n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_32x8n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -778,12 +778,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 16x8n kernel will handle case where m = [16, 32).
|
||||
#ifdef K_bli_dgemv_n_zen_int_16x8n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16x8n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_16x8n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_16x8n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_16x8n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -802,12 +802,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 8x8n kernel will handle case where m = [8, 15).
|
||||
#ifdef K_bli_dgemv_n_zen_int_8x8n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_8x8n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_8x8n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_8x8n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_8x8n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -826,12 +826,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// m_leftx8n kernel will handle case where m = [1, 7).
|
||||
#ifdef K_bli_dgemv_n_zen_int_m_leftx8n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_m_leftx8n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_m_leftx8n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_m_leftx8n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx8n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -850,12 +850,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 32x4n kernel will handle case where m >= 32 and n = 4.
|
||||
#ifdef K_bli_dgemv_n_zen_int_32x4n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_32x4n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_32x4n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_32x4n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_32x4n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -874,12 +874,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 16x4n kernel will handle case where m = [16, 32) and n = 4.
|
||||
#ifdef K_bli_dgemv_n_zen_int_16x4n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16x4n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_16x4n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_16x4n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_16x4n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -897,12 +897,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 8x4n kernel will handle case where m = [8, 15) and n = 4.
|
||||
#ifdef K_bli_dgemv_n_zen_int_8x4n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_8x4n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_8x4n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_8x4n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_8x4n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -920,12 +920,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// m_leftx4n kernel will handle case where m = [1, 7) and n = 4.
|
||||
#ifdef K_bli_dgemv_n_zen_int_m_leftx4n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_m_leftx4n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_m_leftx4n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_m_leftx4n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx4n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -943,12 +943,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 32x3n kernel will handle case where m >= 32 and n = 3.
|
||||
#ifdef K_bli_dgemv_n_zen_int_32x3n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_32x3n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_32x3n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_32x3n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_32x3n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -967,12 +967,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 16x3n kernel will handle case where m = [16, 32) and n = 3.
|
||||
#ifdef K_bli_dgemv_n_zen_int_16x3n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16x3n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_16x3n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_16x3n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_16x3n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -990,12 +990,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 8x3n kernel will handle case where m = [8, 15) and n = 3.
|
||||
#ifdef K_bli_dgemv_n_zen_int_8x3n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_8x3n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_8x3n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_8x3n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_8x3n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1013,12 +1013,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// m_leftx3n kernel will handle case where m = [1, 7) and n = 3.
|
||||
#ifdef K_bli_dgemv_n_zen_int_m_leftx3n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_m_leftx3n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_m_leftx3n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_m_leftx3n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx3n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1036,12 +1036,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 32x2n kernel will handle case where m >= 32 and n = 2.
|
||||
#ifdef K_bli_dgemv_n_zen_int_32x2n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_32x2n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_32x2n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_32x2n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_32x2n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1060,12 +1060,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 16x2n kernel will handle case where m = [16, 32) and n = 2.
|
||||
#ifdef K_bli_dgemv_n_zen_int_16x2n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16x2n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_16x2n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_16x2n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_16x2n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1083,12 +1083,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 8x2n kernel will handle case where m = [8, 15) and n = 2.
|
||||
#ifdef K_bli_dgemv_n_zen_int_8x2n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_8x2n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_8x2n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_8x2n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_8x2n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1106,12 +1106,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// m_leftx2n kernel will handle case where m = [1, 7) and n = 2.
|
||||
#ifdef K_bli_dgemv_n_zen_int_m_leftx2n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_m_leftx2n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_m_leftx2n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_m_leftx2n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx2n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1129,12 +1129,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 32x1n kernel will handle case where m >= 32 and n = 1.
|
||||
#ifdef K_bli_dgemv_n_zen_int_32x1n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_32x1n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_32x1n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_32x1n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_32x1n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1153,12 +1153,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 16x1n kernel will handle case where m = [16, 32) and n = 1.
|
||||
#ifdef K_bli_dgemv_n_zen_int_16x1n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_16x1n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_16x1n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_16x1n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_16x1n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1176,12 +1176,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// 8x1n kernel will handle case where m = [8, 15) and n = 1.
|
||||
#ifdef K_bli_dgemv_n_zen_int_8x1n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_8x1n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_8x1n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_8x1n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_8x1n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1199,12 +1199,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// m_leftx1n kernel will handle case where m = [1, 7) and n = 1.
|
||||
#ifdef K_bli_dgemv_n_zen_int_m_leftx1n_avx512
|
||||
#ifdef K_bli_dgemv_n_zen4_int_m_leftx1n
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_m_leftx1n_avx512,
|
||||
dgemvGenericConja,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen_int_m_leftx1n_avx512),
|
||||
::testing::Values(K_bli_dgemv_n_zen4_int_m_leftx1n),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1247,12 +1247,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef K_bli_dgemv_n_zen4_40x2_int_st
|
||||
#ifdef K_bli_dgemv_n_zen4_int_40x2_st
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemv_n_zen4_40x2_int_st,
|
||||
bli_dgemv_n_zen4_int_40x2_st,
|
||||
dgemvGenericTransa,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen4_40x2_int_st),
|
||||
::testing::Values(bli_dgemv_n_zen4_int_40x2_st),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1271,12 +1271,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_n_zen4_40x2_int_mt)
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_n_zen4_int_40x2_mt)
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemv_n_zen4_40x2_int_mt,
|
||||
bli_dgemv_n_zen4_int_40x2_mt,
|
||||
dgemvGenericTransa,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_n_zen4_40x2_int_mt),
|
||||
::testing::Values(bli_dgemv_n_zen4_int_40x2_mt),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1295,12 +1295,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dgemv_m_zen4_40x8_int_st
|
||||
#ifdef K_bli_dgemv_m_zen4_int_40x8_st
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemv_m_zen4_40x8_int_st,
|
||||
bli_dgemv_m_zen4_int_40x8_st,
|
||||
dgemvGenericTransa,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_m_zen4_40x8_int_st),
|
||||
::testing::Values(bli_dgemv_m_zen4_int_40x8_st),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1319,12 +1319,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_40x8_int_mt_Ndiv)
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_int_40x8_mt_Ndiv)
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemv_m_zen4_40x8_int_mt_Ndiv,
|
||||
bli_dgemv_m_zen4_int_40x8_mt_Ndiv,
|
||||
dgemvGenericTransa,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_m_zen4_40x8_int_mt_Ndiv),
|
||||
::testing::Values(bli_dgemv_m_zen4_int_40x8_mt_Ndiv),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1343,12 +1343,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv)
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_int_40x8_mt_Mdiv)
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemv_m_zen4_40x8_int_mt_Mdiv,
|
||||
bli_dgemv_m_zen4_int_40x8_mt_Mdiv,
|
||||
dgemvGenericTransa,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_m_zen4_40x8_int_mt_Mdiv),
|
||||
::testing::Values(bli_dgemv_m_zen4_int_40x8_mt_Mdiv),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
@@ -1367,12 +1367,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv)
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(K_bli_dgemv_m_zen4_int_40x8_mt_Mdiv_Ndiv)
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv,
|
||||
bli_dgemv_m_zen4_int_40x8_mt_Mdiv_Ndiv,
|
||||
dgemvGenericTransa,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dgemv_m_zen4_40x8_int_mt_Mdiv_Ndiv),
|
||||
::testing::Values(bli_dgemv_m_zen4_int_40x8_mt_Mdiv_Ndiv),
|
||||
::testing::Values( 'c' ), // storage format
|
||||
::testing::Values( 'n' ), // transa
|
||||
::testing::Values( 'n' ), // conjx
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -72,8 +72,8 @@ TEST_P( dnrm2Generic, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_dnorm2fv_unb_var1_avx2 kernel.
|
||||
The code structure for bli_dnorm2fv_unb_var1_avx2( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dnorm2fv_zen_int_unb_var1 kernel.
|
||||
The code structure for bli_dnorm2fv_zen_int_unb_var1( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 8 --> L8
|
||||
Fringe loops : In blocks of 4 --> L4
|
||||
@@ -82,12 +82,12 @@ TEST_P( dnrm2Generic, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_dnorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_dnorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dnorm2fv_unb_var1_avx2_unitStrides,
|
||||
bli_dnorm2fv_zen_int_unb_var1_unitStrides,
|
||||
dnrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dnorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_dnorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(8), // size n, for L8
|
||||
@@ -105,12 +105,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides.
|
||||
#ifdef K_bli_dnorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_dnorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dnorm2fv_unb_var1_avx2_nonUnitStrides,
|
||||
bli_dnorm2fv_zen_int_unb_var1_nonUnitStrides,
|
||||
dnrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dnorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_dnorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(25), // n, size of the vector
|
||||
@@ -127,8 +127,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_dnorm2fv_unb_var1_avx512 kernel.
|
||||
The code structure for bli_dnorm2fv_unb_var1_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dnorm2fv_zen4_int_unb_var1 kernel.
|
||||
The code structure for bli_dnorm2fv_zen4_int_unb_var1( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 32 --> L32
|
||||
Fringe loops : In blocks of 16 --> L16
|
||||
@@ -138,12 +138,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_dnorm2fv_unb_var1_avx512
|
||||
#ifdef K_bli_dnorm2fv_zen4_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dnorm2fv_unb_var1_avx512_unitStrides,
|
||||
bli_dnorm2fv_zen4_int_unb_var1_unitStrides,
|
||||
dnrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dnorm2fv_unb_var1_avx512), // ukr function
|
||||
::testing::Values(K_bli_dnorm2fv_zen4_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(32), // size n, for L32
|
||||
@@ -162,12 +162,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides.
|
||||
#ifdef K_bli_dnorm2fv_unb_var1_avx512
|
||||
#ifdef K_bli_dnorm2fv_zen4_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dnorm2fv_unb_var1_avx512_nonUnitStrides,
|
||||
bli_dnorm2fv_zen4_int_unb_var1_nonUnitStrides,
|
||||
dnrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dnorm2fv_unb_var1_avx512), // ukr function
|
||||
::testing::Values(K_bli_dnorm2fv_zen4_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(25), // n, size of the vector
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -72,8 +72,8 @@ TEST_P( dznrm2Generic, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_dznorm2fv_unb_var1_avx2 kernel.
|
||||
The code structure for bli_dznorm2fv_unb_var1_avx2( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dznorm2fv_zen_int_unb_var1 kernel.
|
||||
The code structure for bli_dznorm2fv_zen_int_unb_var1( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 4 --> L4
|
||||
Fringe loops : In blocks of 2 --> L2
|
||||
@@ -82,12 +82,12 @@ TEST_P( dznrm2Generic, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_dznorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_dznorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dznorm2fv_unb_var1_avx2_unitStrides,
|
||||
bli_dznorm2fv_zen_int_unb_var1_unitStrides,
|
||||
dznrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dznorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_dznorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(4), // size n, for L4
|
||||
@@ -105,12 +105,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides.
|
||||
#ifdef K_bli_dznorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_dznorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dznorm2fv_unb_var1_avx2_nonUnitStrides,
|
||||
bli_dznorm2fv_zen_int_unb_var1_nonUnitStrides,
|
||||
dznrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dznorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_dznorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(25), // n, size of the vector
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -72,8 +72,8 @@ TEST_P( scnrm2Generic, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_scnorm2fv_unb_var1_avx2 kernel.
|
||||
The code structure for bli_scnorm2fv_unb_var1_avx2( ... ) is as follows :
|
||||
Unit testing for functionality of bli_scnorm2fv_zen_int_unb_var1 kernel.
|
||||
The code structure for bli_scnorm2fv_zen_int_unb_var1( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 16 --> L16
|
||||
Fringe loops : In blocks of 12 --> L12
|
||||
@@ -85,12 +85,12 @@ TEST_P( scnrm2Generic, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_scnorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_scnorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_scnorm2fv_unb_var1_avx2_unitStrides,
|
||||
bli_scnorm2fv_zen_int_unb_var1_unitStrides,
|
||||
scnrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_scnorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_scnorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(64), // size n, for L16
|
||||
@@ -106,12 +106,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides.
|
||||
#ifdef K_bli_scnorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_scnorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_scnorm2fv_unb_var1_avx2_nonUnitStrides,
|
||||
bli_scnorm2fv_zen_int_unb_var1_nonUnitStrides,
|
||||
scnrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_scnorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_scnorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(25), // n, size of the vector
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -72,8 +72,8 @@ TEST_P( snrm2Generic, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
/*
|
||||
Unit testing for functionality of bli_snorm2fv_unb_var1_avx2 kernel.
|
||||
The code structure for bli_snorm2fv_unb_var1_avx2( ... ) is as follows :
|
||||
Unit testing for functionality of bli_snorm2fv_zen_int_unb_var1 kernel.
|
||||
The code structure for bli_snorm2fv_zen_int_unb_var1( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 32 --> L32
|
||||
Fringe loops : In blocks of 24 --> L24
|
||||
@@ -85,12 +85,12 @@ TEST_P( snrm2Generic, UKR )
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_snorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_snorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_snorm2fv_unb_var1_avx2_unitStrides,
|
||||
bli_snorm2fv_zen_int_unb_var1_unitStrides,
|
||||
snrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_snorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_snorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(64), // size n, for L32
|
||||
@@ -106,12 +106,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides.
|
||||
#ifdef K_bli_snorm2fv_unb_var1_avx2
|
||||
#ifdef K_bli_snorm2fv_zen_int_unb_var1
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_snorm2fv_unb_var1_avx2_nonUnitStrides,
|
||||
bli_snorm2fv_zen_int_unb_var1_nonUnitStrides,
|
||||
snrm2Generic,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_snorm2fv_unb_var1_avx2), // ukr function
|
||||
::testing::Values(K_bli_snorm2fv_zen_int_unb_var1), // ukr function
|
||||
// m size of vector
|
||||
::testing::Values(// Testing the loops standalone
|
||||
gtint_t(25), // n, size of the vector
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -163,8 +163,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_dscal2v_zen_int_avx512 kernel.
|
||||
The code structure for bli_dscal2v_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dscal2v_zen4_int kernel.
|
||||
The code structure for bli_dscal2v_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 64 --> L64
|
||||
Fringe loops : In blocks of 32 --> L32
|
||||
@@ -175,12 +175,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
#ifdef K_bli_dscal2v_zen_int_avx512
|
||||
#ifdef K_bli_dscal2v_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dscal2v_zen_int_avx512_unitPositiveStride,
|
||||
bli_dscal2v_zen4_int_unitPositiveStride,
|
||||
dscal2vGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dscal2v_zen_int_avx512),
|
||||
::testing::Values(K_bli_dscal2v_zen4_int),
|
||||
// conjx: uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
::testing::Values(// Testing the loops standalone
|
||||
@@ -201,12 +201,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dscal2v_zen_int_avx512
|
||||
#ifdef K_bli_dscal2v_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dscal2v_zen_int_avx512_nonUnitPositiveStrides,
|
||||
bli_dscal2v_zen4_int_nonUnitPositiveStrides,
|
||||
dscal2vGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dscal2v_zen_int_avx512),
|
||||
::testing::Values(K_bli_dscal2v_zen4_int),
|
||||
// conjx: uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
::testing::Values(// Testing the loops standalone
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -180,7 +180,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
// Tests for bli_cscalv_zen_int_avx512 (AVX512) kernel.
|
||||
// Tests for bli_cscalv_zen4_int (AVX512) kernel.
|
||||
/**
|
||||
* Loops:
|
||||
* L96 - Main loop, handles 96 scomplex elements
|
||||
@@ -193,12 +193,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*
|
||||
* LScalar - handles non-unit increments
|
||||
*/
|
||||
#ifdef K_bli_cscalv_zen_int_avx512
|
||||
#ifdef K_bli_cscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_cscalv_zen_int_avx512_unitPositiveStride,
|
||||
bli_cscalv_zen4_int_unitPositiveStride,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_cscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_cscalv_zen4_int),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
@@ -236,12 +236,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_cscalv_zen_int_avx512
|
||||
#ifdef K_bli_cscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_cscalv_zen_int_avx512_nonUnitPositiveStrides,
|
||||
bli_cscalv_zen4_int_nonUnitPositiveStrides,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_cscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_cscalv_zen4_int),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -160,7 +160,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
// Tests for bli_dscalv_zen_int10 (AVX2) kernel.
|
||||
// Tests for bli_dscalv_zen_int_10 (AVX2) kernel.
|
||||
/**
|
||||
* Cases and Loops:
|
||||
* C0 L64 - Main loop, handles 64 elements
|
||||
@@ -172,12 +172,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*
|
||||
* LNUnit - loop for non-unit increments
|
||||
*/
|
||||
#ifdef K_bli_dscalv_zen_int10
|
||||
#ifdef K_bli_dscalv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dscalv_zen_int10_unitPositiveStride,
|
||||
bli_dscalv_zen_int_10_unitPositiveStride,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dscalv_zen_int10),
|
||||
::testing::Values(K_bli_dscalv_zen_int_10),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
@@ -223,12 +223,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dscalv_zen_int10
|
||||
#ifdef K_bli_dscalv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dscalv_zen_int10_nonUnitPositiveStrides,
|
||||
bli_dscalv_zen_int_10_nonUnitPositiveStrides,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dscalv_zen_int10),
|
||||
::testing::Values(K_bli_dscalv_zen_int_10),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
@@ -260,7 +260,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
// Tests for bli_dscalv_zen_int_avx512 (AVX512) kernel.
|
||||
// Tests for bli_dscalv_zen4_int (AVX512) kernel.
|
||||
/**
|
||||
* Loops:
|
||||
* L64 - Main loop, handles 64 elements
|
||||
@@ -271,12 +271,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
* L2 - handles 2 elements
|
||||
* LScalar - leftover loop (also handles non-unit increments)
|
||||
*/
|
||||
#ifdef K_bli_dscalv_zen_int_avx512
|
||||
#ifdef K_bli_dscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dscalv_zen_int_avx512_unitPositiveStride,
|
||||
bli_dscalv_zen4_int_unitPositiveStride,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_dscalv_zen4_int),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
@@ -335,12 +335,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dscalv_zen_int_avx512
|
||||
#ifdef K_bli_dscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dscalv_zen_int_avx512_nonUnitPositiveStrides,
|
||||
bli_dscalv_zen4_int_nonUnitPositiveStrides,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_dscalv_zen4_int),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -159,7 +159,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
// Tests for bli_sscalv_zen_int10 (AVX2) kernel.
|
||||
// Tests for bli_sscalv_zen_int_10 (AVX2) kernel.
|
||||
/**
|
||||
* Cases and Loops:
|
||||
* C0 L128 - Main loop, handles 128 elements
|
||||
@@ -174,12 +174,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
*
|
||||
* LNUnit - loop for non-unit increments
|
||||
*/
|
||||
#ifdef K_bli_sscalv_zen_int10
|
||||
#ifdef K_bli_sscalv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sscalv_zen_int10_unitPositiveStride,
|
||||
bli_sscalv_zen_int_10_unitPositiveStride,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_sscalv_zen_int10),
|
||||
::testing::Values(K_bli_sscalv_zen_int_10),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
@@ -219,12 +219,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_sscalv_zen_int10
|
||||
#ifdef K_bli_sscalv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_sscalv_zen_int10_nonUnitPositiveStrides,
|
||||
bli_sscalv_zen_int_10_nonUnitPositiveStrides,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_sscalv_zen_int10),
|
||||
::testing::Values(K_bli_sscalv_zen_int_10),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -93,7 +93,7 @@ TEST_P( zdscalvGeneric, UKR )
|
||||
// ----- Begin ZEN1/2/3 (AVX2) Kernel Tests -----
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
// Tests for bli_zdscalv_zen_int10 (AVX2) kernel.
|
||||
// Tests for bli_zdscalv_zen_int_10 (AVX2) kernel.
|
||||
/**
|
||||
* Loops:
|
||||
* L30 - Main loop, handles 30 elements
|
||||
@@ -104,12 +104,12 @@ TEST_P( zdscalvGeneric, UKR )
|
||||
* L2 - handles 2 elements
|
||||
* LScalar - leftover loop (also handles non-unit increments)
|
||||
*/
|
||||
#ifdef K_bli_zdscalv_zen_int10
|
||||
#ifdef K_bli_zdscalv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zdscalv_zen_int10_unitPositiveStride,
|
||||
bli_zdscalv_zen_int_10_unitPositiveStride,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdscalv_zen_int10),
|
||||
::testing::Values(K_bli_zdscalv_zen_int_10),
|
||||
// conj(alpha): specify if alpha needs to be conjugated.
|
||||
::testing::Values(
|
||||
'n',
|
||||
@@ -147,12 +147,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_zdscalv_zen_int10
|
||||
#ifdef K_bli_zdscalv_zen_int_10
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zdscalv_zen_int10_nonUnitPositiveStride,
|
||||
bli_zdscalv_zen_int_10_nonUnitPositiveStride,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdscalv_zen_int10),
|
||||
::testing::Values(K_bli_zdscalv_zen_int_10),
|
||||
// conj(alpha): specify if alpha needs to be conjugated.
|
||||
::testing::Values(
|
||||
'n',
|
||||
@@ -189,7 +189,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
// Tests for bli_zdscalv_zen_int_avx512 (AVX512) kernel.
|
||||
// Tests for bli_zdscalv_zen4_int (AVX512) kernel.
|
||||
/**
|
||||
* Loops:
|
||||
* L16 - Main loop, handles 16 elements
|
||||
@@ -198,12 +198,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
* L2 - handles 2 elements
|
||||
* LScalar - leftover loop (also handles non-unit increments)
|
||||
*/
|
||||
#ifdef K_bli_zdscalv_zen_int_avx512
|
||||
#ifdef K_bli_zdscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zdscalv_zen_int_avx512_unitPositiveStride,
|
||||
bli_zdscalv_zen4_int_unitPositiveStride,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zdscalv_zen4_int),
|
||||
// conj(alpha): specify if alpha needs to be conjugated.
|
||||
::testing::Values(
|
||||
'n',
|
||||
@@ -236,12 +236,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_zdscalv_zen_int_avx512
|
||||
#ifdef K_bli_zdscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zdscalv_zen_int_avx512_nonUnitPositiveStrides,
|
||||
bli_zdscalv_zen4_int_nonUnitPositiveStrides,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zdscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zdscalv_zen4_int),
|
||||
// conj(alpha): specify if alpha needs to be conjugated.
|
||||
::testing::Values(
|
||||
'n',
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -178,7 +178,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
// ----- Begin ZEN4 (AVX512) Kernel Tests -----
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
// Tests for bli_zscalv_zen_int_avx512 (AVX512) kernel.
|
||||
// Tests for bli_zscalv_zen4_int (AVX512) kernel.
|
||||
/**
|
||||
* Loops:
|
||||
* L48 - Main loop, handles 48 elements
|
||||
@@ -189,12 +189,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
* L2 - handles 2 elements
|
||||
* LScalar - leftover loop (also handles non-unit increments)
|
||||
*/
|
||||
#ifdef K_bli_zscalv_zen_int_avx512
|
||||
#ifdef K_bli_zscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zscalv_zen_int_avx512_unitPositiveStride,
|
||||
bli_zscalv_zen4_int_unitPositiveStride,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zscalv_zen4_int),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
@@ -230,12 +230,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_zscalv_zen_int_avx512
|
||||
#ifdef K_bli_zscalv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zscalv_zen_int_avx512_nonUnitPositiveStrides,
|
||||
bli_zscalv_zen4_int_nonUnitPositiveStrides,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zscalv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zscalv_zen4_int),
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -144,8 +144,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_dsetv_zen_int_avx512 kernel.
|
||||
The code structure for bli_dsetv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_dsetv_zen4_int kernel.
|
||||
The code structure for bli_dsetv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 256 --> L256
|
||||
Fringe loops : In blocks of 128 --> L128
|
||||
@@ -159,12 +159,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dsetv_zen_int_avx512
|
||||
#ifdef K_bli_dsetv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dsetv_zen_int_avx512_unitStrides,
|
||||
bli_dsetv_zen4_int_unitStrides,
|
||||
dsetvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dsetv_zen_int_avx512),
|
||||
::testing::Values(K_bli_dsetv_zen4_int),
|
||||
::testing::Values('n', 'c'), // conjalpha
|
||||
::testing::Values(double(2.2)), // alpha
|
||||
::testing::Values(// Testing the loops standalone
|
||||
@@ -201,12 +201,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with Non-Unit Strides(US), across all loops.
|
||||
#ifdef K_bli_dsetv_zen_int_avx512
|
||||
#ifdef K_bli_dsetv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_dsetv_zen_int_avx512_nonUnitStrides,
|
||||
bli_dsetv_zen4_int_nonUnitStrides,
|
||||
dsetvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dsetv_zen_int_avx512),
|
||||
::testing::Values(K_bli_dsetv_zen4_int),
|
||||
::testing::Values('n', 'c'), // conjalpha
|
||||
::testing::Values(double(2.2)), // alpha
|
||||
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -144,8 +144,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_ssetv_zen_int_avx512 kernel.
|
||||
The code structure for bli_ssetv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_ssetv_zen4_int kernel.
|
||||
The code structure for bli_ssetv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 512 --> L512
|
||||
Fringe loops : In blocks of 256 --> L256
|
||||
@@ -158,12 +158,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_ssetv_zen_int_avx512
|
||||
#ifdef K_bli_ssetv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_ssetv_zen_int_avx512_unitStrides,
|
||||
bli_ssetv_zen4_int_unitStrides,
|
||||
ssetvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_ssetv_zen_int_avx512),
|
||||
::testing::Values(K_bli_ssetv_zen4_int),
|
||||
::testing::Values('n', 'c'), // conjalpha
|
||||
::testing::Values(float(1.2)), // alpha
|
||||
::testing::Values(// Testing the loops standalone
|
||||
@@ -197,12 +197,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides, across all loops.
|
||||
#ifdef K_bli_ssetv_zen_int_avx512
|
||||
#ifdef K_bli_ssetv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_ssetv_zen_int_avx512_nonUnitStrides,
|
||||
bli_ssetv_zen4_int_nonUnitStrides,
|
||||
ssetvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_ssetv_zen_int_avx512),
|
||||
::testing::Values(K_bli_ssetv_zen4_int),
|
||||
::testing::Values('n', 'c'), // conjalpha
|
||||
::testing::Values(float(1.2)), // alpha
|
||||
::testing::Values(gtint_t(25), gtint_t(37)), // size of the vector
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -152,8 +152,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
|
||||
/*
|
||||
Unit testing for functionality of bli_zsetv_zen_int_avx512 kernel.
|
||||
The code structure for bli_zsetv_zen_int_avx512( ... ) is as follows :
|
||||
Unit testing for functionality of bli_zsetv_zen4_int kernel.
|
||||
The code structure for bli_zsetv_zen4_int( ... ) is as follows :
|
||||
For unit strides :
|
||||
Main loop : In blocks of 128 --> L128
|
||||
Fringe loops : In blocks of 64 --> L64
|
||||
@@ -167,12 +167,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
For non-unit strides : A single loop, to process element wise.
|
||||
*/
|
||||
// Unit testing with unit strides, across all loops.
|
||||
#ifdef K_bli_zsetv_zen_int_avx512
|
||||
#ifdef K_bli_zsetv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zsetv_zen_int_avx512_unitStrides,
|
||||
bli_zsetv_zen4_int_unitStrides,
|
||||
zsetvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zsetv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zsetv_zen4_int),
|
||||
::testing::Values('n' // conjx
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c'
|
||||
@@ -213,12 +213,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
// Unit testing with non-unit strides, across all loops.
|
||||
#ifdef K_bli_zsetv_zen_int_avx512
|
||||
#ifdef K_bli_zsetv_zen4_int
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_zsetv_zen_int_avx512_nonUnitStrides,
|
||||
bli_zsetv_zen4_int_nonUnitStrides,
|
||||
zsetvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_zsetv_zen_int_avx512),
|
||||
::testing::Values(K_bli_zsetv_zen4_int),
|
||||
::testing::Values('n' // conjx
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c'
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -75,16 +75,16 @@ TEST_P( dswapvGeneric, UKR )
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
|
||||
// Tests for bli_dswapv_zen_int8 (AVX2) kernel.
|
||||
// Tests for bli_dswapv_zen_int_8 (AVX2) kernel.
|
||||
// For unit inc on x and y:
|
||||
// Optimised code is available for n = 32, 16, 8, 4
|
||||
|
||||
#ifdef K_bli_dswapv_zen_int8
|
||||
#ifdef K_bli_dswapv_zen_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
UnitIncrements,
|
||||
dswapvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dswapv_zen_int8),
|
||||
::testing::Values(K_bli_dswapv_zen_int_8),
|
||||
// n: size of vector.
|
||||
::testing::Values(
|
||||
gtint_t(1), gtint_t(2), gtint_t(4), gtint_t(8), gtint_t(16), gtint_t(32),
|
||||
@@ -108,12 +108,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_dswapv_zen_int8
|
||||
#ifdef K_bli_dswapv_zen_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NonUnitIncrements,
|
||||
dswapvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_dswapv_zen_int8),
|
||||
::testing::Values(K_bli_dswapv_zen_int_8),
|
||||
// n: size of vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -75,16 +75,16 @@ TEST_P( sswapvGeneric, UKR )
|
||||
// ----------------------------------------------
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
|
||||
// Tests for bli_dswapv_zen_int8 (AVX2) kernel.
|
||||
// Tests for bli_sswapv_zen_int_8 (AVX2) kernel.
|
||||
// For unit inc on x and y:
|
||||
// When n values are 64, 32, 16, 8, 4 it is avx2 optimised
|
||||
|
||||
#ifdef K_bli_sswapv_zen_int8
|
||||
#ifdef K_bli_sswapv_zen_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
UnitIncrements,
|
||||
sswapvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_sswapv_zen_int8),
|
||||
::testing::Values(K_bli_sswapv_zen_int_8),
|
||||
// n: size of vector.
|
||||
::testing::Values(
|
||||
gtint_t(1), gtint_t(2), gtint_t(8), gtint_t(16), gtint_t(32),
|
||||
@@ -108,12 +108,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_sswapv_zen_int8
|
||||
#ifdef K_bli_sswapv_zen_int_8
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NonUnitIncrements,
|
||||
sswapvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_sswapv_zen_int8),
|
||||
::testing::Values(K_bli_sswapv_zen_int_8),
|
||||
// n: size of vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -88,12 +88,12 @@ TEST_P( ctrsmGenericSmall, UKR )
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
#ifdef K_bli_trsm_small
|
||||
#ifdef K_bli_trsm_small_zen
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small,
|
||||
bli_trsm_small_zen,
|
||||
ctrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
|
||||
::testing::Values('l', 'r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -165,12 +165,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef K_bli_trsm_small_AVX512
|
||||
#ifdef K_bli_trsm_small_zen4
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small_AVX512,
|
||||
bli_trsm_small_zen4,
|
||||
dtrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small_AVX512), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen4), // ker_ptr
|
||||
::testing::Values('l', 'r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
@@ -235,12 +235,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
#ifdef K_bli_trsm_small
|
||||
#ifdef K_bli_trsm_small_zen
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small,
|
||||
bli_trsm_small_zen,
|
||||
dtrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
|
||||
::testing::Values('l', 'r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -169,12 +169,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
|
||||
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
#ifdef K_bli_trsm_small
|
||||
#ifdef K_bli_trsm_small_zen
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small,
|
||||
bli_trsm_small_zen,
|
||||
strsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
|
||||
::testing::Values('l', 'r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
|
||||
@@ -135,12 +135,12 @@ TEST_P( ztrsmGenericSmall, UKR )
|
||||
#if defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
#ifdef K_bli_ztrsm_small_ZEN5
|
||||
#ifdef K_bli_ztrsm_small_zen5
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small_ZEN5_r,
|
||||
bli_trsm_small_zen5_r,
|
||||
ztrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small_ZEN5), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen5), // ker_ptr
|
||||
::testing::Values('r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
@@ -157,10 +157,10 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small_ZEN5_l,
|
||||
bli_trsm_small_zen5_l,
|
||||
ztrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small_ZEN5), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen5), // ker_ptr
|
||||
::testing::Values('l'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
@@ -177,10 +177,10 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small_ZEN5_gemm,
|
||||
bli_trsm_small_zen5_gemm,
|
||||
ztrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small_ZEN5), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen5), // ker_ptr
|
||||
::testing::Values('l', 'r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
@@ -194,7 +194,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
),
|
||||
(::trsmSmallUKRPrint<dcomplex, trsm_small_ker_ft>())
|
||||
);
|
||||
#endif // K_bli_ztrsm_small_ZEN5
|
||||
#endif // K_bli_ztrsm_small_zen5
|
||||
#endif // BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
|
||||
#endif // defined(BLIS_KERNELS_ZEN5) && defined(GTEST_AVX512)
|
||||
@@ -248,12 +248,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
#ifdef K_bli_trsm_small_AVX512
|
||||
#ifdef K_bli_trsm_small_zen4
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small_AVX512,
|
||||
bli_trsm_small_zen4,
|
||||
ztrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small_AVX512), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen4), // ker_ptr
|
||||
::testing::Values('l', 'r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
@@ -324,12 +324,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
#ifdef K_bli_trsm_small
|
||||
#ifdef K_bli_trsm_small_zen
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
bli_trsm_small,
|
||||
bli_trsm_small_zen,
|
||||
ztrsmGenericSmall,
|
||||
::testing::Combine(
|
||||
::testing::Values(bli_trsm_small), // ker_ptr
|
||||
::testing::Values(K_bli_trsm_small_zen), // ker_ptr
|
||||
::testing::Values('l', 'r'), // side
|
||||
::testing::Values('l', 'u'), // uplo
|
||||
::testing::Values('n', 'u'), // diaga
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -80,7 +80,7 @@ void bli_saxpbyv_zen_int
|
||||
// When beta = !( 0 or 1 ) --> SSCALV
|
||||
if ( bli_seq0( *alpha ) )
|
||||
{
|
||||
bli_sscalv_zen_int10
|
||||
bli_sscalv_zen_int_10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
@@ -337,7 +337,7 @@ void bli_daxpbyv_zen_int
|
||||
// When beta = !( 0 or 1 ) --> DSCALV
|
||||
if ( bli_deq0( *alpha ) )
|
||||
{
|
||||
bli_dscalv_zen_int10
|
||||
bli_dscalv_zen_int_10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
@@ -648,7 +648,7 @@ void bli_caxpbyv_zen_int
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_caxpyv_zen_int5
|
||||
bli_caxpyv_zen_int_5
|
||||
(
|
||||
conjx,
|
||||
n,
|
||||
@@ -1331,7 +1331,7 @@ void bli_zaxpbyv_zen_int
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_zaxpyv_zen_int5
|
||||
bli_zaxpyv_zen_int_5
|
||||
(
|
||||
conjx,
|
||||
n,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -57,7 +57,7 @@ typedef union
|
||||
* x & y are single precision vectors of length n.
|
||||
* alpha & beta are scalars.
|
||||
*/
|
||||
void bli_saxpbyv_zen_int10
|
||||
void bli_saxpbyv_zen_int_10
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -80,7 +80,7 @@ void bli_saxpbyv_zen_int10
|
||||
// When beta = !( 0 or 1 ) --> SSCALV
|
||||
if ( bli_seq0( *alpha ) )
|
||||
{
|
||||
bli_sscalv_zen_int10
|
||||
bli_sscalv_zen_int_10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
@@ -733,7 +733,7 @@ void bli_saxpbyv_zen_int10
|
||||
* x & y are double precision vectors of length n.
|
||||
* alpha & beta are scalars.
|
||||
*/
|
||||
void bli_daxpbyv_zen_int10
|
||||
void bli_daxpbyv_zen_int_10
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -756,7 +756,7 @@ void bli_daxpbyv_zen_int10
|
||||
// When beta = !( 0 or 1 ) --> DSCALV
|
||||
if ( bli_deq0( *alpha ) )
|
||||
{
|
||||
bli_dscalv_zen_int10
|
||||
bli_dscalv_zen_int_10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2016 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2020, The University of Texas at Austin. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -55,7 +55,7 @@ typedef union
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_saxpyv_zen_int10
|
||||
void bli_saxpyv_zen_int_10
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -340,7 +340,7 @@ void bli_saxpyv_zen_int10
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
|
||||
BLIS_EXPORT_BLIS void bli_daxpyv_zen_int_10
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -472,7 +472,7 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_caxpyv_zen_int5
|
||||
void bli_caxpyv_zen_int_5
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -772,7 +772,7 @@ void bli_caxpyv_zen_int5
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_zaxpyv_zen_int5
|
||||
void bli_zaxpyv_zen_int_5
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -66,7 +66,7 @@ static int64_t mask_0[4] = {0, 0, 0, 0};
|
||||
static int64_t *mask_ptr[] = {mask_0, mask_1, mask_2, mask_3};
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sdotv_zen_int10
|
||||
void bli_sdotv_zen_int_10
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
@@ -257,7 +257,7 @@ void bli_sdotv_zen_int10
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_ddotv_zen_int10
|
||||
void bli_ddotv_zen_int_10
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
@@ -426,7 +426,7 @@ void bli_ddotv_zen_int10
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
void bli_cdotv_zen_int5
|
||||
void bli_cdotv_zen_int_5
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
@@ -740,7 +740,7 @@ void bli_cdotv_zen_int5
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_zdotv_zen_int5
|
||||
void bli_zdotv_zen_int_5
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -97,7 +97,7 @@ float horizontal_add_sf(__m256 const a) {
|
||||
}
|
||||
|
||||
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
|
||||
void bli_snorm2fv_unb_var1_avx2
|
||||
void bli_snorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
float* x, inc_t incx,
|
||||
@@ -834,7 +834,7 @@ void bli_snorm2fv_unb_var1_avx2
|
||||
}
|
||||
|
||||
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
|
||||
void bli_scnorm2fv_unb_var1_avx2
|
||||
void bli_scnorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
scomplex* x, inc_t incx,
|
||||
@@ -1601,7 +1601,7 @@ void bli_scnorm2fv_unb_var1_avx2
|
||||
}
|
||||
|
||||
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
|
||||
void bli_dnorm2fv_unb_var1_avx2
|
||||
void bli_dnorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
double* x, inc_t incx,
|
||||
@@ -1954,7 +1954,7 @@ void bli_dnorm2fv_unb_var1_avx2
|
||||
}
|
||||
|
||||
// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
|
||||
void bli_dznorm2fv_unb_var1_avx2
|
||||
void bli_dznorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
dcomplex* x, inc_t incx,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2017 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sscalv_zen_int10
|
||||
void bli_sscalv_zen_int_10
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
@@ -309,7 +309,7 @@ void bli_sscalv_zen_int10
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
BLIS_EXPORT_BLIS void bli_dscalv_zen_int_10
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
@@ -582,7 +582,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
}
|
||||
}
|
||||
|
||||
void bli_zdscalv_zen_int10
|
||||
void bli_zdscalv_zen_int_10
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -134,7 +134,7 @@ void bli_ssetv_zen_int
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dsetv_zen_int
|
||||
void bli_dsetv_zen_int
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
@@ -230,7 +230,7 @@ void bli_dsetv_zen_int
|
||||
}
|
||||
}
|
||||
|
||||
void bli_csetv_zen_int
|
||||
void bli_csetv_zen_int
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
@@ -332,7 +332,7 @@ void bli_csetv_zen_int
|
||||
|
||||
}
|
||||
|
||||
void bli_zsetv_zen_int
|
||||
void bli_zsetv_zen_int
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -54,7 +54,7 @@ typedef union
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sswapv_zen_int8
|
||||
void bli_sswapv_zen_int_8
|
||||
(
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
@@ -202,7 +202,7 @@ void bli_sswapv_zen_int8
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_dswapv_zen_int8
|
||||
BLIS_EXPORT_BLIS void bli_dswapv_zen_int_8
|
||||
(
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -346,7 +346,7 @@ void bli_zaxpyf_zen_int_4
|
||||
|
||||
_mm_storeu_pd((double *)&alpha_chi1, temp[0]);
|
||||
|
||||
bli_zaxpyv_zen_int5
|
||||
bli_zaxpyv_zen_int_5
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2017 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -87,7 +87,7 @@ void bli_sdotxf_zen_int_8
|
||||
if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) )
|
||||
{
|
||||
|
||||
bli_sscalv_zen_int10
|
||||
bli_sscalv_zen_int_10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
@@ -449,7 +449,7 @@ void bli_ddotxf_zen_int_8
|
||||
// simplifies to updating y.
|
||||
if (bli_zero_dim1(m) || PASTEMAC(d, eq0)(*alpha))
|
||||
{
|
||||
bli_dscalv_zen_int10(
|
||||
bli_dscalv_zen_int_10(
|
||||
BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
beta,
|
||||
@@ -902,7 +902,7 @@ void bli_ddotxf_zen_int_4
|
||||
// simplifies to updating y.
|
||||
if (bli_zero_dim1(m) || PASTEMAC(d, eq0)(*alpha))
|
||||
{
|
||||
bli_dscalv_zen_int10(
|
||||
bli_dscalv_zen_int_10(
|
||||
BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
beta,
|
||||
@@ -1297,7 +1297,7 @@ void bli_ddotxf_zen_int_2
|
||||
// simplifies to updating y.
|
||||
if (bli_zero_dim1(m) || PASTEMAC(d, eq0)(*alpha))
|
||||
{
|
||||
bli_dscalv_zen_int10(
|
||||
bli_dscalv_zen_int_10(
|
||||
BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
beta,
|
||||
|
||||
@@ -897,7 +897,7 @@ void bli_dgemv_t_zen_int
|
||||
return;
|
||||
}
|
||||
|
||||
void bli_dgemv_t_zen_int_16x7m
|
||||
void bli_dgemv_t_zen_int_16x7m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
@@ -1290,7 +1290,7 @@ void bli_dgemv_t_zen_int_16x7m
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dgemv_t_zen_int_16x6m
|
||||
void bli_dgemv_t_zen_int_16x6m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
@@ -1653,7 +1653,7 @@ void bli_dgemv_t_zen_int_16x6m
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dgemv_t_zen_int_16x5m
|
||||
void bli_dgemv_t_zen_int_16x5m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
@@ -1955,7 +1955,7 @@ void bli_dgemv_t_zen_int_16x5m
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dgemv_t_zen_int_16x4m
|
||||
void bli_dgemv_t_zen_int_16x4m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
@@ -2221,7 +2221,7 @@ void bli_dgemv_t_zen_int_16x4m
|
||||
|
||||
}
|
||||
|
||||
void bli_dgemv_t_zen_int_16x3m
|
||||
void bli_dgemv_t_zen_int_16x3m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
@@ -2469,7 +2469,7 @@ void bli_dgemv_t_zen_int_16x3m
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dgemv_t_zen_int_16x2m
|
||||
void bli_dgemv_t_zen_int_16x2m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
@@ -2696,7 +2696,7 @@ void bli_dgemv_t_zen_int_16x2m
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dgemv_t_zen_int_16x1m
|
||||
void bli_dgemv_t_zen_int_16x1m
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -35,11 +35,11 @@
|
||||
#include "blis.h"
|
||||
|
||||
/**
|
||||
* bli_dgemv_n_avx2(...) handles cases where op(A) = NO_TRANSPOSE for Zen/2/3
|
||||
* bli_dgemv_n_zen(...) handles cases where op(A) = NO_TRANSPOSE for Zen/2/3
|
||||
* architectures and is based on the previous approach of using the fused
|
||||
* kernels, namely AXPYF, to perform the GEMV operation.
|
||||
*/
|
||||
void bli_dgemv_n_avx2
|
||||
void bli_dgemv_n_zen
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
@@ -88,7 +88,7 @@ void bli_dgemv_n_avx2
|
||||
b_fuse = 8;
|
||||
axpyf_kr_ptr = bli_daxpyf_zen_int_8; // DAXPYF
|
||||
scal2v_kr_ptr = bli_dscal2v_zen_int; // DSCAL2V
|
||||
scalv_kr_ptr = bli_dscalv_zen_int10; // DSCALV
|
||||
scalv_kr_ptr = bli_dscalv_zen_int_10; // DSCALV
|
||||
copyv_kr_ptr = bli_dcopyv_zen_int; // DCOPYV
|
||||
|
||||
/*
|
||||
@@ -119,7 +119,7 @@ void bli_dgemv_n_avx2
|
||||
size_t buffer_size = m0 * sizeof(double);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_n_avx2(): get mem pool block\n" );
|
||||
printf( "bli_dgemv_n_zen(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
/* Acquire a Buffer(m0*size(double)) from the memory broker
|
||||
@@ -218,7 +218,7 @@ void bli_dgemv_n_avx2
|
||||
);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_n_avx2(): releasing mem pool block\n" );
|
||||
printf( "bli_dgemv_n_zen(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_pba_release( &rntm , &mem_bufY );
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -265,7 +265,7 @@ void bli_zgemv_zen_int_4x4
|
||||
bli_zcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_zscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_zaxpyv_zen_int5
|
||||
bli_zaxpyv_zen_int_5
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
@@ -483,7 +483,7 @@ void bli_cgemv_zen_int_4x4
|
||||
scomplex alpha_chi1;
|
||||
bli_ccopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_cscals( *alpha, alpha_chi1 );
|
||||
bli_caxpyv_zen_int5
|
||||
bli_caxpyv_zen_int_5
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
@@ -529,7 +529,7 @@ void bli_multi_sgemv_4x2
|
||||
if (bli_zero_dim1(m) || PASTEMAC(s, eq0)(*alpha))
|
||||
{
|
||||
|
||||
bli_sscalv_zen_int10(
|
||||
bli_sscalv_zen_int_10(
|
||||
BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
beta,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -40,7 +40,7 @@
|
||||
#define D_MR 8
|
||||
#define D_NR 6
|
||||
|
||||
err_t bli_dgemm_8x6_avx2_k1_nn
|
||||
err_t bli_dgemm_zen_int_8x6_k1_nn
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
@@ -33,7 +33,7 @@
|
||||
*/
|
||||
#include "blis.h"
|
||||
|
||||
static dgemmsup_ker_ft kern_fp[] =
|
||||
static dgemmsup_ker_ft kern_fp_zen[] =
|
||||
{
|
||||
bli_dgemmsup_rv_haswell_asm_6x8m,
|
||||
bli_dgemmsup_rd_haswell_asm_6x8m,
|
||||
@@ -45,7 +45,7 @@ static dgemmsup_ker_ft kern_fp[] =
|
||||
bli_dgemmsup_rv_haswell_asm_6x8n
|
||||
};
|
||||
|
||||
err_t bli_dgemm_tiny_6x8
|
||||
err_t bli_dgemm_tiny_zen_6x8
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
@@ -202,7 +202,7 @@ err_t bli_dgemm_tiny_6x8
|
||||
*/
|
||||
inc_t ps_a_use = (MR_ * rs_a);
|
||||
bli_auxinfo_set_ps_a( ps_a_use, &aux );
|
||||
dgemmsup_ker_ft kern_ptr = kern_fp[stor_id];
|
||||
dgemmsup_ker_ft kern_ptr = kern_fp_zen[stor_id];
|
||||
|
||||
/**
|
||||
* JC Loop is eliminated as it iterates only once, So computation
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
// Defining separate static arrays to hold all the kernel info, based on the datatype
|
||||
static gemmtiny_ukr_info_t cgemmtiny_ukr_avx2[] =
|
||||
static gemmtiny_ukr_info_t cgemmtiny_ukr_zen[] =
|
||||
{
|
||||
{ (void *)bli_cgemmsup_rv_zen_asm_3x8m, (void *)bli_cpackm_haswell_asm_8xk, TRUE, FALSE, 3, 4 },
|
||||
{ (void *)bli_cgemmsup_rv_zen_asm_3x8m, (void *)bli_cpackm_haswell_asm_8xk, TRUE, TRUE, 3, 4 },
|
||||
@@ -47,7 +47,7 @@ static gemmtiny_ukr_info_t cgemmtiny_ukr_avx2[] =
|
||||
{ (void *)bli_cgemmsup_rv_zen_asm_3x8m, (void *)bli_cpackm_haswell_asm_8xk, TRUE, FALSE, 3, 4 }
|
||||
};
|
||||
|
||||
static gemmtiny_ukr_info_t zgemmtiny_ukr_avx2[] =
|
||||
static gemmtiny_ukr_info_t zgemmtiny_ukr_zen[] =
|
||||
{
|
||||
{ (void *)bli_zgemmsup_rv_zen_asm_3x4m, (void *)bli_zpackm_haswell_asm_4xk, TRUE, FALSE, 3, 4 },
|
||||
{ (void *)bli_zgemmsup_rd_zen_asm_3x4m, (void *)bli_zpackm_haswell_asm_4xk, TRUE, FALSE, 3, 4 },
|
||||
@@ -59,7 +59,7 @@ static gemmtiny_ukr_info_t zgemmtiny_ukr_avx2[] =
|
||||
{ (void *)bli_zgemmsup_rv_zen_asm_3x4m, (void *)bli_zpackm_haswell_asm_4xk, TRUE, FALSE, 3, 4 }
|
||||
};
|
||||
|
||||
// Function macro that defines the bli_?gemmtiny_avx2_ukr_info functions
|
||||
// Function macro that defines the bli_?gemmtiny_ukr_zen_info functions
|
||||
// These are used to acquire the kernel info at framework level
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ftype, ch, tfuncname ) \
|
||||
@@ -84,5 +84,5 @@ err_t PASTEMAC( ch, tfuncname ) \
|
||||
return BLIS_SUCCESS; \
|
||||
} \
|
||||
|
||||
GENTFUNC( scomplex, c, gemmtiny_avx2_ukr_info )
|
||||
GENTFUNC( dcomplex, z, gemmtiny_avx2_ukr_info )
|
||||
GENTFUNC( scomplex, c, gemmtiny_ukr_zen_info )
|
||||
GENTFUNC( dcomplex, z, gemmtiny_ukr_zen_info )
|
||||
@@ -34,9 +34,9 @@
|
||||
|
||||
// Macro to access the appropriate static array(that contains the kernel list),
|
||||
// based on the datatype
|
||||
#define TINY_GEMM_AVX2(ch) ch ## gemmtiny_ukr_avx2
|
||||
#define TINY_GEMM_AVX2(ch) ch ## gemmtiny_ukr_zen
|
||||
|
||||
// Macro prototypes for bli_?gemmtiny_avx2_ukr_info functions
|
||||
// Macro prototypes for bli_?gemmtiny_ukr_zen_info functions
|
||||
// These are used to acquire the kernel info at framework level
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ftype, ch, tfuncname ) \
|
||||
@@ -46,13 +46,13 @@ err_t PASTEMAC( ch, tfuncname ) \
|
||||
gemmtiny_ukr_info_t *fp_info \
|
||||
); \
|
||||
|
||||
GENTFUNC( scomplex, c, gemmtiny_avx2_ukr_info )
|
||||
GENTFUNC( dcomplex, z, gemmtiny_avx2_ukr_info )
|
||||
GENTFUNC( scomplex, c, gemmtiny_ukr_zen_info )
|
||||
GENTFUNC( dcomplex, z, gemmtiny_ukr_zen_info )
|
||||
|
||||
/* Enabling the query for AVX2 kernels, based on the library's configuration */
|
||||
/* Minimum requirement is 'ZEN' */
|
||||
#define LOOKUP_AVX2_UKR( ch, stor_id, ukr_support, gemmtiny_ukr_info ) \
|
||||
{ \
|
||||
/* Call the appropriate function to query the AVX2 object info */ \
|
||||
ukr_support = PASTEMAC(ch, gemmtiny_avx2_ukr_info)( stor_id, &gemmtiny_ukr_info ); \
|
||||
ukr_support = PASTEMAC(ch, gemmtiny_ukr_zen_info)( stor_id, &gemmtiny_ukr_info ); \
|
||||
}
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -56,7 +56,7 @@ typedef err_t (*trsmsmall_ker_ft)
|
||||
//A.'X = B; A is upper triangular;
|
||||
//A has to be transposed; double precision
|
||||
|
||||
BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -71,7 +71,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
|
||||
*/
|
||||
//AX = B; A is lower triangular; transpose; double precision
|
||||
|
||||
BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -85,7 +85,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
|
||||
// XA = B; A is lower-traingular; No transpose;
|
||||
//double precision; non-unit diagonal
|
||||
|
||||
BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -99,7 +99,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
|
||||
//XA = B; A is lower-triangular; A is transposed;
|
||||
// double precision; non-unit-diagonal
|
||||
|
||||
BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -111,7 +111,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
|
||||
/*
|
||||
* ZTRSM kernel declaration
|
||||
*/
|
||||
BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -120,7 +120,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -129,7 +129,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -138,7 +138,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -149,7 +149,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
|
||||
/*
|
||||
* CTRSM kernel declaration
|
||||
*/
|
||||
BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -158,7 +158,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -167,7 +167,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -176,7 +176,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -187,7 +187,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
|
||||
/*
|
||||
* STRSM kernel declaration
|
||||
*/
|
||||
BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -196,7 +196,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -205,7 +205,7 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -214,7 +214,7 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -5027,7 +5027,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref
|
||||
Pack a block of 8xk or 6xk from input buffer into packed buffer
|
||||
directly or after transpose based on input params
|
||||
*/
|
||||
BLIS_INLINE void bli_dtrsm_small_pack
|
||||
BLIS_INLINE void bli_dtrsm_small_zen_int_pack
|
||||
(
|
||||
char side,
|
||||
dim_t size,
|
||||
@@ -5272,7 +5272,7 @@ BLIS_INLINE void bli_dtrsm_small_pack
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
BLIS_INLINE void dtrsm_small_pack_diag_element
|
||||
BLIS_INLINE void dtrsm_small_zen_int_pack_diag_element
|
||||
(
|
||||
bool is_unitdiag,
|
||||
double *a11,
|
||||
@@ -5337,47 +5337,47 @@ BLIS_INLINE void dtrsm_small_pack_diag_element
|
||||
/*
|
||||
* Kernels Table
|
||||
*/
|
||||
trsmsmall_ker_ft ker_fps[4][8] =
|
||||
trsmsmall_ker_ft ker_fps_zen[4][8] =
|
||||
{
|
||||
{bli_strsm_small_AutXB_AlXB,
|
||||
bli_strsm_small_AltXB_AuXB,
|
||||
bli_strsm_small_AltXB_AuXB,
|
||||
bli_strsm_small_AutXB_AlXB,
|
||||
bli_strsm_small_XAutB_XAlB,
|
||||
bli_strsm_small_XAltB_XAuB,
|
||||
bli_strsm_small_XAltB_XAuB,
|
||||
bli_strsm_small_XAutB_XAlB },
|
||||
{bli_strsm_small_zen_int_AutXB_AlXB,
|
||||
bli_strsm_small_zen_int_AltXB_AuXB,
|
||||
bli_strsm_small_zen_int_AltXB_AuXB,
|
||||
bli_strsm_small_zen_int_AutXB_AlXB,
|
||||
bli_strsm_small_zen_int_XAutB_XAlB,
|
||||
bli_strsm_small_zen_int_XAltB_XAuB,
|
||||
bli_strsm_small_zen_int_XAltB_XAuB,
|
||||
bli_strsm_small_zen_int_XAutB_XAlB },
|
||||
|
||||
{bli_ctrsm_small_AutXB_AlXB,
|
||||
bli_ctrsm_small_AltXB_AuXB,
|
||||
bli_ctrsm_small_AltXB_AuXB,
|
||||
bli_ctrsm_small_AutXB_AlXB,
|
||||
bli_ctrsm_small_XAutB_XAlB,
|
||||
bli_ctrsm_small_XAltB_XAuB,
|
||||
bli_ctrsm_small_XAltB_XAuB,
|
||||
bli_ctrsm_small_XAutB_XAlB },
|
||||
{bli_ctrsm_small_zen_int_AutXB_AlXB,
|
||||
bli_ctrsm_small_zen_int_AltXB_AuXB,
|
||||
bli_ctrsm_small_zen_int_AltXB_AuXB,
|
||||
bli_ctrsm_small_zen_int_AutXB_AlXB,
|
||||
bli_ctrsm_small_zen_int_XAutB_XAlB,
|
||||
bli_ctrsm_small_zen_int_XAltB_XAuB,
|
||||
bli_ctrsm_small_zen_int_XAltB_XAuB,
|
||||
bli_ctrsm_small_zen_int_XAutB_XAlB },
|
||||
|
||||
{bli_dtrsm_small_AutXB_AlXB,
|
||||
bli_dtrsm_small_AltXB_AuXB,
|
||||
bli_dtrsm_small_AltXB_AuXB,
|
||||
bli_dtrsm_small_AutXB_AlXB,
|
||||
bli_dtrsm_small_XAutB_XAlB,
|
||||
bli_dtrsm_small_XAltB_XAuB,
|
||||
bli_dtrsm_small_XAltB_XAuB,
|
||||
bli_dtrsm_small_XAutB_XAlB },
|
||||
{bli_dtrsm_small_zen_int_AutXB_AlXB,
|
||||
bli_dtrsm_small_zen_int_AltXB_AuXB,
|
||||
bli_dtrsm_small_zen_int_AltXB_AuXB,
|
||||
bli_dtrsm_small_zen_int_AutXB_AlXB,
|
||||
bli_dtrsm_small_zen_int_XAutB_XAlB,
|
||||
bli_dtrsm_small_zen_int_XAltB_XAuB,
|
||||
bli_dtrsm_small_zen_int_XAltB_XAuB,
|
||||
bli_dtrsm_small_zen_int_XAutB_XAlB },
|
||||
|
||||
{bli_ztrsm_small_AutXB_AlXB,
|
||||
bli_ztrsm_small_AltXB_AuXB,
|
||||
bli_ztrsm_small_AltXB_AuXB,
|
||||
bli_ztrsm_small_AutXB_AlXB,
|
||||
bli_ztrsm_small_XAutB_XAlB,
|
||||
bli_ztrsm_small_XAltB_XAuB,
|
||||
bli_ztrsm_small_XAltB_XAuB,
|
||||
bli_ztrsm_small_XAutB_XAlB },
|
||||
{bli_ztrsm_small_zen_int_AutXB_AlXB,
|
||||
bli_ztrsm_small_zen_int_AltXB_AuXB,
|
||||
bli_ztrsm_small_zen_int_AltXB_AuXB,
|
||||
bli_ztrsm_small_zen_int_AutXB_AlXB,
|
||||
bli_ztrsm_small_zen_int_XAutB_XAlB,
|
||||
bli_ztrsm_small_zen_int_XAltB_XAuB,
|
||||
bli_ztrsm_small_zen_int_XAltB_XAuB,
|
||||
bli_ztrsm_small_zen_int_XAutB_XAlB },
|
||||
};
|
||||
|
||||
/*
|
||||
* The bli_trsm_small implements a version of TRSM where A is packed and reused
|
||||
* The bli_trsm_small_zen implements a version of TRSM where A is packed and reused
|
||||
*
|
||||
* Input: A: MxM (triangular matrix)
|
||||
* B: MxN matrix
|
||||
@@ -5387,7 +5387,7 @@ trsmsmall_ker_ft ker_fps[4][8] =
|
||||
*
|
||||
* Note: Currently only dtrsm is supported when A & B are column-major
|
||||
*/
|
||||
err_t bli_trsm_small
|
||||
err_t bli_trsm_small_zen
|
||||
(
|
||||
side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -5461,7 +5461,7 @@ err_t bli_trsm_small
|
||||
( transa & 0x1) );
|
||||
|
||||
|
||||
trsmsmall_ker_ft ker_fp = ker_fps[dt][ keridx ];
|
||||
trsmsmall_ker_ft ker_fp = ker_fps_zen[dt][ keridx ];
|
||||
|
||||
/*Call the kernel*/
|
||||
err = ker_fp
|
||||
@@ -5481,7 +5481,7 @@ err_t bli_trsm_small
|
||||
* Parallelized dtrsm_small across m-dimension or n-dimension based on side(Left/Right)
|
||||
*/
|
||||
|
||||
err_t bli_trsm_small_mt
|
||||
err_t bli_trsm_small_zen_mt
|
||||
(
|
||||
side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -5545,7 +5545,7 @@ err_t bli_trsm_small_mt
|
||||
{
|
||||
if(tid == 0)
|
||||
{
|
||||
bli_trsm_small
|
||||
bli_trsm_small_zen
|
||||
(
|
||||
side,
|
||||
alpha,
|
||||
@@ -5607,7 +5607,7 @@ err_t bli_trsm_small_mt
|
||||
// all threads
|
||||
err_t status_l = BLIS_SUCCESS;
|
||||
|
||||
status_l = bli_trsm_small
|
||||
status_l = bli_trsm_small_zen
|
||||
(
|
||||
side,
|
||||
alpha,
|
||||
@@ -8319,7 +8319,7 @@ BLIS_INLINE err_t ztrsm_AuXB_ref
|
||||
}\
|
||||
}
|
||||
|
||||
BLIS_INLINE void bli_ztrsm_small_pack
|
||||
BLIS_INLINE void bli_ztrsm_small_zen_int_pack
|
||||
(
|
||||
char side,
|
||||
dim_t size,
|
||||
@@ -8465,7 +8465,7 @@ BLIS_INLINE void bli_ztrsm_small_pack
|
||||
|
||||
}
|
||||
|
||||
BLIS_INLINE void ztrsm_small_pack_diag_element
|
||||
BLIS_INLINE void ztrsm_small_zen_pack_diag_element
|
||||
(
|
||||
bool is_unitdiag,
|
||||
dcomplex *a11,
|
||||
@@ -8545,7 +8545,7 @@ b11 * * * * * **a01 * * a11
|
||||
|
||||
*/
|
||||
|
||||
BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -8661,7 +8661,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
|
||||
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all m cols of B matrix
|
||||
*/
|
||||
bli_dtrsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
bli_dtrsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
|
||||
/*
|
||||
Pack 6 diagonal elements of A block into an array
|
||||
@@ -8669,12 +8669,12 @@ BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dtrsm_small_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
bli_dtrsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -10954,7 +10954,7 @@ b10 ***************** *************
|
||||
***************** *******************
|
||||
|
||||
*/
|
||||
BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -11070,19 +11070,19 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
|
||||
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all m cols of B matrix
|
||||
*/
|
||||
bli_dtrsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
bli_dtrsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
|
||||
/*
|
||||
Pack 6 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dtrsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
bli_dtrsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -13283,7 +13283,7 @@ BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB
|
||||
* A is lower-triangular, transpose, non-unit diagonal
|
||||
* dimensions A: mxm X: mxn B: mxn
|
||||
*/
|
||||
BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -13402,19 +13402,19 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
|
||||
until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_dtrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
|
||||
bli_dtrsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 8 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dtrsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_dtrsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -15296,7 +15296,7 @@ a10 ****** b11 *****************
|
||||
**************** *****************
|
||||
a11--->
|
||||
*/
|
||||
BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_dtrsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -15410,19 +15410,19 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
|
||||
until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_dtrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
bli_dtrsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 8 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_dtrsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
dtrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_dtrsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
dtrsm_small_zen_int_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -17361,7 +17361,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
BLIS_INLINE void strsm_small_pack_diag_element
|
||||
BLIS_INLINE void strsm_small_zen_int_pack_diag_element
|
||||
(
|
||||
char side,
|
||||
bool is_unitdiag,
|
||||
@@ -17477,7 +17477,7 @@ BLIS_INLINE void strsm_small_pack_diag_element
|
||||
Pack a block of 16xk or 6xk from input buffer into packed buffer
|
||||
directly or after transpose based on input params
|
||||
*/
|
||||
BLIS_INLINE void bli_strsm_small_pack
|
||||
BLIS_INLINE void bli_strsm_small_zen_int_pack
|
||||
(
|
||||
char side,
|
||||
dim_t size,
|
||||
@@ -17903,7 +17903,7 @@ b10 ***************** *************
|
||||
***************** *******************
|
||||
|
||||
*/
|
||||
BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -18015,19 +18015,19 @@ BLIS_INLINE err_t bli_strsm_small_XAutB_XAlB
|
||||
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all m cols of B matrix
|
||||
*/
|
||||
bli_strsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
bli_strsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
|
||||
/*
|
||||
Pack 6 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
strsm_small_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_strsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
strsm_small_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
bli_strsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -21572,7 +21572,7 @@ b11 * * * * * **a01 * * a11
|
||||
|
||||
*/
|
||||
|
||||
BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -21686,7 +21686,7 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
|
||||
until it reaches 6x(n-6) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all m cols of B matrix
|
||||
*/
|
||||
bli_strsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
bli_strsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
|
||||
/*
|
||||
Pack 6 diagonal elements of A block into an array
|
||||
@@ -21694,12 +21694,12 @@ BLIS_INLINE err_t bli_strsm_small_XAltB_XAuB
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
|
||||
strsm_small_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_strsm_small_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
strsm_small_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
bli_strsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
strsm_small_zen_int_pack_diag_element('R',is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -25419,7 +25419,7 @@ a10 ****** b11 *****************
|
||||
**************** *****************
|
||||
a11--->
|
||||
*/
|
||||
BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -25537,19 +25537,19 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
|
||||
until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_strsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
bli_strsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 16 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_strsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
strsm_small_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_strsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -29799,7 +29799,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
|
||||
* A is lower-triangular, transpose, non-unit diagonal
|
||||
* dimensions A: mxm X: mxn B: mxn
|
||||
*/
|
||||
BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_strsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -29921,19 +29921,19 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
|
||||
until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_strsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
|
||||
bli_strsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 16 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_strsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
|
||||
strsm_small_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_strsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
|
||||
strsm_small_zen_int_pack_diag_element('L',is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -33952,7 +33952,7 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -34071,19 +34071,19 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
|
||||
in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_ztrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
bli_ztrsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 4 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ztrsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_ztrsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
/*
|
||||
a. Perform GEMM using a10, b01.
|
||||
@@ -35194,7 +35194,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -35317,19 +35317,19 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
|
||||
in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_ztrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
|
||||
bli_ztrsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 4 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ztrsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_ztrsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack,p_lda,d_mr);
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -36426,7 +36426,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -36534,7 +36534,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
b. This packed buffer is reused to calculate all m cols of B
|
||||
matrix
|
||||
*/
|
||||
bli_ztrsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack,
|
||||
bli_ztrsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack,
|
||||
p_lda,d_nr);
|
||||
|
||||
/*
|
||||
@@ -36543,14 +36543,14 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ztrsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack,
|
||||
bli_ztrsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack,
|
||||
p_lda,d_nr);
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
d11_pack,d_nr);
|
||||
}
|
||||
|
||||
@@ -37327,12 +37327,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
else
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
}
|
||||
@@ -37732,12 +37732,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
else
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
}
|
||||
@@ -37893,7 +37893,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_ztrsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -38000,7 +38000,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
|
||||
b. This packed buffer is reused to calculate all m cols of
|
||||
B matrix
|
||||
*/
|
||||
bli_ztrsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
bli_ztrsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
|
||||
/*
|
||||
Pack 3 diagonal elements of A block into an array
|
||||
@@ -38008,14 +38008,14 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
|
||||
operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ztrsm_small_pack('R', j, 0, a01, rs_a, D_A_pack,
|
||||
bli_ztrsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack,
|
||||
p_lda,d_nr);
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
d11_pack,d_nr);
|
||||
}
|
||||
|
||||
@@ -38762,12 +38762,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
else
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
}
|
||||
@@ -39164,12 +39164,12 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
else
|
||||
{
|
||||
ztrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
ztrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,
|
||||
d11_pack,n_remainder);
|
||||
}
|
||||
}
|
||||
@@ -39583,7 +39583,7 @@ BLIS_INLINE err_t ctrsm_AlXB_ref
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE void bli_ctrsm_small_pack
|
||||
BLIS_INLINE void bli_ctrsm_small_zen_int_pack
|
||||
(
|
||||
char side,
|
||||
dim_t size,
|
||||
@@ -39768,7 +39768,7 @@ BLIS_INLINE void bli_ctrsm_small_pack
|
||||
}
|
||||
}
|
||||
|
||||
BLIS_INLINE void ctrsm_small_pack_diag_element
|
||||
BLIS_INLINE void ctrsm_small_zen_pack_diag_element
|
||||
(
|
||||
bool is_unitdiag,
|
||||
scomplex *a11,
|
||||
@@ -42491,7 +42491,7 @@ BLIS_INLINE void ctrsm_small_pack_diag_element
|
||||
_mm256_storeu_ps((float *)(b11 + cs_b * 2 + 4), ymm2);\
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AutXB_AlXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -42615,19 +42615,19 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
|
||||
in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_ctrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
bli_ctrsm_small_zen_int_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 4 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ctrsm_small_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_ctrsm_small_zen_int_pack('L', i, 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
/*
|
||||
a. Perform GEMM using a10, b01.
|
||||
@@ -44131,11 +44131,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,m_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,m_rem);
|
||||
}
|
||||
else
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,m_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,m_rem);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45028,7 +45028,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_AltXB_AuXB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -45155,19 +45155,19 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
|
||||
in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_ctrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
bli_ctrsm_small_zen_int_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack, p_lda,d_mr);
|
||||
|
||||
/*
|
||||
Pack 4 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ctrsm_small_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
bli_ctrsm_small_zen_int_pack('L', (m-i-d_mr), 0, a10, rs_a, D_A_pack, p_lda,d_mr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_mr);
|
||||
}
|
||||
/*
|
||||
a. Perform GEMM using a10, b01.
|
||||
@@ -46880,11 +46880,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,4);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,4);
|
||||
}
|
||||
else
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,4);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,4);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47815,7 +47815,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAutB_XAlB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -47934,19 +47934,19 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
|
||||
in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_ctrsm_small_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
bli_ctrsm_small_zen_int_pack('R', p_lda, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
|
||||
/*
|
||||
Pack 4 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ctrsm_small_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
bli_ctrsm_small_zen_int_pack('R', p_lda, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
}
|
||||
/*
|
||||
a. Perform GEMM using a10, b01.
|
||||
@@ -48684,11 +48684,11 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
}
|
||||
else
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49202,7 +49202,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
|
||||
|
||||
if(!is_unitdiag)
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
}
|
||||
|
||||
for(i = (m-d_mr); (i+1) > 0; i -= d_mr)
|
||||
@@ -49438,7 +49438,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAutB_XAlB
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
|
||||
BLIS_INLINE err_t bli_ctrsm_small_zen_int_XAltB_XAuB
|
||||
(
|
||||
obj_t* AlphaObj,
|
||||
obj_t* a,
|
||||
@@ -49558,19 +49558,19 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
|
||||
in A
|
||||
b. This packed buffer is reused to calculate all n rows of B matrix
|
||||
*/
|
||||
bli_ctrsm_small_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
bli_ctrsm_small_zen_int_pack('R', j, 1, a01, cs_a, D_A_pack, p_lda,d_nr);
|
||||
|
||||
/*
|
||||
Pack 4 diagonal elements of A block into an array
|
||||
a. This helps to utilize cache line efficiently in TRSM operation
|
||||
b. store ones when input is unit diagonal
|
||||
*/
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_ctrsm_small_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
bli_ctrsm_small_zen_int_pack('R', j, 0, a01, rs_a, D_A_pack, p_lda,d_nr);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,d_nr);
|
||||
}
|
||||
/*
|
||||
a. Perform GEMM using a10, b01.
|
||||
@@ -50314,11 +50314,11 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
|
||||
{
|
||||
if(transa)
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
}
|
||||
else
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,rs_a,d11_pack,n_rem);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50843,7 +50843,7 @@ BLIS_INLINE err_t bli_ctrsm_small_XAltB_XAuB
|
||||
|
||||
if(!is_unitdiag)
|
||||
{
|
||||
ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
ctrsm_small_zen_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,n_rem);
|
||||
}
|
||||
|
||||
for(i = 0; (i+d_mr-1) < m; i += d_mr)
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -90,7 +90,7 @@
|
||||
with k == 1. It expects the inputs and output to support the column-major storage
|
||||
scheme, without any requirement to conjugate/transpose any of the operands. */
|
||||
|
||||
err_t bli_zgemm_4x4_avx2_k1_nn
|
||||
err_t bli_zgemm_zen_int_4x4_k1_nn
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
@@ -34,7 +34,7 @@
|
||||
*/
|
||||
|
||||
// Including the header for tiny gemm kernel signatures
|
||||
#include "bli_gemm_tiny_avx2.h"
|
||||
#include "bli_gemm_tiny_zen.h"
|
||||
|
||||
// -- level-1m --
|
||||
// Removed - reference packm kernels are used
|
||||
@@ -59,28 +59,28 @@ AXPBYV_KER_PROT( scomplex, c, axpbyv_zen_int )
|
||||
AXPBYV_KER_PROT( dcomplex, z, axpbyv_zen_int )
|
||||
|
||||
// axpbyv (intrinsics, unrolled x10)
|
||||
AXPBYV_KER_PROT( float, s, axpbyv_zen_int10 )
|
||||
AXPBYV_KER_PROT( double, d, axpbyv_zen_int10 )
|
||||
AXPBYV_KER_PROT( float, s, axpbyv_zen_int_10 )
|
||||
AXPBYV_KER_PROT( double, d, axpbyv_zen_int_10 )
|
||||
|
||||
// axpyv (intrinsics)
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int )
|
||||
AXPYV_KER_PROT( double, d, axpyv_zen_int )
|
||||
|
||||
// axpyv (intrinsics unrolled x10)
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
|
||||
BLIS_EXPORT_BLIS AXPYV_KER_PROT( double, d, axpyv_zen_int10 )
|
||||
AXPYV_KER_PROT( scomplex, c, axpyv_zen_int5 )
|
||||
AXPYV_KER_PROT( dcomplex, z, axpyv_zen_int5 )
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int_10 )
|
||||
BLIS_EXPORT_BLIS AXPYV_KER_PROT( double, d, axpyv_zen_int_10 )
|
||||
AXPYV_KER_PROT( scomplex, c, axpyv_zen_int_5 )
|
||||
AXPYV_KER_PROT( dcomplex, z, axpyv_zen_int_5 )
|
||||
|
||||
// dotv (intrinsics)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int )
|
||||
|
||||
// dotv (intrinsics, unrolled x10)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( scomplex, c, dotv_zen_int5 )
|
||||
DOTV_KER_PROT( dcomplex, z, dotv_zen_int5 )
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int_10 )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int_10 )
|
||||
DOTV_KER_PROT( scomplex, c, dotv_zen_int_5 )
|
||||
DOTV_KER_PROT( dcomplex, z, dotv_zen_int_5 )
|
||||
|
||||
// dotxv (intrinsics)
|
||||
DOTXV_KER_PROT( float, s, dotxv_zen_int )
|
||||
@@ -95,13 +95,13 @@ SCALV_KER_PROT( scomplex, c, scalv_zen_int )
|
||||
SCALV_KER_PROT( dcomplex, z, scalv_zen_int )
|
||||
|
||||
// scalv (intrinsics unrolled x10)
|
||||
SCALV_KER_PROT( float, s, scalv_zen_int10 )
|
||||
BLIS_EXPORT_BLIS SCALV_KER_PROT( double, d, scalv_zen_int10 )
|
||||
SCALV_KER_PROT( dcomplex, z, dscalv_zen_int10 )
|
||||
SCALV_KER_PROT( float, s, scalv_zen_int_10 )
|
||||
BLIS_EXPORT_BLIS SCALV_KER_PROT( double, d, scalv_zen_int_10 )
|
||||
SCALV_KER_PROT( dcomplex, z, dscalv_zen_int_10 )
|
||||
|
||||
// swapv (intrinsics)
|
||||
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
|
||||
BLIS_EXPORT_BLIS SWAPV_KER_PROT(double, d, swapv_zen_int8 )
|
||||
SWAPV_KER_PROT(float, s, swapv_zen_int_8 )
|
||||
BLIS_EXPORT_BLIS SWAPV_KER_PROT(double, d, swapv_zen_int_8 )
|
||||
|
||||
// copyv (intrinsics)
|
||||
COPYV_KER_PROT( float, s, copyv_zen_int )
|
||||
@@ -328,7 +328,7 @@ err_t bli_dgemm_tiny
|
||||
double* c, const inc_t rs_c0, const inc_t cs_c0
|
||||
);
|
||||
|
||||
err_t bli_dgemm_tiny_6x8
|
||||
err_t bli_dgemm_tiny_zen_6x8
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
@@ -388,7 +388,7 @@ err_t bli_zgemm_small_At
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
err_t bli_dgemm_8x6_avx2_k1_nn
|
||||
err_t bli_dgemm_zen_int_8x6_k1_nn
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
@@ -400,7 +400,7 @@ err_t bli_dgemm_8x6_avx2_k1_nn
|
||||
double* c, const inc_t ldc
|
||||
);
|
||||
|
||||
err_t bli_zgemm_4x4_avx2_k1_nn
|
||||
err_t bli_zgemm_zen_int_4x4_k1_nn
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
@@ -412,7 +412,7 @@ err_t bli_zgemm_4x4_avx2_k1_nn
|
||||
dcomplex* c, const inc_t ldc
|
||||
);
|
||||
|
||||
err_t bli_trsm_small
|
||||
err_t bli_trsm_small_zen
|
||||
(
|
||||
side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -424,7 +424,7 @@ err_t bli_trsm_small
|
||||
);
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
err_t bli_trsm_small_mt
|
||||
err_t bli_trsm_small_zen_mt
|
||||
(
|
||||
side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -480,7 +480,7 @@ bool bli_cntx_trsm_small_thresh_is_met_zen
|
||||
dim_t n
|
||||
);
|
||||
|
||||
void bli_snorm2fv_unb_var1_avx2
|
||||
void bli_snorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
float* x, inc_t incx,
|
||||
@@ -488,7 +488,7 @@ void bli_snorm2fv_unb_var1_avx2
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
void bli_dnorm2fv_unb_var1_avx2
|
||||
void bli_dnorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
double* x, inc_t incx,
|
||||
@@ -496,7 +496,7 @@ void bli_dnorm2fv_unb_var1_avx2
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
void bli_scnorm2fv_unb_var1_avx2
|
||||
void bli_scnorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
scomplex* x, inc_t incx,
|
||||
@@ -504,7 +504,7 @@ void bli_scnorm2fv_unb_var1_avx2
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
void bli_dznorm2fv_unb_var1_avx2
|
||||
void bli_dznorm2fv_zen_int_unb_var1
|
||||
(
|
||||
dim_t n,
|
||||
dcomplex* x, inc_t incx,
|
||||
@@ -543,7 +543,7 @@ void bli_sgemv_zen_ref
|
||||
cntx_t* restrict cntx
|
||||
);
|
||||
|
||||
void bli_dgemv_n_avx2
|
||||
void bli_dgemv_n_zen
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
void bli_daddv_zen_int_avx512
|
||||
void bli_daddv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -139,7 +139,7 @@ typedef union
|
||||
);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
void bli_samaxv_zen_int_avx512(
|
||||
void bli_samaxv_zen4_int(
|
||||
dim_t n,
|
||||
float *restrict x, inc_t incx,
|
||||
dim_t *restrict i_max,
|
||||
@@ -443,7 +443,7 @@ void bli_samaxv_zen_int_avx512(
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------------------------------*/
|
||||
BLIS_EXPORT_BLIS void bli_damaxv_zen_int_avx512
|
||||
BLIS_EXPORT_BLIS void bli_damaxv_zen4_int
|
||||
(
|
||||
dim_t n,
|
||||
double *restrict x, inc_t incx,
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -49,7 +49,7 @@ typedef union
|
||||
* x & y are double precision vectors of length n.
|
||||
* alpha & beta are scalars.
|
||||
*/
|
||||
void bli_daxpbyv_zen_int_avx512
|
||||
void bli_daxpbyv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -72,7 +72,7 @@ void bli_daxpbyv_zen_int_avx512
|
||||
// When beta = !( 0 or 1 ) --> DSCALV
|
||||
if ( bli_deq0( *alpha ) )
|
||||
{
|
||||
bli_dscalv_zen_int10
|
||||
bli_dscalv_zen_int_10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2023 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -76,7 +76,7 @@
|
||||
The expectation is that these are standard BLAS exceptions and should be handled in
|
||||
a higher layer
|
||||
*/
|
||||
void bli_saxpyv_zen_int_avx512
|
||||
void bli_saxpyv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -282,7 +282,7 @@ void bli_saxpyv_zen_int_avx512
|
||||
The expectation is that these are standard BLAS exceptions and should be handled in
|
||||
a higher layer
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_daxpyv_zen_int_avx512
|
||||
BLIS_EXPORT_BLIS void bli_daxpyv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -487,7 +487,7 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int_avx512
|
||||
The expectation is that these are standard BLAS exceptions and should be handled in
|
||||
a higher layer
|
||||
*/
|
||||
void bli_zaxpyv_zen_int_avx512
|
||||
void bli_zaxpyv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -77,7 +77,7 @@
|
||||
a higher layer
|
||||
*/
|
||||
|
||||
void bli_scopyv_zen4_asm_avx512
|
||||
void bli_scopyv_zen4_asm
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -405,7 +405,7 @@ void bli_scopyv_zen4_asm_avx512
|
||||
*/
|
||||
|
||||
// This function is used to copy the vector x to vector y using AVX512 instructions
|
||||
void bli_dcopyv_zen4_asm_avx512
|
||||
void bli_dcopyv_zen4_asm
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -691,7 +691,7 @@ void bli_dcopyv_zen4_asm_avx512
|
||||
}
|
||||
|
||||
// This function is used to copy the vector x to vector y using AVX512 instructions in a two directional way
|
||||
void bli_dcopyv_zen4_asm_avx512_biway
|
||||
void bli_dcopyv_zen4_asm_biway
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -1043,7 +1043,7 @@ void bli_dcopyv_zen4_asm_avx512_biway
|
||||
a higher layer
|
||||
*/
|
||||
|
||||
void bli_zcopyv_zen4_asm_avx512
|
||||
void bli_zcopyv_zen4_asm
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2024 - 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -75,7 +75,7 @@
|
||||
a higher layer
|
||||
*/
|
||||
|
||||
void bli_scopyv_zen_int_avx512
|
||||
void bli_scopyv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -388,7 +388,7 @@ void bli_scopyv_zen_int_avx512
|
||||
a higher layer
|
||||
*/
|
||||
|
||||
void bli_dcopyv_zen_int_avx512
|
||||
void bli_dcopyv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
@@ -700,7 +700,7 @@ void bli_dcopyv_zen_int_avx512
|
||||
a higher layer
|
||||
*/
|
||||
|
||||
void bli_zcopyv_zen_int_avx512
|
||||
void bli_zcopyv_zen4_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user