From 718c6bc024d3a6c8f4c6987e098aba41e914db8c Mon Sep 17 00:00:00 2001 From: Harsh Dave Date: Thu, 23 Dec 2021 04:44:24 -0600 Subject: [PATCH] Optimized daxpy2v implementation - Optimized axpy2v implementation for double datatype by handling rows in multiples of 4 and storing the final computed result at the end of computation, preventing unnecessary stores to improve performance. - Optimal use and reuse of vector registers for faster computation. AMD-Internal: [CPUPL-1973] Change-Id: I7b8ef94d0f67c1c666fdce26e9b2b7291365d2e9 --- kernels/zen/1f/CMakeLists.txt | 3 +++ kernels/zen/1f/bli_axpy2v_zen_int.c | 5 ++++- kernels/zen/bli_kernels_zen.h | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/kernels/zen/1f/CMakeLists.txt b/kernels/zen/1f/CMakeLists.txt index 3a77f69ef..b020d8c92 100644 --- a/kernels/zen/1f/CMakeLists.txt +++ b/kernels/zen/1f/CMakeLists.txt @@ -8,5 +8,8 @@ target_sources("${PROJECT_NAME}" ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_4.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_6.c ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpy2v_zen_int.c +<<<<<<< HEAD ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c +======= +>>>>>>> 8b5b2707... Optimized daxpy2v implementation ) diff --git a/kernels/zen/1f/bli_axpy2v_zen_int.c b/kernels/zen/1f/bli_axpy2v_zen_int.c index cba014137..26d307eda 100644 --- a/kernels/zen/1f/bli_axpy2v_zen_int.c +++ b/kernels/zen/1f/bli_axpy2v_zen_int.c @@ -186,6 +186,7 @@ void bli_daxpy2v_zen_int ); } } +<<<<<<< HEAD /** * zaxpy2v kernel performs axpy2v operation. @@ -718,4 +719,6 @@ void bli_zaxpy2v_zen_int } } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4) -} \ No newline at end of file +} +======= +>>>>>>> 8b5b2707... 
Optimized daxpy2v implementation diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 5444c90ea..f7083c915 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -32,6 +32,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +<<<<<<< HEAD +======= +// hemv helper function +void bli_pre_hemv_8x8(double *a, double *x, + double *y, double *alpha, + dim_t cs_a, dim_t rs_a); + +void bli_post_hemv_8x8(double *a, double *x, + double *y, double *alpha, + dim_t cs_a, dim_t rs_a); + + +>>>>>>> 8b5b2707... Optimized daxpy2v implementation // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) @@ -129,7 +142,10 @@ AXPYF_KER_PROT( dcomplex, z, axpyf_zen_int_5 ) AXPYF_KER_PROT( dcomplex, z, axpyf_zen_int_4 ) // axpy2v (intrinsics) AXPY2V_KER_PROT(double, d, axpy2v_zen_int ) +<<<<<<< HEAD AXPY2V_KER_PROT(dcomplex, z, axpy2v_zen_int ) +======= +>>>>>>> 8b5b2707... Optimized daxpy2v implementation // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 )