Optimized daxpy2v implementation

- Optimized the axpy2v implementation for the double
  datatype by handling rows in multiples of 4
  and storing the final computed result at the
  end of the computation, preventing unnecessary
  stores and improving performance.

- Optimal use and reuse of vector registers for
  faster computation.

AMD-Internal: [CPUPL-1973]
Change-Id: I7b8ef94d0f67c1c666fdce26e9b2b7291365d2e9
This commit is contained in:
Harsh Dave
2021-12-23 04:44:24 -06:00
committed by Dipal M Zambare
parent 43c16d8e08
commit 718c6bc024
3 changed files with 23 additions and 1 deletions

View File

@@ -8,5 +8,8 @@ target_sources("${PROJECT_NAME}"
${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_4.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_6.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_axpy2v_zen_int.c
<<<<<<< HEAD
${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c
=======
>>>>>>> 8b5b2707... Optimized daxpy2v implementation
)

View File

@@ -186,6 +186,7 @@ void bli_daxpy2v_zen_int
);
}
}
<<<<<<< HEAD
/**
* zaxpy2v kernel performs axpy2v operation.
@@ -718,4 +719,6 @@ void bli_zaxpy2v_zen_int
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
}
}
=======
>>>>>>> 8b5b2707... Optimized daxpy2v implementation

View File

@@ -32,6 +32,19 @@
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
<<<<<<< HEAD
=======
// hemv helper function
void bli_pre_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
void bli_post_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
>>>>>>> 8b5b2707... Optimized daxpy2v implementation
// -- level-1m --
PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
PACKM_KER_PROT(double, d, packm_6xk_gen_zen)
@@ -129,7 +142,10 @@ AXPYF_KER_PROT( dcomplex, z, axpyf_zen_int_5 )
AXPYF_KER_PROT( dcomplex, z, axpyf_zen_int_4 )
// axpy2v (intrinsics)
AXPY2V_KER_PROT(double, d, axpy2v_zen_int )
<<<<<<< HEAD
AXPY2V_KER_PROT(dcomplex, z, axpy2v_zen_int )
=======
>>>>>>> 8b5b2707... Optimized daxpy2v implementation
// dotxf (intrinsics)
DOTXF_KER_PROT( float, s, dotxf_zen_int_8 )