Vectorized and parallelized zdscal routine

- Implemented an optimized intrinsics-based kernel for zdscalv for cases where AVX2 is supported. - Also added multithreading support for the same kernel. - The optimal number of threads is calculated based on the input size. AMD-Internal: [CPUPL-2602] Change-Id: I4d05c3b1cc365a7770703286a89c6dce3875c067
2026-05-11 17:50:00 +00:00 · 2022-09-29 23:32:10 +05:30
parent 9c292b79e2
commit 90f915d3a9
4 changed files with 403 additions and 2 deletions
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -159,8 +159,8 @@ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )
 \
 GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
 GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )
+GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname )
+// GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )


 #define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \