SCALV alpha=zero BLAS compliance

SCALV is used directly by BLAS, CBLAS and BLIS scal{v} APIs but also within many other APIs to handle special cases. In general it is preferred to use SETV when alpha=0, but BLAS and CBLAS continue to multiple all vector element by alpha. This has different behaviour for propagating NaNs or Infs. Changes in this commit: - Standardize early returns from SCALV reference and optimized kernels. - User supplied N<0 is handled at the top level API layer. Use negative values of N in kernel calls to signify that SETV should _not_ be used when alpha=0. This should only be required in SCALV. - Include serial threshold in zdscal (as in dscal) to reduce overhead for small problem sizes. - Code tidying to make different variants more consistent. - More standardization of tests in SCALV gtestsuite programs. - Remove scalv_extreme_cases.cpp as it is now redundant. AMD-Internal: [CPUPL-4415] Change-Id: I42e98875ceaea224cc98d0cdfe0133c9abc3edae (cherry picked from commit a07e041b1f)
2026-07-03 05:37:51 +00:00 · 2024-09-05 09:31:34 -04:00
parent 601e4fbb87
commit 4eabb5cd2e
15 changed files with 1297 additions and 595 deletions
--- a/frame/compat/bla_scal.c
+++ b/frame/compat/bla_scal.c
@@ -5,7 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNCSCAL
-#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
+#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
 \
 void PASTEF772S(chx,cha,blasname) \
     ( \
@@ -50,44 +50,42 @@ void PASTEF772S(chx,cha,blasname) \
     ) \
 { \
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
-	dim_t    n0; \
-	ftype_x* x0; \
-	inc_t    incx0; \
-	ftype_x  alpha_cast; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
 \
-	if (*n == 0 || alpha == NULL) { \
+	dim_t n0 = (dim_t)(*n); \
+	ftype_x *x0 = x; \
+	inc_t incx0 = (inc_t)(*incx); \
+\
+	if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
+	{ \
 		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
 		return ; \
 	} \
-\
-	/* Convert/typecast negative values of n to zero. */ \
-	bli_convert_blas_dim1( *n, n0 ); \
-\
-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */ \
-	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
 \
 	/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
 	   that is, we just always sub-optimally implement those cases
 	   by casting alpha to ctype_x (potentially the complex domain) and
 	   using the homogeneous datatype instance according to that type. */ \
+	ftype_x  alpha_cast; \
 	PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
 \
 	/* Call BLIS interface. */ \
+	/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
 	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
-	  n0, \
+	  -n0, \
 	  &alpha_cast, \
 	  x0, incx0, \
 	  NULL, \
 	  NULL  \
 	); \
 \
-  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
 }\
--- a/frame/compat/bla_scal_amd.c
+++ b/frame/compat/bla_scal_amd.c
@@ -50,13 +50,16 @@

  1. When alpha == NaN - Propogate the NaN to the vector
  2. When alpha == 0 - Perform the SCALV operation completely and don't use setv.
+     As SCALV kernels are used in many other BLAS APIs where we want setv to be
+     used in this scenario, here we call the kernels with n=-n to signify that
+     setv should not be used.
 */

 //
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNCSCAL
-#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
+#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
 \
 void PASTEF772S(chx,cha,blasname) \
     ( \
@@ -66,55 +69,42 @@ void PASTEF772S(chx,cha,blasname) \
     ) \
 { \
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
-	dim_t    n0; \
-	ftype_x* x0; \
-	inc_t    incx0; \
-	ftype_x  alpha_cast; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
 \
-	if (*n == 0 || alpha == NULL) { \
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
-		return ; \
-	} \
+	dim_t n0 = (dim_t)(*n); \
+	ftype_x *x0 = x; \
+	inc_t incx0 = (inc_t)(*incx); \
 \
-	/* Convert/typecast negative values of n to zero. */ \
-	bli_convert_blas_dim1( *n, n0 ); \
-\
-	/* If the input increments are less than or equal to zero, return. */ \
-	if ( (*incx) <= 0 ) { \
+	if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
+	{ \
 		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
 		return ; \
-	} else { \
-		incx0 = ( inc_t )(*incx); \
-		x0 = (x); \
 	} \
 \
 	/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
 	   that is, we just always sub-optimally implement those cases
 	   by casting alpha to ctype_x (potentially the complex domain) and
 	   using the homogeneous datatype instance according to that type. */ \
+	ftype_x  alpha_cast; \
 	PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
-\
-	/* If alpha is a unit scalar, return early. */ \
-	if ( PASTEMAC(c, eq1)(alpha_cast) ) { \
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
-		return ; \
-	} \
 \
 	/* Call BLIS interface. */ \
+	/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
 	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
-	  n0, \
+	  -n0, \
 	  &alpha_cast, \
 	  x0, incx0, \
 	  NULL, \
 	  NULL  \
 	); \
 \
-  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
 }\
@@ -139,82 +129,72 @@ void sscal_blis_impl
 {
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
-    dim_t  n0;
-    float* x0;
-    inc_t  incx0;
+
    /* Initialize BLIS. */
    //bli_init_auto();

-    if ((*n) <= 0 || alpha == NULL || bli_seq1(*alpha))
+    dim_t n0 = (dim_t)(*n);
+    float *x0 = x;
+    inc_t incx0 = (inc_t)(*incx);
+
+    /*
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
+    */
+    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha))
    {
-      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-      return;
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
+        return;
    }

-    /* Convert/typecast negative values of n to zero. */
-    if ( *n < 0 ) n0 = ( dim_t )0;
-    else          n0 = ( dim_t )(*n);
-
-    /* If the input increments are less than or equal to zero, return. */
-    if ( (*incx) <= 0 )
-    {
-      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-      return ;
-    }
-    else
-    {
-      x0    = (x);
-      incx0 = ( inc_t )(*incx);
-    }
+    // Definition of function pointer
+    sscalv_ker_ft scalv_ker_ptr;

    cntx_t *cntx = NULL;

    // Query the architecture ID
    arch_t id = bli_arch_query_id();

-    /*
-      Function pointer declaration for the function
-      that will be used by this API
-    */
-    sscalv_ker_ft scalv_ker_ptr; // DSCALV
-
    // Pick the kernel based on the architecture ID
    switch (id)
    {
-      case BLIS_ARCH_ZEN5:
-      case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN5:
+        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-        scalv_ker_ptr = bli_sscalv_zen_int_avx512;
-
-        break;
+          scalv_ker_ptr = bli_sscalv_zen_int_avx512;
+          break;
 #endif
-      case BLIS_ARCH_ZEN:
-      case BLIS_ARCH_ZEN2:
-      case BLIS_ARCH_ZEN3:
-        scalv_ker_ptr = bli_sscalv_zen_int10;
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:
+          scalv_ker_ptr = bli_sscalv_zen_int10;
+          break;

-        break;
-      default:
+        default:

-        // For non-Zen architectures, query the context
-        cntx = bli_gks_query_cntx();
+          // For non-Zen architectures, query the context
+          cntx = bli_gks_query_cntx();

-        // Query the context for the kernel function pointers for sscalv
-        scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
+          // Query the context for the kernel function pointers for sscalv
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
    }

+    // Invoke the function based on the kernel function pointer
+    // Pass size as negative to stipulate don't use SETV when alpha=0
    scalv_ker_ptr
    (
      BLIS_NO_CONJUGATE,
-      n0,
+      -n0,
      (float *)alpha,
      x0, incx0,
      cntx
    );

-    /* Finalize BLIS. */
-    //    bli_finalize_auto();
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 }
 #ifdef BLIS_ENABLE_BLAS
 void sscal_
@@ -224,7 +204,7 @@ void sscal_
       float*   x, const f77_int* incx
     )
 {
-  sscal_blis_impl( n, alpha, x, incx );
+    sscal_blis_impl( n, alpha, x, incx );
 }
 #endif
 void dscal_blis_impl
@@ -236,65 +216,54 @@ void dscal_blis_impl
 {
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
-    dim_t  n_elem;
-#ifdef BLIS_ENABLE_OPENMP
-    dim_t  ST_THRESH = 30000;
-#endif
-    double* x0;
-    inc_t  incx0;

-    /* Initialize BLIS  */
+    /* Initialize BLIS. */
    //bli_init_auto();

-    /* Convert typecast negative values of n to zero. */
-    if ( *n < 0 ) n_elem = ( dim_t )0;
-    else          n_elem = ( dim_t )(*n);
+    dim_t n0 = (dim_t)(*n);
+    double *x0 = x;
+    inc_t incx0 = (inc_t)(*incx);

    /*
      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
      Return early when alpha pointer is NULL - BLIS exception
    */
-    if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha))
+    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
        return;
    }

-    /* If the input increments are less than or equal to zero, return. */
-    if ( (*incx) <= 0 )
-    {
-      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-      return ;
-    }
-    else
-    {
-      x0    = (x);
-      incx0 = ( inc_t )(*incx);
-    }
-
-     // Definition of function pointer
+    // Definition of function pointer
    dscalv_ker_ft scalv_ker_ptr;

    cntx_t *cntx = NULL;

+#ifdef BLIS_ENABLE_OPENMP
+    dim_t ST_THRESH = 30000;
+#endif
+
    // Query the architecture ID
-    arch_t arch_id_local = bli_arch_query_id();
+    arch_t id = bli_arch_query_id();

    // Pick the kernel based on the architecture ID
-    switch (arch_id_local)
+    switch (id)
    {
-      case BLIS_ARCH_ZEN5:
-      case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN5:
+        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-        scalv_ker_ptr = bli_dscalv_zen_int_avx512;
+          // AVX512 Kernel
+          scalv_ker_ptr = bli_dscalv_zen_int_avx512;
  #ifdef BLIS_ENABLE_OPENMP
-        ST_THRESH = 30000;
+          ST_THRESH = 30000;
  #endif
-        break;
+          break;
 #endif
-      case BLIS_ARCH_ZEN:
-      case BLIS_ARCH_ZEN2:
-      case BLIS_ARCH_ZEN3:
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:

          // AVX2 Kernel
          scalv_ker_ptr = bli_dscalv_zen_int10;
@@ -303,9 +272,9 @@ void dscal_blis_impl
 #endif
          break;

-      default:
+        default:

-          // Query the context
+          // For non-Zen architectures, query the context
          cntx = bli_gks_query_cntx();

          // Query the function pointer using the context
@@ -315,25 +284,28 @@ void dscal_blis_impl

 #ifdef BLIS_ENABLE_OPENMP
    /*
-      If the optimial number of threads is 1, the OpenMP and
-      'bli_nthreads_l1'overheads are avoided by calling the
+      If the optimal number of threads is 1, the OpenMP and
+      'bli_nthreads_l1' overheads are avoided by calling the
      function directly. This ensures that performance of dscalv
      does not drop for single  thread when OpenMP is enabled.
    */
-    if (n_elem <= ST_THRESH)
+    if (n0 <= ST_THRESH)
    {
 #endif
+        // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
        scalv_ker_ptr
        (
          BLIS_NO_CONJUGATE,
-          n_elem,
+          -n0,
          (double *)alpha,
          x0, incx0,
          cntx
        );

        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
        return;
 #ifdef BLIS_ENABLE_OPENMP
    }
@@ -354,14 +326,14 @@ void dscal_blis_impl
      BLIS_SCALV_KER,
      BLIS_DOUBLE,
      BLIS_DOUBLE,
-      arch_id_local,
-      n_elem,
+      id,
+      n0,
      &nt
    );

    _Pragma("omp parallel num_threads(nt)")
    {
-        dim_t start, end, length; 
+        dim_t start, end, length;
        thrinfo_t thrinfo_vec;

        // The block size is the minimum factor, whose multiple will ensure that only
@@ -383,7 +355,7 @@ void dscal_blis_impl
        bli_thread_range_sub
        (
          &thrinfo_vec,
-          n_elem,
+          n0,
          block_size,
          FALSE,
          &start,
@@ -396,22 +368,21 @@ void dscal_blis_impl
        double *x_thread_local = x0 + (start * incx0);

        // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
        scalv_ker_ptr
        (
          BLIS_NO_CONJUGATE,
-          length,
+          -length,
          (double *)alpha,
          x_thread_local, incx0,
          cntx
        );
    }

-    /* Finalize BLIS. */
-    // bli_finalize_auto();
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 #endif
-
 }
 #ifdef BLIS_ENABLE_BLAS
 void dscal_
@@ -421,7 +392,7 @@ void dscal_
       double*   x, const f77_int* incx
     )
 {
-  dscal_blis_impl( n, alpha, x, incx );
+    dscal_blis_impl( n, alpha, x, incx );
 }
 #endif
 void zdscal_blis_impl
@@ -433,19 +404,23 @@ void zdscal_blis_impl
 {
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *) alpha, *n, *incx );
-    dim_t  n_elem = (dim_t)(*n);
-    dcomplex* x0 = x;
-    inc_t  incx0 = (inc_t)(*incx);
+
    /* Initialize BLIS. */
    //bli_init_auto();

+    dim_t  n0 = (dim_t)(*n);
+    dcomplex* x0 = x;
+    inc_t  incx0 = (inc_t)(*incx);
+
    /*
-        When n is zero or the alpha pointer passed is null
-        or the incx is zero or alpha is 1, return early.
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
    */
-    if ((n_elem <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
+    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
        return;
    }

@@ -458,30 +433,34 @@ void zdscal_blis_impl

    cntx_t *cntx = NULL;

+#ifdef BLIS_ENABLE_OPENMP
+    dim_t ST_THRESH = 10000;
+#endif
+
    // Query the architecture ID
-    arch_t arch_id_local = bli_arch_query_id();
+    arch_t id = bli_arch_query_id();

    // Pick the kernel based on the architecture ID
-    switch (arch_id_local)
+    switch (id)
    {
-      case BLIS_ARCH_ZEN5:
-      case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN5:
+        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
          // AVX512 Kernel
          scalv_ker_ptr = bli_zdscalv_zen_int_avx512;
          break;
 #endif
-      case BLIS_ARCH_ZEN:
-      case BLIS_ARCH_ZEN2:
-      case BLIS_ARCH_ZEN3:
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:

          // AVX2 Kernel
          scalv_ker_ptr = bli_zdscalv_zen_int10;
          break;

-      default:
+        default:

-          // Query the context
+          // For non-Zen architectures, query the context
          cntx = bli_gks_query_cntx();

          // Query the function pointer using the context
@@ -489,6 +468,32 @@ void zdscal_blis_impl
    }

 #ifdef BLIS_ENABLE_OPENMP
+    /*
+      If the optimal number of threads is 1, the OpenMP and
+      'bli_nthreads_l1' overheads are avoided by calling the
+      function directly. This ensures that performance of dscalv
+      does not drop for single  thread when OpenMP is enabled.
+    */
+    if (n0 <= ST_THRESH)
+    {
+#endif
+        // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
+        scalv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          -n0,
+          (dcomplex *)&alpha_cast,
+          x0, incx0,
+          cntx
+        );
+
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
+        return;
+#ifdef BLIS_ENABLE_OPENMP
+    }

    /*
      Initializing the number of thread to one
@@ -506,33 +511,11 @@ void zdscal_blis_impl
      BLIS_SCALV_KER,
      BLIS_DCOMPLEX,
      BLIS_DOUBLE,
-      arch_id_local,
-      n_elem,
+      id,
+      n0,
      &nt
    );

-    /*
-      If the number of optimum threads is 1, the OpenMP overhead
-      is avoided by calling the function directly
-    */
-    if (nt == 1)
-    {
-#endif
-        scalv_ker_ptr
-        (
-          BLIS_NO_CONJUGATE,
-          n_elem,
-          (dcomplex *)&alpha_cast,
-          x0, incx0,
-          cntx
-        );
-
-        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-
-        return;
-#ifdef BLIS_ENABLE_OPENMP
-    }
-
    _Pragma("omp parallel num_threads(nt)")
    {
        dim_t start, length;
@@ -549,7 +532,7 @@ void zdscal_blis_impl
        */
        bli_thread_vector_partition
        (
-          n_elem,
+          n0,
          nt_use,
          &start, &length,
          thread_id
@@ -559,18 +542,21 @@ void zdscal_blis_impl
        dcomplex *x_thread_local = x0 + (start * incx0);

        // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
        scalv_ker_ptr
        (
          BLIS_NO_CONJUGATE,
-          length,
+          -length,
          (dcomplex *)&alpha_cast,
          x_thread_local, incx0,
          cntx
        );
    }
-#endif

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
+#endif
 }
 #ifdef BLIS_ENABLE_BLAS
 void zdscal_
@@ -594,22 +580,27 @@ void cscal_blis_impl
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', (void *)alpha, *n, *incx);

+    /* Initialize BLIS. */
+    //bli_init_auto();
+
    dim_t n0 = (dim_t)(*n);
    scomplex *x0 = x;
    inc_t incx0 = (inc_t)(*incx);

    /*
-        When n is zero or the alpha pointer passed is null
-        or the incx is zero or alpha is 1, return early.
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
    */
    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha))
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
        return;
    }

    // Definition of function pointer
-    cscalv_ker_ft scalv_fun_ptr;
+    cscalv_ker_ft scalv_ker_ptr;

    cntx_t* cntx = NULL;

@@ -622,40 +613,42 @@ void cscal_blis_impl
        case BLIS_ARCH_ZEN5:
        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-            // AVX512 Kernel
-            scalv_fun_ptr = bli_cscalv_zen_int_avx512;
-            break;
+          // AVX512 Kernel
+          scalv_ker_ptr = bli_cscalv_zen_int_avx512;
+          break;
 #endif
        case BLIS_ARCH_ZEN:
        case BLIS_ARCH_ZEN2:
        case BLIS_ARCH_ZEN3:

-            //   AVX2 Kernel
-            scalv_fun_ptr = bli_cscalv_zen_int;
-            break;
+          // AVX2 Kernel
+          scalv_ker_ptr = bli_cscalv_zen_int;
+          break;

        default:

-          // Query the context
+          // For non-Zen architectures, query the context
          cntx = bli_gks_query_cntx();

          // Query the function pointer using the context
-          scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
    }

-    // Call the function based on the function pointer assigned above
-    scalv_fun_ptr
+    // Invoke the function based on the kernel function pointer
+    // Pass size as negative to stipulate don't use SETV when alpha=0
+    scalv_ker_ptr
    (
      BLIS_NO_CONJUGATE,
-      n0,
+      -n0,
      (scomplex*) alpha,
      x0, incx0,
      cntx
    );

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 }
-
 #ifdef BLIS_ENABLE_BLAS
 void cscal_
     (
@@ -678,22 +671,27 @@ void zscal_blis_impl
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
    AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx);

+    /* Initialize BLIS. */
+    //bli_init_auto();
+
    dim_t n0 = (dim_t)(*n);
    dcomplex *x0 = x;
    inc_t incx0 = (inc_t)(*incx);

    /*
-        When n is zero or the alpha pointer passed is null
-        or the incx is zero or alpha is 1, return early.
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
    */
    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha))
    {
        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
        return;
    }

    // Definition of function pointer
-    zscalv_ker_ft scalv_fun_ptr;
+    zscalv_ker_ft scalv_ker_ptr;

    cntx_t* cntx = NULL;

@@ -707,7 +705,7 @@ void zscal_blis_impl
        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
          // AVX512 Kernel
-          scalv_fun_ptr = bli_zscalv_zen_int_avx512;
+          scalv_ker_ptr = bli_zscalv_zen_int_avx512;
          break;
 #endif
        case BLIS_ARCH_ZEN:
@@ -715,29 +713,32 @@ void zscal_blis_impl
        case BLIS_ARCH_ZEN3:

          // AVX2 Kernel
-          scalv_fun_ptr = bli_zscalv_zen_int;
+          scalv_ker_ptr = bli_zscalv_zen_int;
          break;

        default:

-          // Query the context
+          // For non-Zen architectures, query the context
          cntx = bli_gks_query_cntx();

          // Query the function pointer using the context
-          scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
    }

-    // Call the function based on the function pointer assigned above
-    scalv_fun_ptr
+    // Invoke the function based on the kernel function pointer
+    // Pass size as negative to stipulate don't use SETV when alpha=0
+    scalv_ker_ptr
    (
      BLIS_NO_CONJUGATE,
-      n0,
+      -n0,
      (dcomplex*) alpha,
      x0, incx0,
      cntx
    );

    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 }
 #ifdef BLIS_ENABLE_BLAS
 void zscal_
@@ -751,4 +752,4 @@ void zscal_
 }
 #endif

-GENTFUNCSCAL( scomplex, float, c, s, scal, scalv )
+GENTFUNCSCAL( scomplex, float, c, s, s, scal, scalv )
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -174,12 +174,12 @@ GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname )

 #define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
 \
-GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
-GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )
+GENTFUNCSCAL( float,    float,    s,  , s, blasname, blisname ) \
+GENTFUNCSCAL( double,   double,   d,  , d, blasname, blisname ) \
+GENTFUNCSCAL( scomplex, scomplex, c,  , c, blasname, blisname ) \
+GENTFUNCSCAL( dcomplex, dcomplex, z,  , z, blasname, blisname ) \
+GENTFUNCSCAL( scomplex, float,    c, s, s, blasname, blisname ) \
+GENTFUNCSCAL( dcomplex, double,   z, d, d, blasname, blisname )

 // --GEMMT specific kernels ----------------------------------------------------

--- a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp
@@ -36,10 +36,10 @@
 #include "test_scalv.h"

 class cscalvGeneric :
-        public ::testing::TestWithParam<std::tuple<char,
-                                                   gtint_t,
-                                                   gtint_t,
-                                                   scomplex>> {};
+        public ::testing::TestWithParam<std::tuple<char,            // conj_alpha
+                                                   gtint_t,         // n
+                                                   gtint_t,         // incx
+                                                   scomplex>> {};   // alpha


 // Tests using random integers as vector elements.
@@ -78,42 +78,140 @@ TEST_P( cscalvGeneric, API )
    test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
 }

-// Black box testing for generic and main use of cscal.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        Blackbox,
+        unitPositiveIncrementSmall,
        cscalvGeneric,
        ::testing::Combine(
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),                                                               // n: use x, c: use conj(x)
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(1)),                                   // stride size for x
-            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0})      // alpha
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
        ),
        (::scalvGenericPrint<scomplex, scomplex>())
    );

+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );

-// Test for non-unit increments.
-// Only test very few cases as sanity check.
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{ 0.0,  0.0}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );
+#endif
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
 // We can modify the values using implementantion details.
 INSTANTIATE_TEST_SUITE_P(
-        NonUnitPositiveIncrements,
+        conjalpha,
        cscalvGeneric,
        ::testing::Combine(
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),                                                               // n: use x, c: use conj(x)
-            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(2), gtint_t(11)),                      //(gtint_t(-5), gtint_t(-17)) // stride size for x
-            ::testing::Values(scomplex{4.0, 3.1})                            // alpha
+            ::testing::Values('c'),                                          // c: use conjugate
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(scomplex{ 7.3,  5.1})                          // alpha
        ),
        (::scalvGenericPrint<scomplex, scomplex>())
    );
+#endif

 #ifndef TEST_BLIS_TYPED
 // Test for negative increments.
@@ -126,7 +224,7 @@ INSTANTIATE_TEST_SUITE_P(
            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
-            ::testing::Values(scomplex{4.0, 3.1})                            // alpha
+            ::testing::Values(scomplex{ 7.3,  5.1})                          // alpha
        ),
        (::scalvGenericPrint<scomplex, scomplex>())
    );
--- a/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp
@@ -0,0 +1,219 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scalv.h"
+
+class csscalvGeneric :
+        public ::testing::TestWithParam<std::tuple<char,        // conj_alpha
+                                                   gtint_t,     // n
+                                                   gtint_t,     // incx
+                                                   float>> {}; // alpha
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csscalvGeneric);
+
+// Tests using random integers as vector elements.
+TEST_P( csscalvGeneric, API )
+{
+    using T = scomplex;
+    using U = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether alpha or conj(alpha) will be used:
+    char conj_alpha = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // alpha
+    U alpha = std::get<3>(GetParam());
+
+    // Set the threshold for the errors:
+    // Check gtestsuite scalv.h or netlib source code for reminder of the
+    // functionality from which we estimate operation count per element
+    // of output, and hence the multipler for epsilon.
+    // No adjustment applied yet for complex data.
+    double thresh;
+    if (n == 0)
+        thresh = 0.0;
+    else if (alpha == testinghelpers::ZERO<U>() || alpha == testinghelpers::ONE<U>())
+        thresh = 0.0;
+    else
+        thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_scalv<T, U>( conj_alpha, n, incx, alpha, thresh );
+}
+
+// bli_csscal not present in BLIS
+#ifndef TEST_BLIS_TYPED
+
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementSmall,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        csscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(3)                                             // alpha
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+#endif // not TEST_BLIS_TYPED
+
+
+
+
+
+
--- a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp
@@ -79,20 +79,41 @@ TEST_P( dscalvGeneric, API )

 // Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        unitPositiveIncrement,
+        unitPositiveIncrementSmall,
        dscalvGeneric,
        ::testing::Combine(
            // conj(alpha): uses n (no_conjugate) since it is real.
            ::testing::Values('n'),
            // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
            // incx: stride of x vector.
            ::testing::Values(
                                gtint_t(1)
            ),
            // alpha: value of scalar.
            ::testing::Values(
-                                double( 0.0),
                                double( 7.0),
                                double(-3.0)
            )
@@ -101,21 +122,20 @@ INSTANTIATE_TEST_SUITE_P(
    );

 INSTANTIATE_TEST_SUITE_P(
-        nonUnitPositiveIncrement,
+        nonUnitPositiveIncrementSmall,
        dscalvGeneric,
        ::testing::Combine(
            // conj(alpha): uses n (no_conjugate) since it is real.
            ::testing::Values('n'),
            // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
            // incx: stride of x vector.
            ::testing::Values(
                                gtint_t(2),
-                                gtint_t(3)
+                                gtint_t(41)
            ),
            // alpha: value of scalar.
            ::testing::Values(
-                                double( 0.0),
                                double( 7.0),
                                double(-3.0)
            )
@@ -123,6 +143,54 @@ INSTANTIATE_TEST_SUITE_P(
        (::scalvGenericPrint<double, double>())
    );

+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+#endif
+
 #ifdef TEST_BLIS_TYPED
 // Test when conjugate of x is used as an argument. This option is BLIS-api specific.
 // Only test very few cases as sanity check since conj(x) = x for real types.
@@ -140,6 +208,23 @@ INSTANTIATE_TEST_SUITE_P(
    );
 #endif

+#ifndef TEST_BLIS_TYPED
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        dscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(3)                                             // alpha
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+#endif
+
 #if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
 INSTANTIATE_TEST_SUITE_P(
        AOCLDynamic,
@@ -151,6 +236,7 @@ INSTANTIATE_TEST_SUITE_P(
            ::testing::Values(
                               gtint_t(   30000),     // nt_ideal = 1
                               gtint_t(  100000),     // nt_ideal = 2
+                               gtint_t(  486919),     // nt_ideal = 8
                               gtint_t(  500000),     // nt_ideal = 8
                               gtint_t( 2500000),     // nt_ideal = 12
                               gtint_t( 4000000),     // nt_ideal = 16
@@ -160,7 +246,8 @@ INSTANTIATE_TEST_SUITE_P(
            ),
            // incx: stride of x vector.
            ::testing::Values(
-                                gtint_t(1)
+                                gtint_t(1),
+                                gtint_t(3)
            ),
            // alpha: value of scalar.
            ::testing::Values(
@@ -169,4 +256,34 @@ INSTANTIATE_TEST_SUITE_P(
        ),
        (::scalvGenericPrint<double, double>())
    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        AOCLDynamicAlphaZero,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(
+                               gtint_t(  89),     // nt_ideal = 8
+                               gtint_t(  486919),     // nt_ideal = 8
+                               gtint_t(25000000)      // nt_ideal = max_available
+            ),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(3)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
 #endif
+
+#endif // BLIS_ENABLE_OPENMP && AOCL_DYNAMIC
--- a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp
+++ b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp
@@ -1,117 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023 - 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <gtest/gtest.h>
-#include "test_scalv.h"
-
-template <typename T>
-class scalv_EIC : public ::testing::Test {};
-typedef ::testing::Types<float, double> TypeParam;
-TYPED_TEST_SUITE(scalv_EIC, TypeParam);
-
-TYPED_TEST(scalv_EIC, zero_alpha_x_fp)
-{
-    using T = TypeParam;
-    gtint_t n = 10, incx = 1;
-    std::vector<T> x(n);
-    // Initialize x with random numbers.
-    testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
-    std::vector<T> x_ref(x);
-    T alpha = T{0};
-
-    testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
-    //----------------------------------------------------------
-    //                  Call BLIS function.
-    //----------------------------------------------------------
-    scalv<T>('n', n, alpha, x.data(), incx);
-
-    //----------------------------------------------------------
-    //              Compute component-wise error.
-    //----------------------------------------------------------
-    // Set the threshold for the errors:
-    // Check gtestsuite scalv.h or netlib source code for reminder of the
-    // functionality from which we estimate operation count per element
-    // of output, and hence the multipler for epsilon.
-    double thresh;
-    if (n == 0)
-        thresh = 0.0;
-    else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
-        thresh = 0.0;
-    else
-        thresh = testinghelpers::getEpsilon<T>();
-
-    //----------------------------------------------------------
-    //     Call generic test body using those parameters
-    //----------------------------------------------------------
-    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
-}
-
-TYPED_TEST(scalv_EIC, zero_alpha_x_inf)
-{
-    using T = TypeParam;
-    gtint_t n = 10, incx = 1;
-    std::vector<T> x(n);
-    // Initialize x with random numbers.
-    testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
-    x[3] = 1.0/0.0;
-    std::vector<T> x_ref(x);
-    T alpha = T{0};
-    testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
-
-    //----------------------------------------------------------
-    //                  Call BLIS function.
-    //----------------------------------------------------------
-    scalv<T>('n', n, alpha, x.data(), incx);
-
-    //----------------------------------------------------------
-    //              Compute component-wise error.
-    //----------------------------------------------------------
-    // Set the threshold for the errors:
-    // Check gtestsuite scalv.h or netlib source code for reminder of the
-    // functionality from which we estimate operation count per element
-    // of output, and hence the multipler for epsilon.
-    // No adjustment applied yet for complex data.
-    double thresh;
-    if (n == 0)
-        thresh = 0.0;
-    else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
-        thresh = 0.0;
-    else
-        thresh = testinghelpers::getEpsilon<T>();
-
-    //----------------------------------------------------------
-    //     Call generic test body using those parameters
-    //----------------------------------------------------------
-    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
-}
--- a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp
@@ -63,7 +63,7 @@ TEST_P( sscalvGeneric, API )
    // Check gtestsuite scalv.h or netlib source code for reminder of the
    // functionality from which we estimate operation count per element
    // of output, and hence the multipler for epsilon.
-    double thresh;
+    float thresh;
    if (n == 0)
        thresh = 0.0;
    else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
@@ -77,19 +77,120 @@ TEST_P( sscalvGeneric, API )
    test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
 }

-// Black box testing for generic and main use of sscal.
+// Black box testing for generic use of sscal.
 INSTANTIATE_TEST_SUITE_P(
-        Blackbox,
+        unitPositiveIncrementSmall,
        sscalvGeneric,
        ::testing::Combine(
-            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(1)),                                   // stride size for x
-            ::testing::Values(float(3.0), float(-5.0))                       // alpha
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
        ),
        (::scalvGenericPrint<float, float>())
    );

+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(17), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+#endif
+
 #ifdef TEST_BLIS_TYPED
 // Test when conjugate of x is used as an argument. This option is BLIS-api specific.
 // Only test very few cases as sanity check since conj(x) = x for real types.
@@ -101,28 +202,12 @@ INSTANTIATE_TEST_SUITE_P(
            ::testing::Values('c'),                                          // c: use conjugate
            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
            ::testing::Values(gtint_t(1)),                                   // stride size for x
-            ::testing::Values(float(9.0))                                    // alpha
+            ::testing::Values(float(-3.0))                                  // alpha
        ),
        (::scalvGenericPrint<float, float>())
    );
 #endif

-// Test for non-unit increments.
-// Only test very few cases as sanity check.
-// We can modify the values using implementantion details.
-INSTANTIATE_TEST_SUITE_P(
-        NonUnitPositiveIncrements,
-        sscalvGeneric,
-        ::testing::Combine(
-            ::testing::Values('n'),                                          // n: use x
-            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(2), gtint_t(11)),                      //(gtint_t(-5), gtint_t(-17)) // stride size for x
-            ::testing::Values(float(2.0))                                    // alpha
-        ),
-        (::scalvGenericPrint<float, float>())
-    );
-
-
 #ifndef TEST_BLIS_TYPED
 // Test for negative increments.
 // Only test very few cases as sanity check.
--- a/gtestsuite/testsuite/level1/scalv/test_scalv.h
+++ b/gtestsuite/testsuite/level1/scalv/test_scalv.h
@@ -48,6 +48,8 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
    //        Initialize vector with random numbers.
    //----------------------------------------------------------
    std::vector<T> x = testinghelpers::get_random_vector<T>( -10, 10, n, incx );
+    if (alpha == testinghelpers::ZERO<U>())
+        testinghelpers::set_vector( n, incx, x.data(), testinghelpers::aocl_extreme<T>() );

    //----------------------------------------------------------
    //    Call reference implementation to get ref results.
@@ -64,7 +66,7 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
    //----------------------------------------------------------
    //              Compute component-wise error.
    //----------------------------------------------------------
-    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh );
+    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
 }

 /**
--- a/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp
@@ -82,59 +82,187 @@ TEST_P( zdscalvGeneric, API )

 // bli_zdscal not present in BLIS
 #ifndef TEST_BLIS_TYPED
-// Black box testing for zdscal.
-// Tests with unit-positive increment.
+
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        unitPositiveIncrement,
+        unitPositiveIncrementSmall,
        zdscalvGeneric,
        ::testing::Combine(
            // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-                            , 'c'       // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
            // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
            // incx: stride of x vector.
-            ::testing::Values(gtint_t(1)),
+            ::testing::Values(
+                                gtint_t(1)
+            ),
            // alpha: value of scalar.
            ::testing::Values(
-                                double(-5.1),
-                                double( 0.0),
-                                double( 7.3)
+                                double( 7.0),
+                                double(-3.0)
            )
        ),
-        (::scalvGenericPrint<dcomplex, double>())
+        (::scalvGenericPrint<double, double>())
    );

-
-// Tests for non-unit increments.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        nonUnitPositiveIncrement,
+        unitPositiveIncrementLarge,
        zdscalvGeneric,
        ::testing::Combine(
            // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-                            , 'c'       // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
            // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
            // incx: stride of x vector.
            ::testing::Values(
                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zdscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(3)                                             // alpha
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
+INSTANTIATE_TEST_SUITE_P(
+        AOCLDynamic,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(
+                               gtint_t(   10000),     // nt_ideal = 1
+                               gtint_t(   20000),     // nt_ideal = 4
+                               gtint_t(  486919),     // nt_ideal = 8
+                               gtint_t( 1000000),     // nt_ideal = 8
+                               gtint_t( 2500000),     // nt_ideal = 12
+                               gtint_t( 5000000),     // nt_ideal = 32
+                               gtint_t( 7000000)      // nt_ideal = max_available
+            ),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
                                gtint_t(3)
            ),
            // alpha: value of scalar.
            ::testing::Values(
-                                double(-5.1),
-                                double( 0.0),
-                                double( 7.3)
+                                double( 7.0)
            )
        ),
-        (::scalvGenericPrint<dcomplex, double>())
+        (::scalvGenericPrint<double, double>())
    );

+INSTANTIATE_TEST_SUITE_P(
+        AOCLDynamicAlphaZero,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(
+                               gtint_t(  486919),     // nt_ideal = 8
+                               gtint_t( 7000000)      // nt_ideal = max_available
+            ),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(3)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+#endif
+
 #endif // not TEST_BLIS_TYPED
--- a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp
@@ -78,26 +78,22 @@ TEST_P( zscalvGeneric, API )
    test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
 }

-// Black box testing for zscal.
-// Tests with unit-positive increment.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        unitPositiveIncrement,
+        unitPositiveIncrementSmall,
        zscalvGeneric,
        ::testing::Combine(
            // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
            // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
            // incx: stride of x vector.
-            ::testing::Values(gtint_t(1)),
+            ::testing::Values(
+                                gtint_t(1)
+            ),
            // alpha: value of scalar.
            ::testing::Values(
                                dcomplex{-5.1, -7.3},
-                                dcomplex{ 0.0,  0.0},
                                dcomplex{ 1.0,  1.0},
                                dcomplex{ 7.3,  5.1}
            )
@@ -105,32 +101,131 @@ INSTANTIATE_TEST_SUITE_P(
        (::scalvGenericPrint<dcomplex, dcomplex>())
    );

-
-// Test for non-unit increments.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        nonUnitPositiveIncrement,
+        unitPositiveIncrementLarge,
        zscalvGeneric,
        ::testing::Combine(
            // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
            // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                dcomplex{-5.1, -7.3},
+                                dcomplex{ 1.0,  1.0},
+                                dcomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        zscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
            // incx: stride of x vector.
            ::testing::Values(
                                gtint_t(2),
-                                gtint_t(3)
+                                gtint_t(41)
            ),
            // alpha: value of scalar.
            ::testing::Values(
                                dcomplex{-5.1, -7.3},
-                                dcomplex{ 0.0,  0.0},
                                dcomplex{ 1.0,  1.0},
                                dcomplex{ 7.3,  5.1}
            )
        ),
        (::scalvGenericPrint<dcomplex, dcomplex>())
    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        zscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                dcomplex{-5.1, -7.3},
+                                dcomplex{ 1.0,  1.0},
+                                dcomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        zscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                dcomplex{ 0.0,  0.0}
+            )
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+#endif
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        conjalpha,
+        zscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conjugate
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(dcomplex{ 7.3,  5.1})                          // alpha
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+#endif
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(dcomplex{ 7.3,  5.1})                          // alpha
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+#endif
--- a/kernels/zen/1/bli_scalv_zen_int.c
+++ b/kernels/zen/1/bli_scalv_zen_int.c
@@ -4,7 +4,7 @@
   An object-based framework for developing high-performance BLAS-like
   libraries.

-   Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2017 - 2024, Advanced Micro Devices, Inc. All rights reserved.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
@@ -80,9 +80,11 @@ void bli_sscalv_zen_int
 	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;

 	// If alpha is zero, use setv (in case y contains NaN or Inf).
-	if ( PASTEMAC(s,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
 	{
 		float*       zero = bli_s0;
+		if (cntx == NULL) cntx = bli_gks_query_cntx();
 		ssetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );

 		f
@@ -96,10 +98,12 @@ void bli_sscalv_zen_int
 		return;
 	}

+	dim_t n0 = bli_abs(n);
+
 	// Use the unrolling factor and the number of elements per register
 	// to compute the number of vectorized and leftover iterations.
-	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
-	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+	n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );

 	// If there is anything that would interfere with our use of contiguous
 	// vector loads/stores, override n_viter and n_left to use scalar code
@@ -107,7 +111,7 @@ void bli_sscalv_zen_int
 	if ( incx != 1 )
 	{
 		n_viter = 0;
-		n_left  = n;
+		n_left  = n0;
 	}

 	// Initialize local pointers.
@@ -178,10 +182,11 @@ void bli_dscalv_zen_int
 	// If the vector dimension is zero, or if alpha is unit, return early.
 	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;

-	// If alpha is zero, use setv (in case y contains NaN or Inf).
-	if ( PASTEMAC(d,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
 	{
 		double*      zero = bli_d0;
+		if (cntx == NULL) cntx = bli_gks_query_cntx();
 		dsetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );

 		f
@@ -195,10 +200,12 @@ void bli_dscalv_zen_int
 		return;
 	}

+	dim_t n0 = bli_abs(n);
+
 	// Use the unrolling factor and the number of elements per register
 	// to compute the number of vectorized and leftover iterations.
-	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
-	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+	n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );

 	// If there is anything that would interfere with our use of contiguous
 	// vector loads/stores, override n_viter and n_left to use scalar code
@@ -206,7 +213,7 @@ void bli_dscalv_zen_int
 	if ( incx != 1 )
 	{
 		n_viter = 0;
-		n_left  = n;
+		n_left  = n0;
 	}

 	// Initialize local pointers.
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -60,8 +60,8 @@ void bli_sscalv_zen_int10
 	// If the vector dimension is zero, or if alpha is unit, return early.
 	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;

-	// If alpha is zero, use setv.
-	if ( PASTEMAC(s,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
 	{
 		float* zero = bli_s0;
 		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
@@ -78,6 +78,8 @@ void bli_sscalv_zen_int10
 		return;
 	}

+	dim_t n0 = bli_abs(n);
+
 	// Initialize local pointers.
 	x0 = x;

@@ -88,11 +90,11 @@ void bli_sscalv_zen_int10
 		dim_t option;

 		// Unroll and the loop used is picked based on the input size.
-		if( n < 300)
+		if( n0 < 300)
 		{
 			option = 2;
 		}
-		else if( n < 500)
+		else if( n0 < 500)
 		{
 			option = 1;
 		}
@@ -105,7 +107,7 @@ void bli_sscalv_zen_int10
 		{
 			case 0:

-				for ( ; (i + 127) < n; i += 128 )
+				for ( ; (i + 127) < n0; i += 128 )
 				{
 					//Load the input values
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -175,7 +177,7 @@ void bli_sscalv_zen_int10
 	
 			case 1 :

-				for ( ; (i + 95) < n; i += 96 )
+				for ( ; (i + 95) < n0; i += 96 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -227,7 +229,7 @@ void bli_sscalv_zen_int10

 			case 2:
 		
-				for ( ; (i + 47) < n; i += 48 )
+				for ( ; (i + 47) < n0; i += 48 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -256,7 +258,7 @@ void bli_sscalv_zen_int10
 					x0 += 6*n_elem_per_reg;
 				}

-				for ( ; (i + 23) < n; i += 24 )
+				for ( ; (i + 23) < n0; i += 24 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -273,7 +275,7 @@ void bli_sscalv_zen_int10
 					x0 += 3*n_elem_per_reg;
 				}

-				for ( ; (i + 7) < n; i += 8 )
+				for ( ; (i + 7) < n0; i += 8 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );

@@ -284,7 +286,7 @@ void bli_sscalv_zen_int10
 					x0 += 1*n_elem_per_reg;
 				}

-				for ( ; (i + 0) < n; i += 1 )
+				for ( ; (i + 0) < n0; i += 1 )
 				{
 					*x0 *= *alpha;

@@ -296,7 +298,7 @@ void bli_sscalv_zen_int10
 	{
 		const float alphac = *alpha;

-		for ( ; i < n; ++i )
+		for ( ; i < n0; ++i )
 		{
 			*x0 *= alphac;

@@ -329,8 +331,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 	// If the vector dimension is zero, or if alpha is unit, return early.
 	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;

-	// If alpha is zero, use setv.
-	if ( PASTEMAC(d,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
 	{
 		double* zero = bli_d0;
 		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
@@ -348,6 +350,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 		return;
 	}

+	dim_t n0 = bli_abs(n);
+
 	// Initialize local pointers.
 	x0 = x;

@@ -358,11 +362,11 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 		dim_t option;		

 		// Unroll and the loop used is picked based on the input size.
-		if(n < 200)
+		if(n0 < 200)
 		{
 			option = 2;
 		}
-		else if(n < 500)
+		else if(n0 < 500)
 		{
 			option = 1;
 		}
@@ -375,7 +379,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 		{
 			case 0:

-				for (; (i + 63) < n; i += 64 )
+				for (; (i + 63) < n0; i += 64 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -440,7 +444,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 					x0 += 16*n_elem_per_reg;
 				}

-				for (; (i + 47) < n; i += 48 )
+				for (; (i + 47) < n0; i += 48 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -492,7 +496,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10

 			case 1:

-				for (; (i + 31) < n; i += 32 )
+				for (; (i + 31) < n0; i += 32 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -529,7 +533,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 				
 			case 2:

-				for ( ; (i + 11) < n; i += 12 )
+				for ( ; (i + 11) < n0; i += 12 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -546,7 +550,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 					x0 += 3*n_elem_per_reg;
 				}

-				for ( ; (i + 3) < n; i += 4 )
+				for ( ; (i + 3) < n0; i += 4 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );

@@ -557,7 +561,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 					x0 += 1*n_elem_per_reg;
 				}

-				for ( ; (i + 0) < n; i += 1 )
+				for ( ; (i + 0) < n0; i += 1 )
 				{
 					*x0 *= *alpha;

@@ -569,7 +573,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 	{
 		const double alphac = *alpha;

-		for ( ; i < n; ++i )
+		for ( ; i < n0; ++i )
 		{
 			*x0 *= alphac;

@@ -587,6 +591,30 @@ void bli_zdscalv_zen_int10
       cntx_t* restrict cntx
     )
 {
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha )) return;
+
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
+	{
+		// Expert interface of setv is invoked when alpha is zero
+		dcomplex *zero = bli_z0;
+
+		/* When alpha is zero all the element in x are set to zero */
+		PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
+		(
+			BLIS_NO_CONJUGATE,
+			n,
+			zero,
+			x, incx,
+			cntx,
+			NULL);
+
+		return;
+	}
+
+	dim_t n0 = bli_abs(n);
+
 	dim_t i = 0;
 	const dim_t n_elem_per_reg = 4;    // number of elements per register

@@ -607,7 +635,7 @@ void bli_zdscalv_zen_int10

 		alphav = _mm256_broadcast_sd( &alphac );

-		for ( ; ( i + 29 ) < n; i += 30 )
+		for ( ; ( i + 29 ) < n0; i += 30 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -660,7 +688,7 @@ void bli_zdscalv_zen_int10
 			x0 += 15 * n_elem_per_reg;
 		}

-		for ( ; ( i + 23 ) < n; i += 24 )
+		for ( ; ( i + 23 ) < n0; i += 24 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -704,7 +732,7 @@ void bli_zdscalv_zen_int10
 			x0 += 12 * n_elem_per_reg;
 		}

-		for ( ; ( i + 15 ) < n; i += 16 )
+		for ( ; ( i + 15 ) < n0; i += 16 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -736,7 +764,7 @@ void bli_zdscalv_zen_int10
 			x0 += 8 * n_elem_per_reg;
 		}

-		for ( ; ( i + 7 ) < n; i += 8 )
+		for ( ; ( i + 7 ) < n0; i += 8 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -756,7 +784,7 @@ void bli_zdscalv_zen_int10
 			x0 += 4 * n_elem_per_reg;
 		}

-		for ( ; ( i + 3 ) < n; i += 4 )
+		for ( ; ( i + 3 ) < n0; i += 4 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -770,7 +798,7 @@ void bli_zdscalv_zen_int10
 			x0 += 2 * n_elem_per_reg;
 		}

-		for ( ; ( i + 1 ) < n; i += 2 )
+		for ( ; ( i + 1 ) < n0; i += 2 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );

@@ -795,7 +823,7 @@ void bli_zdscalv_zen_int10

 	alpha_reg = _mm_set1_pd((*alpha).real);

-	for (; i < n; ++i)
+	for (; i < n0; ++i)
 	{
 		x_vec = _mm_loadu_pd(x0);

@@ -816,24 +844,14 @@ void bli_cscalv_zen_int
 	cntx_t* restrict cntx
 	)
 {
-	/*
-		Undefined behaviour
-		-------------------
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;

-		1. This layer is not BLAS complaint and the kernel results in
-		undefined behaviour when n <= 0 and incx <= 1. The expectation
-		is that the application/higher-layer invoking this layer should
-		the arg checks.
-	*/
-	// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
-	//	return;
-
-	// To Do: This call to SETV needs to be removed for BLAS compliance
-	// Currently removing this is resulting in ZHERK failures
-	if (PASTEMAC(c, eq0)(*alpha))
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
 	{
 		// Expert interface of setv is invoked when alpha is zero
-		scomplex *zero = PASTEMAC(c, 0);
+		scomplex *zero = bli_c0;

 		/* When alpha is zero all the element in x are set to zero */
 		PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
@@ -848,6 +866,8 @@ void bli_cscalv_zen_int
 		return;
 	}

+	dim_t n0 = bli_abs(n);
+
 	dim_t i = 0;
 	scomplex alpha_conj;
 	float *x0 = (float *)x;
@@ -897,7 +917,7 @@ void bli_cscalv_zen_int
 			and then store
 		*/

-		for (; (i + 15) < n; i += 16)
+		for (; (i + 15) < n0; i += 16)
 		{
 			x_vec_ymm[0] = _mm256_loadu_ps(x0);
 			x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
@@ -927,7 +947,7 @@ void bli_cscalv_zen_int
 			x0 += 4 * n_elem_per_reg;
 		}

-		for (; (i + 7) < n; i += 8)
+		for (; (i + 7) < n0; i += 8)
 		{
 			x_vec_ymm[0] = _mm256_loadu_ps(x0);
 			x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
@@ -947,7 +967,7 @@ void bli_cscalv_zen_int
 			x0 += 2 * n_elem_per_reg;
 		}

-		for (; (i + 3) < n; i += 4)
+		for (; (i + 3) < n0; i += 4)
 		{
 			x_vec_ymm[0] = _mm256_loadu_ps(x0);

@@ -969,7 +989,7 @@ void bli_cscalv_zen_int
 		_mm256_zeroupper();
 	}

-	for (; i < n; i++)
+	for (; i < n0; i++)
 	{
 		float x_real, x_imag;
 		x_real = real * (*x0) - imag * (*(x0 + 1));
@@ -991,24 +1011,14 @@ void bli_zscalv_zen_int
 	cntx_t* restrict cntx
 	)
 {
-	/*
-		Undefined behaviour
-		-------------------
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;

-		1. This layer is not BLAS complaint and the kernel results in
-		undefined behaviour when n <= 0 and incx <= 1. The expectation
-		is that the application/higher-layer invoking this layer should
-		the arg checks.
-	*/
-	// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
-	//	return;
-
-	// To Do: This call to SETV needs to be removed for BLAS compliance
-	// Currently removing this is resulting in ZHERK failures
-	if (PASTEMAC(z, eq0)(*alpha))
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
 	{
 		// Expert interface of setv is invoked when alpha is zero
-		dcomplex *zero = PASTEMAC(z, 0);
+		dcomplex *zero = bli_z0;

 		/* When alpha is zero all the element in x are set to zero */
 		PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
@@ -1023,6 +1033,8 @@ void bli_zscalv_zen_int
 		return;
 	}

+	dim_t n0 = bli_abs(n);
+
 	dim_t i = 0;
 	dcomplex alpha_conj;
 	double *x0 = (double *)x;
@@ -1033,8 +1045,8 @@ void bli_zscalv_zen_int
 	double real = alpha_conj.real;
 	double imag = alpha_conj.imag;

-	/*When incx is 1 and n >= 2 it is possible to use AVX2 instructions*/
-	if (incx == 1 && n >= 2)
+	/*When incx is 1 and n0 >= 2 it is possible to use AVX2 instructions*/
+	if (incx == 1 && n0 >= 2)
 	{
 		dim_t const n_elem_per_reg = 4;

@@ -1072,7 +1084,7 @@ void bli_zscalv_zen_int
 			and then store
 		*/

-		for (; (i + 7) < n; i += 8)
+		for (; (i + 7) < n0; i += 8)
 		{
 			x_vec_ymm[0] = _mm256_loadu_pd(x0);
 			x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
@@ -1106,7 +1118,7 @@ void bli_zscalv_zen_int
 			x0 += 4 * n_elem_per_reg;
 		}

-		for (; (i + 3) < n; i += 4)
+		for (; (i + 3) < n0; i += 4)
 		{
 			x_vec_ymm[0] = _mm256_loadu_pd(x0);
 			x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
@@ -1126,7 +1138,7 @@ void bli_zscalv_zen_int
 			x0 += 2 * n_elem_per_reg;
 		}

-		for (; (i + 1) < n; i += 2)
+		for (; (i + 1) < n0; i += 2)
 		{
 			x_vec_ymm[0] = _mm256_loadu_pd(x0);

@@ -1155,7 +1167,7 @@ void bli_zscalv_zen_int
 	alpha_real_xmm = _mm_set1_pd(real);
 	alpha_imag_xmm = _mm_set1_pd(imag);

-	for (; i < n; i++)
+	for (; i < n0; i++)
 	{
 		x_vec_xmm = _mm_loadu_pd(x0);

--- a/kernels/zen4/1/bli_scalv_zen_int_avx512.c
+++ b/kernels/zen4/1/bli_scalv_zen_int_avx512.c
@@ -61,13 +61,14 @@
    Deviation from BLAS
    --------------------

-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.

    Undefined behaviour
    -------------------

-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_sscalv_zen_int_avx512
        (
@@ -78,6 +79,30 @@ void bli_sscalv_zen_int_avx512
          cntx_t *restrict cntx
        )
 {
+    // If the vector dimension is zero, or if alpha is unit, return early.
+    if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
+
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
+    {
+        float *zero = bli_s0;
+        if (cntx == NULL) cntx = bli_gks_query_cntx();
+        ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SETV_KER, cntx);
+
+        f
+        (
+          BLIS_NO_CONJUGATE,
+          n,
+          zero,
+          x, incx,
+          cntx
+        );
+
+        return;
+    }
+
+    dim_t n0 = bli_abs(n);
+
    dim_t i = 0;
    float *restrict x0 = x;

@@ -89,7 +114,7 @@ void bli_sscalv_zen_int_avx512
        __m512 xv[8], alphav;
        alphav = _mm512_set1_ps(*alpha);

-        for (i = 0; (i + 127) < n; i += 128)
+        for (i = 0; (i + 127) < n0; i += 128)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -125,7 +150,7 @@ void bli_sscalv_zen_int_avx512
            x0 += 8 * n_elem_per_reg;
        }

-        for (; (i + 63) < n; i += 64)
+        for (; (i + 63) < n0; i += 64)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -147,7 +172,7 @@ void bli_sscalv_zen_int_avx512
            x0 += 4 * n_elem_per_reg;
        }

-        for (; (i + 31) < n; i += 32)
+        for (; (i + 31) < n0; i += 32)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -163,7 +188,7 @@ void bli_sscalv_zen_int_avx512
            x0 += 2 * n_elem_per_reg;
        }

-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -176,7 +201,7 @@ void bli_sscalv_zen_int_avx512
            x0 += n_elem_per_reg;
        }

-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
        {
            // Loading the input values
            __m256 x_vec = _mm256_loadu_ps(x0);
@@ -198,7 +223,7 @@ void bli_sscalv_zen_int_avx512
        */
        _mm256_zeroupper();

-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
        {
            // Loading the input values
            __m128 x_vec = _mm_loadu_ps(x0);
@@ -215,7 +240,7 @@ void bli_sscalv_zen_int_avx512

    const float alphac = *alpha;

-    for (; i < n; ++i)
+    for (; i < n0; ++i)
    {
        *x0 *= alphac;

@@ -252,13 +277,14 @@ void bli_sscalv_zen_int_avx512
    Deviation from BLAS
    --------------------

-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.

    Undefined behaviour
    -------------------

-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
        (
@@ -270,11 +296,10 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
        )
 {
    // If the vector dimension is zero, or if alpha is unit, return early.
-    if (bli_zero_dim1(n) || PASTEMAC(d, eq1)(*alpha))
-        return;
+    if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;

-    // If alpha is zero, use setv.
-    if (PASTEMAC(d, eq0)(*alpha))
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
    {
        double *zero = bli_d0;
        if (cntx == NULL) cntx = bli_gks_query_cntx();
@@ -292,6 +317,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
        return;
    }

+    dim_t n0 = bli_abs(n);
+
    dim_t i = 0;
    double *restrict x0;

@@ -307,7 +334,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
        alphav = _mm512_set1_pd(*alpha);
        __m512d xv[8];

-        for (i = 0; (i + 63) < n; i += 64)
+        for (i = 0; (i + 63) < n0; i += 64)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -343,7 +370,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
            x0 += 8 * n_elem_per_reg;
        }

-        for (; (i + 31) < n; i += 32)
+        for (; (i + 31) < n0; i += 32)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -365,7 +392,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
            x0 += 4 * n_elem_per_reg;
        }

-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -381,7 +408,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
            x0 += 2 * n_elem_per_reg;
        }

-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
        {
            // Loading the input values
            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -394,7 +421,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
            x0 += n_elem_per_reg;
        }

-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
        {
            // Loading the input values
            __m256d x_vec = _mm256_loadu_pd(x0);
@@ -416,7 +443,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
       */
        _mm256_zeroupper();

-        for (; (i + 1) < n; i += 2)
+        for (; (i + 1) < n0; i += 2)
        {
            // Loading the input values
            __m128d x_vec = _mm_loadu_pd(x0);
@@ -433,7 +460,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512

    const double alphac = *alpha;

-    for (; i < n; ++i)
+    for (; i < n0; ++i)
    {
        *x0 *= alphac;

@@ -468,13 +495,14 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
    Deviation from BLAS
    --------------------

-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.

    Undefined behaviour
    -------------------

-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_zdscalv_zen_int_avx512
     (
@@ -491,6 +519,31 @@ void bli_zdscalv_zen_int_avx512
        alpha is passed as double complex to adhere
        to function pointer definition in BLIS
    */
+
+    // If the vector dimension is zero, or if alpha is unit, return early.
+    if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
+
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
+    {
+        // Expert interface of setv is invoked when alpha is zero
+        dcomplex *zero = bli_z0;
+
+        /* When alpha is zero all the element in x are set to zero */
+        PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
+        (
+            BLIS_NO_CONJUGATE,
+            n,
+            zero,
+            x, incx,
+            cntx,
+            NULL);
+
+        return;
+    }
+
+    dim_t n0 = bli_abs(n);
+
    const double alphac = (*alpha).real;

    dim_t i = 0;
@@ -504,7 +557,7 @@ void bli_zdscalv_zen_int_avx512

        alphav = _mm512_set1_pd(alphac);

-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
        {
            xv[0] = _mm512_loadu_pd(x0);
            xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
@@ -524,7 +577,7 @@ void bli_zdscalv_zen_int_avx512
            x0 += 4 * n_elem_per_reg;
        }

-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
        {
            xv[0] = _mm512_loadu_pd(x0);
            xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
@@ -538,7 +591,7 @@ void bli_zdscalv_zen_int_avx512
            x0 += 2 * n_elem_per_reg;
        }

-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
        {
            xv[0] = _mm512_loadu_pd(x0);

@@ -549,7 +602,7 @@ void bli_zdscalv_zen_int_avx512
            x0 += n_elem_per_reg;
        }

-        for (; (i + 1) < n; i += 2)
+        for (; (i + 1) < n0; i += 2)
        {
            __m256d xv = _mm256_loadu_pd(x0);

@@ -576,7 +629,7 @@ void bli_zdscalv_zen_int_avx512

    alpha_reg = _mm_set1_pd((*alpha).real);

-    for (; i < n; ++i)
+    for (; i < n0; ++i)
    {
        x_vec = _mm_loadu_pd(x0);

@@ -674,8 +727,8 @@ void bli_zdscalv_zen_int_avx512
    Undefined behaviour
    -------------------

-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_cscalv_zen_int_avx512
     (
@@ -689,14 +742,11 @@ void bli_cscalv_zen_int_avx512
    // If the vector dimension is zero, or if alpha is unit, return early.
    if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;

-    /**
-     * @note Currently this kernel is not BLAS compliant. For BLAS compliance,
-     * the below call to SETV needs to be removed.
-     */
-    if ( PASTEMAC(c,eq0)(*alpha) )
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
    {
        // Expert interface of setv is invoked when alpha is zero
-        scomplex *zero = PASTEMAC(c,0);
+        scomplex *zero = bli_c0;

        /* When alpha is zero all the element in x are set to zero */
        PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
@@ -712,6 +762,8 @@ void bli_cscalv_zen_int_avx512
        return;
    }

+    dim_t n0 = bli_abs(n);
+
    dim_t i = 0;
    scomplex alpha_conj;
    float* restrict x0 = (float*) x;
@@ -760,7 +812,7 @@ void bli_cscalv_zen_int_avx512
         */

        // Processing 96 scomplex elements (192 floats) per iteration
-        for ( ; (i + 95) < n; i += 96 )
+        for ( ; (i + 95) < n0; i += 96 )
        {
            __m512 xv[12], inter[12];

@@ -776,7 +828,7 @@ void bli_cscalv_zen_int_avx512
        }

        // Processing 64 scomplex elements (128 floats) per iteration
-        for ( ; (i + 63) < n; i += 64 )
+        for ( ; (i + 63) < n0; i += 64 )
        {
            __m512 xv[8], inter[8];

@@ -790,7 +842,7 @@ void bli_cscalv_zen_int_avx512
        }

        // Processing 32 scomplex elements (64 floats) per iteration
-        for ( ; (i + 31) < n; i += 32 )
+        for ( ; (i + 31) < n0; i += 32 )
        {
            __m512 xv[4], inter[4];

@@ -802,7 +854,7 @@ void bli_cscalv_zen_int_avx512
        }

        // Processing 16 scomplex elements (32 floats) per iteration
-        for ( ; (i + 15) < n; i += 16 )
+        for ( ; (i + 15) < n0; i += 16 )
        {
            __m512 xv[2], inter[2];

@@ -842,7 +894,7 @@ void bli_cscalv_zen_int_avx512
        }

        // Processing 8 scomplex elements (16 floats) per iteration
-        for ( ; (i + 7) < n; i += 8 )
+        for ( ; (i + 7) < n0; i += 8 )
        {
            __m512 xv[1], inter[1];

@@ -877,21 +929,23 @@ void bli_cscalv_zen_int_avx512
        }

        // Processing remaining elements, if any.
-        if ( i < n ) {
+        if ( i < n0 )
+        {
            // Setting the mask bit based on remaining elements.
            // Since each scomplex element corresponds to 2 floats,
-            // we need to load and store 2*(n-i) elements.
+            // we need to load and store 2*(n0-i) elements.

-            __mmask16 mask = ( 1 << ( 2 * ( n - i ) ) ) - 1;
+            __mmask16 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;
+
+            __m512 xv, temp;

-            __m512 xv, inter;
            xv = _mm512_maskz_loadu_ps( mask, x0 );

-            inter = _mm512_permute_ps( xv, 0xB1 );
+            temp = _mm512_permute_ps( xv, 0xB1 );

-            inter = _mm512_mul_ps( alphaIv, inter );
+            temp = _mm512_mul_ps( alphaIv, temp );

-            xv = _mm512_fmaddsub_ps( alphaRv, xv, inter );
+            xv = _mm512_fmaddsub_ps( alphaRv, xv, temp );

            _mm512_mask_storeu_ps( x0, mask, xv );
        }
@@ -902,7 +956,7 @@ void bli_cscalv_zen_int_avx512
        const float alphaI = alpha_conj.imag;

        float x0R, x0I;
-        for (; i < n; ++i)
+        for (; i < n0; ++i)
        {
            x0R = *(x0);
            x0I = *(x0 + 1);
@@ -942,13 +996,14 @@ void bli_cscalv_zen_int_avx512
    Deviation from BLAS
    --------------------

-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.

    Undefined behaviour
    -------------------

-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_zscalv_zen_int_avx512
     (
@@ -960,17 +1015,13 @@ void bli_zscalv_zen_int_avx512
     )
 {
    // If the vector dimension is zero, or if alpha is unit, return early.
-    if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
-        return;
+    if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;

-    /**
-     * @note Currently this kernel is not BLAS compliant. For BLAS compliance,
-     * the below call to SETV needs to be removed.
-    */
-    if (PASTEMAC(z, eq0)(*alpha))
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if (PASTEMAC(z,eq0)( *alpha ) && n > 0 )
    {
        // Expert interface of setv is invoked when alpha is zero
-        dcomplex *zero = PASTEMAC(z, 0);
+        dcomplex *zero = bli_z0;

        /* When alpha is zero all the element in x are set to zero */
        PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
@@ -985,6 +1036,8 @@ void bli_zscalv_zen_int_avx512
        return;
    }

+    dim_t n0 = bli_abs(n);
+
    dim_t i = 0;
    dcomplex alpha_conj;
    double *restrict x0 = (double *)x;
@@ -1022,7 +1075,7 @@ void bli_zscalv_zen_int_avx512
        */

        // Processing 48 dcomplex elements per iteration.
-        for (; (i + 47) < n; i += 48)
+        for (; (i + 47) < n0; i += 48)
        {
            __m512d xv[12], temp[12];

@@ -1116,7 +1169,7 @@ void bli_zscalv_zen_int_avx512
        }

        // Processing 32 dcomplex elements per iteration.
-        for (; (i + 31) < n; i += 32)
+        for (; (i + 31) < n0; i += 32)
        {
            __m512d xv[8], temp[8];
            xv[0] = _mm512_loadu_pd(x0);
@@ -1173,7 +1226,7 @@ void bli_zscalv_zen_int_avx512
        }

        // Processing 16 dcomplex elements per iteration.
-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
        {
            __m512d xv[4], temp[4];
            xv[0] = _mm512_loadu_pd(x0);
@@ -1205,7 +1258,7 @@ void bli_zscalv_zen_int_avx512
        }

        // Processing 8 dcomplex elements per iteration.
-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
        {
            __m512d xv[2], temp[2];
            xv[0] = _mm512_loadu_pd(x0);
@@ -1227,7 +1280,7 @@ void bli_zscalv_zen_int_avx512
        }

        // Processing 4 dcomplex elements per iteration.
-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
        {
            __m512d xv, temp;
            xv = _mm512_loadu_pd(x0);
@@ -1244,23 +1297,24 @@ void bli_zscalv_zen_int_avx512
        }

        // Processing the remainder elements.
-        if( i < n )
+        if( i < n0 )
        {
            // Setting the mask bit based on remaining elements
            // Since each dcomplex elements corresponds to 2 doubles
-            // we need to load and store 2*(m-i) elements.
-            __mmask8 mask = (1 << (2 * (n-i)) ) - 1;
+            // we need to load and store 2*(n0-i) elements.
+
+            __mmask8 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;

            __m512d xv, temp, zero;
            zero = _mm512_setzero_pd();

            xv = _mm512_mask_loadu_pd( zero, mask, x0 );

-            temp = _mm512_permute_pd(xv, 0x55);
+            temp = _mm512_permute_pd( xv, 0x55 );

-            temp = _mm512_mul_pd(alphaIv, temp);
+            temp = _mm512_mul_pd( alphaIv, temp );

-            xv = _mm512_fmaddsub_pd(alphaRv, xv, temp);
+            xv = _mm512_fmaddsub_pd( alphaRv, xv, temp );

            _mm512_mask_storeu_pd( x0, mask, xv );
        }
@@ -1272,7 +1326,7 @@ void bli_zscalv_zen_int_avx512
        alphaRv = _mm_loaddup_pd(&alphaR);
        alphaIv = _mm_loaddup_pd(&alphaI);

-        for (; i < n; ++i)
+        for (; i < n0; ++i)
        {
            x_vec = _mm_loadu_pd(x0);

--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -52,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
 \
 	/* If alpha is zero, use setv. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( PASTEMAC(ch,eq0)( *alpha ) && n > 0) \
 	{ \
 		ctype* zero = PASTEMAC(ch,0); \
 \
@@ -70,6 +71,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		); \
 		return; \
 	} \
+\
+        dim_t n0 = bli_abs(n); \
 \
 	ctype alpha_conj; \
 \
@@ -78,14 +81,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	if ( incx == 1 ) \
 	{ \
 		PRAGMA_SIMD \
-		for ( dim_t i = 0; i < n; ++i ) \
+		for ( dim_t i = 0; i < n0; ++i ) \
 		{ \
 			PASTEMAC(ch,scals)( alpha_conj, x[i] ); \
 		} \
 	} \
 	else \
 	{ \
-		for ( dim_t i = 0; i < n; ++i ) \
+		for ( dim_t i = 0; i < n0; ++i ) \
 		{ \
 			PASTEMAC(ch,scals)( alpha_conj, *x ); \
 \